Uppdatera main samt README

This commit is contained in:
jwradhe 2024-10-28 22:50:46 +01:00
parent cd32796b3e
commit 0e8a994162
3 changed files with 217009 additions and 23 deletions

View File

@ -14,7 +14,7 @@ I will use 4 datasets from kaggle, 3 datasets from streaming-sites Netflix,
Amazon Prime and Disney Plus, also 1 from an IMDB dataset. Amazon Prime and Disney Plus, also 1 from an IMDB dataset.
### Model: ### Model:
I will use k-Nearest Neighbors (k-NN) algorithm that can help me find other titles based on features I will use NearestNeighbors (NN) algorithm that can help me find other titles based on features
like Title, Release year, Description, Cast, Director and genres. like Title, Release year, Description, Cast, Director and genres.
### Features: ### Features:

216911
dataset/dataset_tmdb.csv Normal file

File diff suppressed because it is too large Load Diff

119
main.py
View File

@ -1,9 +1,11 @@
import pandas as pd import pandas as pd
import re import re
import os import os
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from textwrap import dedent
class LoadData: class LoadData:
def __init__(self): def __init__(self):
self.data = None self.data = None
self.loaded_datasets = [] self.loaded_datasets = []
@ -18,8 +20,7 @@ class LoadData:
def clean_text(self, text): def clean_text(self, text):
if isinstance(text, str): if isinstance(text, str):
cleaned = re.sub(r'[^\x00-\x7F]+', '', text) cleaned = re.sub(r'[^\x00-\x7F]+', '', text)
cleaned = cleaned.replace('#', '') cleaned = cleaned.replace('#', '').replace('"', '')
cleaned = cleaned.replace('"', '')
return cleaned.strip() return cleaned.strip()
return '' return ''
@ -27,7 +28,7 @@ class LoadData:
try: try:
df = pd.read_csv(f'dataset/{dataset_path}') df = pd.read_csv(f'dataset/{dataset_path}')
df['stream'] = stream df['stream'] = stream
if stream not in 'IMDB': if stream != 'IMDB':
df = df.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore') df = df.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
df = df.rename(columns={'listed_in': 'genres'}) df = df.rename(columns={'listed_in': 'genres'})
else: else:
@ -58,47 +59,121 @@ class LoadData:
print(f'Data from {", ".join(self.loaded_datasets)} imported.') print(f'Data from {", ".join(self.loaded_datasets)} imported.')
def clean_data(self): def clean_data(self):
self.data.dropna(subset=['title', 'genres', 'description'], inplace=True)
string_columns = self.data.select_dtypes(include=['object']) string_columns = self.data.select_dtypes(include=['object'])
self.data[string_columns.columns] = string_columns.apply(lambda col: col.map(self.clean_text, na_action='ignore')) self.data[string_columns.columns] = string_columns.apply(lambda col: col.map(self.clean_text, na_action='ignore'))
self.data = self.data[~self.data['title'].str.strip().isin(['', ':'])] self.data = self.data[~self.data['title'].str.strip().isin(['', ':'])]
print(f'Data cleaned') self.data['genres'] = self.data['genres'].str.split(', ').apply(lambda x: [genre.strip() for genre in x])
self.data = self.data[self.data['genres'].map(lambda x: len(x) > 0)]
print(f'Data cleaned. {self.data.shape[0]} records remaining.')
class UserData: class UserData:
def __init__(self): def __init__(self):
self.user_data = None self.user_data = None
def input(self): def input(self):
self.user_data = input("Which Movie or TV-Serie do you prefer: ") self.user_data = input("Which Movie or TV Series do you prefer: ")
return self.user_data.lower() return self.user_data.strip().lower()
class Recommendations: class TrainModel:
def __init__(self, title_data):
self.recommendation_model = None
self.title_data = title_data
self.title_vectors = None
self.vectorizer = TfidfVectorizer()
self.preprocess_data()
def __init__(self): def preprocess_data(self):
self.result = None self.title_data['genres'] = self.title_data['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')
self.title_data['combined_text'] = (
self.title_data['title'].fillna('') + ' ' +
self.title_data['director'].fillna('') + ' ' +
self.title_data['cast'].fillna('') + ' ' +
self.title_data['genres'] + ' ' +
self.title_data['description'].fillna('')
)
self.title_data['combined_text'] = self.title_data['combined_text'].str.lower()
self.title_data['combined_text'] = self.title_data['combined_text'].str.replace(r'[^a-z\s]', '', regex=True)
self.title_vectors = self.vectorizer.fit_transform(self.title_data['combined_text'])
def get_recommendations(self, user_data, title_data): def preprocess_user_input(self, user_input):
if title_data is not None and not title_data.empty: user_vector = self.vectorizer.transform([user_input])
return user_vector
self.results = "Här ska de komma rekommendationer" def train(self):
self.recommendation_model = NearestNeighbors(n_neighbors=10, metric='cosine')
self.recommendation_model.fit(self.title_vectors)
print(self.results)
class RecommendationLoader:
def __init__(self, model, title_data):
self.model = model
self.title_data = title_data
def run(self):
while True:
user_data = UserData()
user_input = user_data.input()
if user_input in ['exit', 'quit']:
print("Program will exit now. Thanks for using!")
break
self.get_recommendations(user_input)
print("\nWrite 'exit' or 'quit' to end the program.")
def get_recommendations(self, user_data):
user_vector = self.model.preprocess_user_input(user_data)
distances, indices = self.model.recommendation_model.kneighbors(user_vector, n_neighbors=10)
recommendations = self.title_data.iloc[indices[0]]
self.display_recommendations(user_data, recommendations)
def display_recommendations(self, user_data, recommendations):
print(f'\nRecommendations based on "{user_data}":\n')
if not recommendations.empty:
movie_recommendations = recommendations[recommendations['type'] == 'Movie']
tv_show_recommendations = recommendations[recommendations['type'] == 'TV Show']
if not movie_recommendations.empty:
print("\n#################### Recommended Movies: ####################")
for i, (_, row) in enumerate(movie_recommendations.iterrows(), start=1):
print(dedent(f"""
{i}. {row['title']} ({row['release_year']}) ({row['genres']})
Description: {row['description']}
Director: {row['director']}
Cast: {row['cast']}
===============================================================
"""))
if not tv_show_recommendations.empty:
print("\n#################### Recommended TV Shows: ####################")
for i, (_, row) in enumerate(tv_show_recommendations.iterrows(), start=1):
print(dedent(f"""
{i}. {row['title']} ({row['release_year']}) ({row['genres']})
Description: {row['description']}
Director: {row['director']}
Cast: {row['cast']}
===============================================================
"""))
else: else:
print("No data available to search.") print("No recommendations found.")
def main(): def main():
data_loader = LoadData() data_loader = LoadData()
title_data = data_loader.load_data() title_data = data_loader.load_data()
user_data = UserData() model = TrainModel(title_data)
user_input = user_data.input() model.train()
recommendations = Recommendations() recommendations = RecommendationLoader(model, title_data)
recommendations.get_recommendations(user_data, title_data) recommendations.run()
if __name__ == "__main__": if __name__ == "__main__":
main() main()