Uppdatera main samt README
This commit is contained in:
parent
cd32796b3e
commit
0e8a994162
@ -14,7 +14,7 @@ I will use 4 datasets from kaggle, 3 datasets from streaming-sites Netflix,
|
|||||||
Amazon Prime and Disney Plus, and also 1 from an IMDB dataset.
|
Amazon Prime and Disney Plus, and also 1 from an IMDB dataset.
|
||||||
|
|
||||||
### Model:
|
### Model:
|
||||||
I will use the k-Nearest Neighbors (k-NN) algorithm, which can help me find other titles based on features
|
I will use the NearestNeighbors (NN) algorithm, which can help me find other titles based on features
|
||||||
like Title, Release year, Description, Cast, Director and genres.
|
like Title, Release year, Description, Cast, Director and genres.
|
||||||
|
|
||||||
### Features:
|
### Features:
|
||||||
|
|||||||
216911
dataset/dataset_tmdb.csv
Normal file
216911
dataset/dataset_tmdb.csv
Normal file
File diff suppressed because it is too large
Load Diff
117
main.py
117
main.py
@ -1,9 +1,11 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
from sklearn.neighbors import NearestNeighbors
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
class LoadData:
|
class LoadData:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.data = None
|
self.data = None
|
||||||
self.loaded_datasets = []
|
self.loaded_datasets = []
|
||||||
@ -18,8 +20,7 @@ class LoadData:
|
|||||||
def clean_text(self, text):
|
def clean_text(self, text):
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
cleaned = re.sub(r'[^\x00-\x7F]+', '', text)
|
cleaned = re.sub(r'[^\x00-\x7F]+', '', text)
|
||||||
cleaned = cleaned.replace('#', '')
|
cleaned = cleaned.replace('#', '').replace('"', '')
|
||||||
cleaned = cleaned.replace('"', '')
|
|
||||||
return cleaned.strip()
|
return cleaned.strip()
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
@ -27,7 +28,7 @@ class LoadData:
|
|||||||
try:
|
try:
|
||||||
df = pd.read_csv(f'dataset/{dataset_path}')
|
df = pd.read_csv(f'dataset/{dataset_path}')
|
||||||
df['stream'] = stream
|
df['stream'] = stream
|
||||||
if stream not in 'IMDB':
|
if stream != 'IMDB':
|
||||||
df = df.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
|
df = df.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
|
||||||
df = df.rename(columns={'listed_in': 'genres'})
|
df = df.rename(columns={'listed_in': 'genres'})
|
||||||
else:
|
else:
|
||||||
@ -58,47 +59,121 @@ class LoadData:
|
|||||||
print(f'Data from {", ".join(self.loaded_datasets)} imported.')
|
print(f'Data from {", ".join(self.loaded_datasets)} imported.')
|
||||||
|
|
||||||
def clean_data(self):
|
def clean_data(self):
|
||||||
|
self.data.dropna(subset=['title', 'genres', 'description'], inplace=True)
|
||||||
string_columns = self.data.select_dtypes(include=['object'])
|
string_columns = self.data.select_dtypes(include=['object'])
|
||||||
self.data[string_columns.columns] = string_columns.apply(lambda col: col.map(self.clean_text, na_action='ignore'))
|
self.data[string_columns.columns] = string_columns.apply(lambda col: col.map(self.clean_text, na_action='ignore'))
|
||||||
self.data = self.data[~self.data['title'].str.strip().isin(['', ':'])]
|
self.data = self.data[~self.data['title'].str.strip().isin(['', ':'])]
|
||||||
print(f'Data cleaned')
|
self.data['genres'] = self.data['genres'].str.split(', ').apply(lambda x: [genre.strip() for genre in x])
|
||||||
|
self.data = self.data[self.data['genres'].map(lambda x: len(x) > 0)]
|
||||||
|
print(f'Data cleaned. {self.data.shape[0]} records remaining.')
|
||||||
|
|
||||||
|
|
||||||
class UserData:
|
class UserData:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.user_data = None
|
self.user_data = None
|
||||||
|
|
||||||
def input(self):
|
def input(self):
|
||||||
self.user_data = input("Which Movie or TV-Serie do you prefer: ")
|
self.user_data = input("Which Movie or TV Series do you prefer: ")
|
||||||
return self.user_data.lower()
|
return self.user_data.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
class Recommendations:
|
class TrainModel:
|
||||||
|
def __init__(self, title_data):
|
||||||
|
self.recommendation_model = None
|
||||||
|
self.title_data = title_data
|
||||||
|
self.title_vectors = None
|
||||||
|
self.vectorizer = TfidfVectorizer()
|
||||||
|
self.preprocess_data()
|
||||||
|
|
||||||
def __init__(self):
|
def preprocess_data(self):
|
||||||
self.result = None
|
self.title_data['genres'] = self.title_data['genres'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')
|
||||||
|
self.title_data['combined_text'] = (
|
||||||
|
self.title_data['title'].fillna('') + ' ' +
|
||||||
|
self.title_data['director'].fillna('') + ' ' +
|
||||||
|
self.title_data['cast'].fillna('') + ' ' +
|
||||||
|
self.title_data['genres'] + ' ' +
|
||||||
|
self.title_data['description'].fillna('')
|
||||||
|
)
|
||||||
|
self.title_data['combined_text'] = self.title_data['combined_text'].str.lower()
|
||||||
|
self.title_data['combined_text'] = self.title_data['combined_text'].str.replace(r'[^a-z\s]', '', regex=True)
|
||||||
|
self.title_vectors = self.vectorizer.fit_transform(self.title_data['combined_text'])
|
||||||
|
|
||||||
def get_recommendations(self, user_data, title_data):
|
def preprocess_user_input(self, user_input):
|
||||||
if title_data is not None and not title_data.empty:
|
user_vector = self.vectorizer.transform([user_input])
|
||||||
|
return user_vector
|
||||||
|
|
||||||
self.results = "Här ska de komma rekommendationer"
|
def train(self):
|
||||||
|
self.recommendation_model = NearestNeighbors(n_neighbors=10, metric='cosine')
|
||||||
|
self.recommendation_model.fit(self.title_vectors)
|
||||||
|
|
||||||
print(self.results)
|
|
||||||
|
class RecommendationLoader:
|
||||||
|
def __init__(self, model, title_data):
|
||||||
|
self.model = model
|
||||||
|
self.title_data = title_data
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
while True:
|
||||||
|
user_data = UserData()
|
||||||
|
user_input = user_data.input()
|
||||||
|
|
||||||
|
if user_input in ['exit', 'quit']:
|
||||||
|
print("Program will exit now. Thanks for using!")
|
||||||
|
break
|
||||||
|
|
||||||
|
self.get_recommendations(user_input)
|
||||||
|
print("\nWrite 'exit' or 'quit' to end the program.")
|
||||||
|
|
||||||
|
def get_recommendations(self, user_data):
|
||||||
|
user_vector = self.model.preprocess_user_input(user_data)
|
||||||
|
distances, indices = self.model.recommendation_model.kneighbors(user_vector, n_neighbors=10)
|
||||||
|
recommendations = self.title_data.iloc[indices[0]]
|
||||||
|
|
||||||
|
self.display_recommendations(user_data, recommendations)
|
||||||
|
|
||||||
|
def display_recommendations(self, user_data, recommendations):
|
||||||
|
print(f'\nRecommendations based on "{user_data}":\n')
|
||||||
|
|
||||||
|
if not recommendations.empty:
|
||||||
|
movie_recommendations = recommendations[recommendations['type'] == 'Movie']
|
||||||
|
tv_show_recommendations = recommendations[recommendations['type'] == 'TV Show']
|
||||||
|
|
||||||
|
if not movie_recommendations.empty:
|
||||||
|
print("\n#################### Recommended Movies: ####################")
|
||||||
|
for i, (_, row) in enumerate(movie_recommendations.iterrows(), start=1):
|
||||||
|
print(dedent(f"""
|
||||||
|
{i}. {row['title']} ({row['release_year']}) ({row['genres']})
|
||||||
|
Description: {row['description']}
|
||||||
|
Director: {row['director']}
|
||||||
|
Cast: {row['cast']}
|
||||||
|
|
||||||
|
===============================================================
|
||||||
|
"""))
|
||||||
|
|
||||||
|
if not tv_show_recommendations.empty:
|
||||||
|
print("\n#################### Recommended TV Shows: ####################")
|
||||||
|
for i, (_, row) in enumerate(tv_show_recommendations.iterrows(), start=1):
|
||||||
|
print(dedent(f"""
|
||||||
|
{i}. {row['title']} ({row['release_year']}) ({row['genres']})
|
||||||
|
Description: {row['description']}
|
||||||
|
Director: {row['director']}
|
||||||
|
Cast: {row['cast']}
|
||||||
|
|
||||||
|
===============================================================
|
||||||
|
"""))
|
||||||
else:
|
else:
|
||||||
print("No data available to search.")
|
print("No recommendations found.")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
data_loader = LoadData()
|
data_loader = LoadData()
|
||||||
title_data = data_loader.load_data()
|
title_data = data_loader.load_data()
|
||||||
|
|
||||||
user_data = UserData()
|
model = TrainModel(title_data)
|
||||||
user_input = user_data.input()
|
model.train()
|
||||||
|
|
||||||
recommendations = Recommendations()
|
recommendations = RecommendationLoader(model, title_data)
|
||||||
recommendations.get_recommendations(user_data, title_data)
|
recommendations.run()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Loading…
Reference in New Issue
Block a user