TV-Show-recommender/trainmodel.py

107 lines
4.0 KiB
Python

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, csr_matrix
import time
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn')
############################## Train model ##############################
class TrainModel:
    """Content-based nearest-neighbour recommender over a title DataFrame.

    The text columns ``overview``, ``genres`` and ``created_by`` are joined,
    TF-IDF vectorised and reduced with truncated SVD; the numeric columns are
    standard-scaled.  Both parts are stacked into a single feature matrix on
    which a cosine-distance ``NearestNeighbors`` index is fitted.

    Call :meth:`train` before :meth:`recommend`; the transformers and
    ``self.numerical_data`` only exist in fitted form after training.
    """

    def __init__(self, title_data):
        """Store the data and create the (still unfitted) estimators.

        Args:
            title_data: DataFrame of titles.  This class reads the columns
                ``overview``, ``genres``, ``created_by`` and ``name``, and
                uses ``vote_average`` when it is a numeric column.
        """
        self.title_data = title_data
        # Settings for vectorization
        self.vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=0.01, max_df=0.5)
        # Settings for nearest neighbors
        self.model = NearestNeighbors(metric='cosine')
        self.scaler = StandardScaler()
        # Settings for SVD
        self.svd = TruncatedSVD(n_components=300)

    # ---------------------- Function: train ----------------------
    def train(self):
        """Build the feature matrix from ``title_data`` and fit the index.

        Prints a start message and the elapsed training time.
        """
        print("Starting to train model ...")
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()
        # Preprocess title data
        preprocessed_data = self.preprocess_title_data()
        # Train the NN model
        self.model.fit(preprocessed_data)
        elapsed_time = time.perf_counter() - start
        print(f'Trained model successfully in {elapsed_time:.2f} seconds.')

    # ------------------------ Function: recommend ------------------------
    def recommend(self, target_row, num_recommendations=40, max_distance=0.5):
        """Return the titles most similar to ``target_row``.

        Args:
            target_row: Row providing ``name``, ``combined_text`` and the
                numeric columns used in training (for example a row taken
                from ``title_data`` after :meth:`train` has run).
            num_recommendations: Maximum number of rows to return.
            max_distance: Exclusive upper bound on the cosine distance;
                neighbours at or beyond it are dropped.

        Returns:
            A copy of the matching ``title_data`` rows with an added
            ``distance`` column, in the order returned by the neighbour
            search.  Rows whose ``name`` equals the target's name
            (case-insensitively) are excluded, so fewer than
            ``num_recommendations`` rows may come back.
        """
        # Preprocess target data
        target_vector = self.preprocess_target_data(target_row)
        # Ask for one extra neighbour: when the target is part of the fitted
        # data it is returned as its own nearest neighbour and removed below,
        # which would otherwise leave the result one row short.  The request
        # is capped at the dataset size because kneighbors rejects an
        # n_neighbors larger than the number of fitted samples.
        n_neighbors = min(num_recommendations + 1, len(self.title_data))
        distances, indices = self.model.kneighbors(target_vector, n_neighbors=n_neighbors)
        recommendations = self.title_data.iloc[indices[0]].copy()
        recommendations['distance'] = distances[0]
        # Filter recommendations: drop the target itself and weak matches
        is_other_title = recommendations['name'].str.lower() != target_row['name'].lower()
        is_close_enough = recommendations['distance'] < max_distance
        recommendations = recommendations[is_other_title & is_close_enough]
        return recommendations.head(num_recommendations)

    # ---------------------- Function: preprocess_data ----------------------
    def preprocess_title_data(self):
        """Fit the transformers on ``title_data`` and return its features.

        Side effects: adds a ``combined_text`` column to ``self.title_data``,
        stores the numeric columns in ``self.numerical_data``, fits the
        vectorizer, SVD and scaler, and may lower the SVD component count
        (see below).

        Returns:
            Sparse matrix with one row per title: the SVD text components
            followed by the scaled numeric columns.
        """
        # Combine text fields in a new column for vectorization
        self.title_data['combined_text'] = (
            self.title_data['overview'].fillna('').apply(str) + ' ' +
            self.title_data['genres'].fillna('').apply(str) + ' ' +
            self.title_data['created_by'].fillna('').apply(str)
        )
        # Process combined_text column with vectorizer
        text_features = self.vectorizer.fit_transform(self.title_data['combined_text'])
        # The vocabulary is pruned by min_df/max_df and can end up smaller
        # than the configured number of SVD components; shrink the request
        # instead of letting the SVD fit fail.
        vocabulary_size = text_features.shape[1]
        if self.svd.n_components > vocabulary_size:
            self.svd.set_params(n_components=vocabulary_size)
        text_features = self.svd.fit_transform(text_features)
        # Use only the rating when it is available as a numeric column,
        # otherwise fall back to every numeric column in the DataFrame.
        numeric_columns = self.title_data.select_dtypes(include=['number'])
        if 'vote_average' in numeric_columns.columns:
            numeric_columns = numeric_columns[['vote_average']]
        self.numerical_data = numeric_columns
        # NOTE(review): missing values in the numeric columns are not imputed
        # here - confirm the input has none before they reach the index.
        # Scale numerical features
        numerical_features = self.scaler.fit_transform(self.numerical_data)
        numerical_features_sparse = csr_matrix(numerical_features)
        # Combine text and numerical features
        combined_features = hstack([csr_matrix(text_features), numerical_features_sparse])
        return combined_features

    # ---------------------- Function: preprocess_target_data ----------------------
    def preprocess_target_data(self, target_row):
        """Turn one row into a feature vector using the fitted transformers.

        Args:
            target_row: Row providing ``combined_text`` and every column in
                ``self.numerical_data``.

        Returns:
            Sparse single-row matrix laid out like the training features
            (SVD text components, then scaled numeric columns).
        """
        # Create feature vector for target row
        target_text_vector = self.vectorizer.transform([target_row['combined_text']])
        target_text_vector = self.svd.transform(target_text_vector)
        # Process numerical features of the reference target.  A row holding
        # mixed column types yields object-dtype values, so convert to float
        # explicitly before scaling.
        numeric_columns = self.numerical_data.columns
        target_numerical = target_row[numeric_columns].to_numpy(dtype=float).reshape(1, -1)
        scaled_numerical = self.scaler.transform(target_numerical)
        target_vector = hstack([csr_matrix(target_text_vector), csr_matrix(scaled_numerical)])
        return target_vector