Uppdaterat dataset laddning
This commit is contained in:
parent
ce2536b31f
commit
684a6159a9
85
main.py
85
main.py
@ -2,7 +2,9 @@ import pandas as pd
|
|||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.preprocessing import OneHotEncoder
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
class Load_Data:
|
class Load_Data:
|
||||||
|
|
||||||
@ -27,11 +29,13 @@ class Load_Data:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def clean_text(self, text):
    """Return *text* with non-ASCII characters, '#' and '"' removed.

    The result is also stripped of leading/trailing whitespace.
    Non-string input (e.g. a missing value) yields the empty string.
    """
    if not isinstance(text, str):
        return ''
    # Drop anything outside the 7-bit ASCII range first.
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    # Then remove the two characters the dataset titles should not carry.
    for unwanted in ('#', '"'):
        ascii_only = ascii_only.replace(unwanted, '')
    return ascii_only.strip()
|
|
||||||
def create_data(self):
|
def create_data(self):
|
||||||
print(f'Starting to read data ...')
|
print(f'Starting to read data ...')
|
||||||
@ -42,80 +46,111 @@ class Load_Data:
|
|||||||
df_imdb = None
|
df_imdb = None
|
||||||
loaded_datasets = []
|
loaded_datasets = []
|
||||||
|
|
||||||
# Load datasets Netflix, Amazon, and Disney
|
|
||||||
try:
|
try:
|
||||||
df_netflix = pd.read_csv('dataset/data_netflix.csv')
|
df_netflix = pd.read_csv('dataset/data_netflix.csv')
|
||||||
|
df_netflix['stream'] = 'Netflix'
|
||||||
|
df_netflix = df_netflix.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
|
||||||
|
df_netflix = df_netflix.rename(columns={'listed_in': 'genres'})
|
||||||
loaded_datasets.append('Netflix')
|
loaded_datasets.append('Netflix')
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print("Warning: 'data_netflix.csv' not found. Skipping this dataset.")
|
print("Warning: 'data_netflix.csv' not found. Skipping this dataset.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
df_amazon = pd.read_csv('dataset/data_amazon.csv')
|
df_amazon = pd.read_csv('dataset/data_amazon.csv')
|
||||||
|
df_amazon['stream'] = 'Amazon'
|
||||||
|
df_amazon = df_amazon.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
|
||||||
|
df_amazon = df_amazon.rename(columns={'listed_in': 'genres'})
|
||||||
loaded_datasets.append('Amazon')
|
loaded_datasets.append('Amazon')
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print("Warning: 'data_amazon.csv' not found. Skipping this dataset.")
|
print("Warning: 'data_amazon.csv' not found. Skipping this dataset.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
df_disney = pd.read_csv('dataset/data_disney.csv')
|
df_disney = pd.read_csv('dataset/data_disney.csv')
|
||||||
|
df_disney['stream'] = 'Disney'
|
||||||
|
df_disney = df_disney.drop(columns=['show_id', 'date_added', 'duration', 'rating'], errors='ignore')
|
||||||
|
df_disney = df_disney.rename(columns={'listed_in': 'genres'})
|
||||||
loaded_datasets.append('Disney')
|
loaded_datasets.append('Disney')
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print("Warning: 'data_disney.csv' not found. Skipping this dataset.")
|
print("Warning: 'data_disney.csv' not found. Skipping this dataset.")
|
||||||
|
|
||||||
# Load IMDB dataset and rename column
|
|
||||||
try:
|
try:
|
||||||
df_imdb = pd.read_csv('dataset/data_imdb.csv')
|
df_imdb = pd.read_csv('dataset/data_imdb.csv')
|
||||||
|
df_imdb['stream'] = 'Unknown'
|
||||||
df_imdb = df_imdb.rename(columns={'releaseYear': 'release_year'})
|
df_imdb = df_imdb.rename(columns={'releaseYear': 'release_year'})
|
||||||
|
df_imdb = df_imdb.drop(columns=['numVotes', 'id','avaverageRating'], errors='ignore')
|
||||||
loaded_datasets.append('IMDB')
|
loaded_datasets.append('IMDB')
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print("Warning: 'data_imdb.csv' not found. Skipping this dataset.")
|
print("Warning: 'data_imdb.csv' not found. Skipping this dataset.")
|
||||||
|
|
||||||
# Create a list to hold non-empty dataframes
|
|
||||||
dataframes = [df for df in [df_imdb, df_netflix, df_amazon, df_disney] if df is not None]
|
dataframes = [df for df in [df_imdb, df_netflix, df_amazon, df_disney] if df is not None]
|
||||||
|
|
||||||
# Check if any dataframes were loaded
|
|
||||||
if not dataframes:
|
if not dataframes:
|
||||||
print("Error: No datasets loaded. Cannot create combined data.")
|
print("Error: No datasets loaded. Cannot create combined data.")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Concatenate all datasets
|
df_all = pd.concat(dataframes, ignore_index=True, sort=False)
|
||||||
df_all = pd.concat([df_imdb, df_netflix, df_amazon, df_disney], ignore_index=True, sort=False)
|
df_all = df_all.infer_objects(copy=False)
|
||||||
|
self.data = df_all
|
||||||
# Forward-fill and backward-fill the entire dataframe
|
|
||||||
df_all.ffill(inplace=True)
|
|
||||||
df_all.bfill(inplace=True)
|
|
||||||
|
|
||||||
df = df_all.groupby(['title', 'release_year'], as_index=False).first()
|
|
||||||
df = df.infer_objects(copy=False)
|
|
||||||
self.data = df
|
|
||||||
|
|
||||||
print(f'Data from {", ".join(loaded_datasets)} loaded successfully.')
|
print(f'Data from {", ".join(loaded_datasets)} loaded successfully.')
|
||||||
|
|
||||||
def clean_data(self):
    """Scrub every text column through ``clean_text`` and drop junk titles.

    A title counts as junk when, after stripping whitespace, it is empty
    or consists of a lone ':'.
    """
    text_frame = self.data.select_dtypes(include=['object'])
    # Map each object-dtype column individually; NaN cells are left as-is.
    for column in text_frame.columns:
        self.data[column] = text_frame[column].map(self.clean_text, na_action='ignore')
    junk_titles = self.data['title'].str.strip().isin(['', ':'])
    self.data = self.data[~junk_titles]
    print(f'Data cleaned successfully.')
|
|
||||||
def save_data(self):
    """Persist ``self.data`` as CSV at ``self.data_file`` (no index column)."""
    destination = self.data_file
    self.data.to_csv(destination, index=False)
    print(f'Data saved to {self.data_file} successfully.')
|
|
||||||
def load_data(self):
    """Read the prepared CSV at ``self.data_file`` back into ``self.data``."""
    frame = pd.read_csv(self.data_file)
    self.data = frame
    num_rows = len(frame)
    print(f'{num_rows} titles loaded successfully.')
|
|
||||||
|
|
||||||
|
class Search:
    """Content-based title search using TF-IDF similarity over descriptions."""

    def __init__(self, data):
        self.data = data
        self.preproccess()  # (sic) original spelling kept — callers may rely on it

    def preproccess(self):
        """Fit the text/categorical encoders and assemble the feature matrix."""
        vectorizer = TfidfVectorizer(stop_words='english')
        descriptions = self.data['description'].fillna('')
        self.description_vectorizer = vectorizer
        self.description_matrix = vectorizer.fit_transform(descriptions)

        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        encoded_cats = encoder.fit_transform(self.data[['genres', 'type']].fillna(''))
        self.onehot_encoder = encoder

        years = self.data[['release_year']].fillna(0).to_numpy()
        # NOTE(review): feature_matrix is assembled here but search() only
        # consults the TF-IDF matrix — presumably groundwork for a richer
        # ranking; confirm before removing.
        dense_descriptions = self.description_matrix.toarray()
        self.feature_matrix = np.hstack([dense_descriptions, encoded_cats, years])

    def search(self, query, top_n=20):
        """Return the ``top_n`` rows most similar to *query* by description."""
        query_vec = self.description_vectorizer.transform([query])
        if hasattr(query_vec, "toarray"):
            query_vec = query_vec.toarray()
        scores = cosine_similarity(query_vec, self.description_matrix).flatten()
        # argsort ascends, so take the tail and reverse it for best-first order.
        best = scores.argsort()[-top_n:][::-1]
        wanted = ['title', 'genres', 'type', 'release_year', 'stream', 'description']
        return self.data.iloc[best][wanted]
||||||
def main():
    """Entry point: load the combined dataset and run one interactive search."""
    loader = Load_Data()
    data = loader.check_data()

    # Guard clause: bail out early when nothing could be loaded.
    if data is None or data.empty:
        print("No data available to search.")
        return

    user_input = input("Which Movie or TV-Serie do you prefer: ")
    engine = Search(data)
    print(engine.search(user_input))
|
|||||||
Loading…
Reference in New Issue
Block a user