72 lines
2.8 KiB
Python
72 lines
2.8 KiB
Python
import pandas as pd
|
|
from import_data import ImportData
|
|
|
|
|
|
############################## Load data ##############################
|
|
class LoadData:
    """Load the TV-title dataset from ``data.csv`` and clean it for use.

    When ``data.csv`` is missing, ``read_data`` asks ``ImportData`` to build
    it from the raw dataset named by ``self.filename`` and then reads it.
    """

    # Columns holding comma-separated text that is converted to lists.
    _LIST_COLUMNS = ('genres', 'spoken_languages', 'networks', 'created_by')

    # Columns that must contain at least one item for a row to be kept.
    _REQUIRED_COLUMNS = ('genres', 'spoken_languages', 'networks')

    def __init__(self):
        # DataFrame set by read_data(); stays None until a read succeeds.
        self.data = None
        # Raw dataset handed to ImportData when data.csv has to be rebuilt.
        self.filename = 'TMDB_tv_dataset_v3.csv'

    # ---------------------- Function: load_data ----------------------
    def load_data(self) -> pd.DataFrame:
        """Read the dataset, clean it and return it.

        Returns:
            The cleaned DataFrame, which is also stored in ``self.data``.

        Raises:
            RuntimeError: If no data could be read or imported.
        """
        self.read_data()

        # read_data() is best-effort and only prints import errors; stop here
        # with a clear message instead of failing later on ``None.shape``.
        if self.data is None:
            raise RuntimeError(
                "No data available: 'data.csv' could not be read and the "
                "fallback import failed (see the error printed above)."
            )

        self.clean_data()
        print(f'{self.data.shape[0]} titles loaded successfully.')
        return self.data

    # ---------------------- Function: read_data ----------------------
    def read_data(self) -> None:
        """Read ``data.csv`` into ``self.data``, importing it first if missing.

        If the file does not exist, ``ImportData`` is run (create, clean,
        save) and ``data.csv`` is read again; the importer is expected to
        write that file. Errors raised during the import are printed, not
        propagated, and ``self.data`` is left unchanged in that case.
        """
        print("Starting to read data ...")
        try:
            self.data = pd.read_csv('data.csv')
        except FileNotFoundError:
            print("No data.csv file found. Attempting to import data...")
            # If CSV file not found, try to import data from datasets instead
            try:
                data_importer = ImportData()
                data_importer.create_data(self.filename)
                data_importer.clean_data()
                data_importer.save_data()
                self.data = pd.read_csv('data.csv')
                print(f'{self.data.shape[0]} rows imported successfully.')
            except Exception as e:
                # Deliberately broad: the importer's failure modes are not
                # known here, and this method reports rather than raises.
                print(f"Error during data import process: {e}")
        else:
            print(f'{self.data.shape[0]} rows read successfully.')

    # ---------------------- Function: clean_data ----------------------
    @staticmethod
    def _split_to_list(value) -> list:
        """Split a comma-separated string into a list of stripped items.

        Empty items are discarded. Any non-string value (for example the NaN
        pandas uses for a missing cell) yields an empty list.
        """
        if isinstance(value, str):
            return [item.strip() for item in value.split(',') if item.strip()]
        return []

    def clean_data(self) -> None:
        """Convert list-like columns to lists and drop unusable rows.

        The columns genres, spoken_languages, networks and created_by are
        split on commas into lists. Rows are kept only when
        ``original_language`` equals ``'en'`` and genres, spoken_languages
        and networks each contain at least one item. The number of dropped
        rows is printed. ``self.data`` must already hold a DataFrame.
        """
        rows_before = self.data.shape[0]

        # Split genres, spoken_languages, networks, and created_by
        for column in self._LIST_COLUMNS:
            self.data[column] = self.data[column].apply(self._split_to_list)

        # Keep English rows whose required list columns are all non-empty
        keep = self.data['original_language'] == 'en'
        for column in self._REQUIRED_COLUMNS:
            keep = keep & self.data[column].map(bool).astype(bool)

        # .copy() makes the result an independent frame, so later column
        # assignments by callers do not raise SettingWithCopyWarning.
        self.data = self.data[keep].copy()

        # Count rows that were dropped
        rows_dropped = rows_before - len(self.data)
        print(f'Data cleaned successfully, dropped {rows_dropped} rows.')
|