# TV-Show-recommender/read_data.py

import pandas as pd
from import_data import ImportData


############################## Load data ##############################
class LoadData:
    """Read the TV-show dataset from ``data.csv`` and clean it for later use.

    Typical use is ``LoadData().load_data()``, which reads the CSV (building
    it through ``ImportData`` when the file is missing), cleans the rows and
    returns the resulting DataFrame.
    """

    # Columns whose comma-separated text is expanded into Python lists.
    _LIST_COLUMNS = ('genres', 'spoken_languages', 'networks', 'created_by')
    # List columns that must contain at least one entry for a row to be kept.
    _REQUIRED_COLUMNS = ('genres', 'spoken_languages', 'networks')

    def __init__(self):
        # Set by read_data(); remains None until a read succeeds.
        self.data = None
        # Raw dataset file passed to ImportData.create_data() when data.csv
        # does not exist yet.
        self.filename = 'TMDB_tv_dataset_v3.csv'

    # ---------------------- Function: load_data ----------------------
    def load_data(self) -> pd.DataFrame:
        """Read, clean and return the dataset.

        Returns:
            The cleaned DataFrame (also stored on ``self.data``).

        Raises:
            RuntimeError: If ``read_data`` could not produce any data, i.e.
                ``data.csv`` was missing and the import fallback failed.
        """
        self.read_data()
        if self.data is None:
            # read_data() only reports a failed import; stop here with a
            # clear error instead of failing inside clean_data() on None.
            raise RuntimeError(
                "No data could be loaded: 'data.csv' is missing and the "
                "import fallback failed."
            )
        self.clean_data()
        print(f'{self.data.shape[0]} titles loaded successfully.')
        return self.data

    # ---------------------- Function: read_data ----------------------
    def read_data(self) -> None:
        """Load ``data.csv`` into ``self.data``, importing it first if needed.

        When ``data.csv`` does not exist, ``ImportData`` is asked to create,
        clean and save the data, after which the CSV is read again. A failure
        in that fallback is printed, not raised, and leaves ``self.data`` as
        ``None``.
        """
        print("Starting to read data ...")
        # Forget any previously loaded frame so a failed read cannot leave
        # stale data behind.
        self.data = None
        try:
            # Try to read the CSV file
            self.data = pd.read_csv('data.csv')
        except FileNotFoundError:
            print("No data.csv file found. Attempting to import data...")
            # If the CSV file is not found, try to import data from the
            # datasets instead
            try:
                data_importer = ImportData()
                data_importer.create_data(self.filename)
                data_importer.clean_data()
                data_importer.save_data()
                self.data = pd.read_csv('data.csv')
                print(f'{self.data.shape[0]} rows imported successfully.')
            except Exception as e:
                # Best effort: report the problem and leave self.data as None.
                print(f"Error during data import process: {e}")
        else:
            print(f'{self.data.shape[0]} rows read successfully.')

    # ---------------------- Function: clean_data ----------------------
    def clean_data(self) -> None:
        """Keep English titles and turn comma-separated columns into lists.

        Steps, applied to ``self.data`` (which must already be a DataFrame):

        1. Keep only rows whose ``original_language`` equals ``'en'``.
        2. Convert ``genres``, ``spoken_languages``, ``networks`` and
           ``created_by`` from comma-separated strings into lists of stripped,
           non-empty items; non-string values become ``[]``.
        3. Drop rows where ``genres``, ``spoken_languages`` or ``networks``
           ended up empty (``created_by`` may be empty).

        Calling the method again on already cleaned data leaves it unchanged.
        """
        def split_to_list(value):
            # Already split by an earlier clean_data() call: keep as is so a
            # repeated call does not wipe the column.
            if isinstance(value, list):
                return value
            if isinstance(value, str):
                # Strip and split the string, and remove any empty items
                return [item.strip() for item in value.split(',') if item.strip()]
            # Missing / non-text value: treat as "no entries".
            return []

        data_start = self.data.shape[0]

        # Drop rows that are not in English. Copy so the column assignments
        # below (and any later ones by callers) act on an independent frame.
        frame = self.data[self.data['original_language'] == 'en'].copy()

        # Split genres, spoken_languages, networks, and created_by
        for column in self._LIST_COLUMNS:
            frame[column] = frame[column].apply(split_to_list)

        # Drop rows with empty lists in genres, spoken_languages or networks
        keep = pd.Series(True, index=frame.index)
        for column in self._REQUIRED_COLUMNS:
            keep &= frame[column].map(len).gt(0)
        self.data = frame[keep]

        # Count rows that were dropped
        rows_dropped = data_start - len(self.data)

        print(f'Data cleaned successfully, dropped {rows_dropped} rows.')