76 lines
3.0 KiB
Python
76 lines
3.0 KiB
Python
import re
|
|
import os
|
|
import pandas as pd
|
|
|
|
|
|
###############################################################
|
|
#### Class: ImportData
|
|
###############################################################
|
|
class ImportData:
    """Load a CSV dataset into a pandas DataFrame, filter it, and save it.

    Typical usage is ``create_data(...)`` followed by ``clean_data()`` and
    ``save_data()``. All methods report their outcome with ``print`` and
    never raise for a missing dataset file.
    """

    # Directory that dataset file names are joined onto by load_dataset.
    DATASET_DIR = 'dataset'

    # Columns removed by clean_data; columns absent from the data are ignored.
    _UNUSED_COLUMNS = [
        'adult', 'poster_path', 'production_companies',
        'in_production', 'backdrop_path', 'production_countries', 'status',
        'episode_run_time', 'original_name', 'popularity', 'tagline',
        'homepage',
    ]

    # Columns whose values must be ASCII-only strings for a row to be kept.
    _TEXT_COLUMNS = ['name', 'overview', 'spoken_languages']

    # Compiled once: matches strings made only of ASCII characters
    # (including the empty string).
    _ASCII_ONLY = re.compile(r'^[\x00-\x7F]*$')

    def __init__(self):
        # DataFrame currently held by this importer; None until a dataset
        # has been loaded successfully.
        self.data = None
        # Initialised empty; not modified by any method of this class.
        self.loaded_datasets = []

    ###########################################################
    #### Function: load_dataset
    ###########################################################
    def load_dataset(self, dataset_path: str):
        """Read a CSV file from the dataset directory.

        Args:
            dataset_path: File name (or relative path) joined onto
                ``DATASET_DIR``.

        Returns:
            The loaded DataFrame, or ``None`` when the file does not exist
            (a warning is printed in that case).
        """
        full_path = os.path.join(self.DATASET_DIR, dataset_path)
        try:
            return pd.read_csv(full_path)
        except FileNotFoundError:
            print(f'Warning: "{dataset_path}" not found. Skipping this dataset.')
            return None

    ###########################################################
    #### Function: create_data
    ###########################################################
    def create_data(self, filename: str) -> None:
        """Load ``filename`` via load_dataset and store it in ``self.data``.

        Args:
            filename: Dataset file name passed on to ``load_dataset``.

        Returns:
            Always ``None``; the outcome is reported with ``print`` and is
            visible through ``self.data`` (``None`` when loading failed).
        """
        self.data = self.load_dataset(filename)
        # load_dataset swallows FileNotFoundError and signals failure by
        # returning None, so the result must be checked explicitly; an
        # ``except FileNotFoundError`` here would never fire.
        if self.data is None:
            print("No data imported, missing dataset")
        else:
            print('Imported data successfully.')
        return None

    ###########################################################
    #### Function: clean_data
    ###########################################################
    def clean_data(self) -> None:
        """Drop unused columns and rows containing non-ASCII text.

        A row is kept only if, for every text column present in the data
        (``name``, ``overview``, ``spoken_languages``), its value is a
        string made solely of ASCII characters. Non-string values such as
        NaN therefore cause the row to be dropped. Text columns that are
        absent from the data are skipped rather than raising ``KeyError``.
        The filtered frame replaces ``self.data``.
        """
        if self.data is None:
            print("No data to clean. Please load the dataset first.")
            return

        # Drop unnecessary columns; missing ones are tolerated.
        df_cleaned = self.data.drop(columns=self._UNUSED_COLUMNS, errors='ignore')

        # Keep only rows whose text columns hold ASCII-only strings.
        ascii_only = self._ASCII_ONLY
        present_text_columns = [
            col for col in self._TEXT_COLUMNS if col in df_cleaned.columns
        ]
        if present_text_columns:
            masks = [
                df_cleaned[col].apply(
                    lambda x: isinstance(x, str) and bool(ascii_only.match(x))
                )
                for col in present_text_columns
            ]
            combined_mask = pd.concat(masks, axis=1).all(axis=1)
            df_cleaned = df_cleaned[combined_mask]

        self.data = df_cleaned
        print(f'Data cleaned. {self.data.shape[0]} records remaining.')

    ###########################################################
    #### Function: save_data
    ###########################################################
    def save_data(self, output_path: str = 'data.csv') -> None:
        """Write ``self.data`` to a CSV file without the index column.

        Args:
            output_path: Destination file; defaults to ``data.csv`` in the
                current working directory.

        Any error raised while writing is reported with ``print`` instead
        of being propagated (deliberate best-effort behaviour).
        """
        if self.data is None:
            print("No data to save. Please clean the data first.")
            return

        try:
            # Save dataframe to CSV
            self.data.to_csv(output_path, index=False)
            print(f'Data saved to {output_path}.')
        except Exception as e:
            print(f'Error saving data: {e}')