in clean_and_create/load_data.py [0:0]
import os
import re

import pandas as pd
from tqdm import tqdm


def load_and_concatenate_dataframes():
    # Return the cached parquet file if a previous run already built it
    if os.path.exists('concatenated_synthetic_dataset.parquet.gzip'):
        return pd.read_parquet('concatenated_synthetic_dataset.parquet.gzip')

    # Directory where the .h5 files are stored
    directory = '.'

    # List all files in the directory, keep only the batch .h5 files, and sort
    # them (note: lexicographic sort, so batch numbers should be zero-padded
    # for the concatenation order to match the numeric batch order)
    all_files = os.listdir(directory)
    h5_files = sorted(f for f in all_files if re.match(r'synthetic_dataset_batch_\d+\.h5$', f))

    # Load each .h5 file and append its dataframe to the list
    dataframes = []
    for file in tqdm(h5_files, desc="Loading data"):
        file_path = os.path.join(directory, file)
        df = pd.read_hdf(file_path)
        if '__key__' not in df.columns:
            raise ValueError(f"Key column not found in {file_path}")
        # Use column indexing rather than attribute access (fragile for a
        # dunder column name), and vectorized pd.to_numeric instead of apply
        df['__key__'] = pd.to_numeric(df['__key__'])
        dataframes.append(df)

    # Concatenate all dataframes and cache the result as gzip-compressed parquet
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    concatenated_df.to_parquet('concatenated_synthetic_dataset.parquet.gzip', compression='gzip')
    return concatenated_df
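

# Minimal usage sketch (assumption: the script is run from the directory that
# holds the synthetic_dataset_batch_*.h5 files, since the function reads from
# '.'). The __main__ guard and the print calls are illustrative additions,
# not part of the original module.
if __name__ == '__main__':
    combined = load_and_concatenate_dataframes()  # loads cache if present, else rebuilds
    print(combined.shape)
    print(combined['__key__'].head())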