def load_and_concatenate_dataframes()

in create_only_with_pdfs/load_data.py [0:0]

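Loads every synthetic_dataset_batch_*.h5 shard from the working directory, coerces the __key__ column to numeric, concatenates the shards into a single dataframe, and caches the result as gzip-compressed parquet; if the cached parquet already exists, it is returned directly.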

import os
import re

import pandas as pd
from tqdm import tqdm


def load_and_concatenate_dataframes():
    if os.path.exists('/fsx/andi/llm-swarm/concatenated_synthetic_dataset.parquet.gzip'):
        return pd.read_parquet('/fsx/andi/llm-swarm/concatenated_synthetic_dataset.parquet.gzip')  

    # Directory where the .h5 files are stored
    directory = '.'

    # List all files in the directory
    all_files = os.listdir(directory)

    # Select the batch .h5 files and sort them by batch number
    # (a plain lexicographic sort would put batch_10 before batch_2)
    h5_files = sorted(
        [f for f in all_files if re.match(r'synthetic_dataset_batch_\d+\.h5$', f)],
        key=lambda f: int(re.search(r'\d+', f).group()),
    )
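    # The pattern matches names such as synthetic_dataset_batch_0.h5 or
    # synthetic_dataset_batch_12.h5.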

    # Initialize an empty list to hold the dataframes
    dataframes = []

    # Load each .h5 file and append the dataframe to the list
    for file in tqdm(h5_files, desc="Loading data"):
        file_path = os.path.join(directory, file)
        df = pd.read_hdf(file_path)
        if '__key__' not in df.columns:
            raise ValueError(f"Key column not found in {file_path}")
        df.__key__ = df.__key__.apply(pd.to_numeric)
        dataframes.append(df)

    # Concatenate all dataframes
    concatenated_df = pd.concat(dataframes, ignore_index=True)
    concatenated_df.to_parquet('concatenated_synthetic_dataset.parquet.gzip', compression='gzip')  
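    # Note: this cache file is written to the current working directory; the
    # fast-path check at the top reads the absolute /fsx/andi/llm-swarm path,
    # so the two only line up when running from that directory.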

    return concatenated_df
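
A minimal usage sketch (the call site and the printed fields below are illustrative, not part of the original module); run it from the directory holding the batch files, or anywhere once the cached parquet exists:

# Illustrative call; assumes the batch files (or the cached parquet) exist.
df = load_and_concatenate_dataframes()
print(f"{len(df)} rows, __key__ range {df['__key__'].min()}..{df['__key__'].max()}")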