clean_and_create/load_data.py [108:131]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            }    
    except Exception as e:
        print(f"Error processing group {key}: {e}")
        return None

def process_tar_index(tar_index, step_size, question_answer_df):
    """Load `step_size` consecutive webdataset tar shards starting at `tar_index`,
    concatenate them into one DataFrame, and keep only rows whose `__key__` is
    present in `question_answer_df`.

    NOTE(review): this excerpt is truncated — the function continues past the
    visible lines (no return statement shown), so the full contract cannot be
    documented here.

    Args:
        tar_index: Index of the first tar shard to load; presumably an int — TODO confirm.
        step_size: Number of consecutive shards to load (and shard-group divisor).
        question_answer_df: DataFrame with a `__key__` column used to filter samples.
    """
    # Shard-group number; not used in the visible portion — presumably used
    # further down in the truncated remainder of the function.
    shard_nr = tar_index//step_size
    loaded_datasets = []

    for inner_idx in range(step_size):
        # Build the path of the (tar_index + inner_idx)-th shard from the
        # module-level DATA_PATH / TAR_FILE_PATTERN constants (defined elsewhere).
        tar_file = os.path.join(DATA_PATH, TAR_FILE_PATTERN.format(tar_index+inner_idx))
        try:
            print(f"Loading dataset from: {tar_file}")
            # Load the webdataset shard via HF datasets and convert to pandas.
            hf_dataset = datasets.load_dataset('webdataset', split='train', data_files=tar_file, cache_dir="/fsx/.cache").to_pandas()
            # Coerce the __key__ column to numeric so it can be matched against
            # question_answer_df keys. NOTE(review): attribute-style assignment to
            # a DataFrame column named `__key__` works but is fragile; bracket
            # indexing (hf_dataset['__key__'] = ...) would be safer.
            hf_dataset.__key__ = hf_dataset.__key__.apply(pd.to_numeric)
            loaded_datasets.append(hf_dataset)
        except Exception as e:
            # Best-effort: a shard that fails to load is logged and skipped.
            print(f"Error loading dataset from: {tar_file}")
            print(e)

    # NOTE(review): if every shard failed to load, loaded_datasets is empty and
    # pd.concat raises ValueError ("No objects to concatenate") — confirm whether
    # that is the intended failure mode.
    hf_dataset = pd.concat(loaded_datasets, ignore_index=True)
    print(f"Concatenated datasets with {len(hf_dataset)} samples")

    hf_dataset = hf_dataset[hf_dataset['__key__'].isin(question_answer_df['__key__'].unique())] # Filter samples that are not present in q_a_df
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



create_only_with_pdfs/load_data.py [52:75]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            }    
    except Exception as e:
        print(f"Error processing group {key}: {e}")
        return None

def process_tar_index(tar_index, step_size, question_answer_df):
    """Load `step_size` consecutive webdataset tar shards starting at `tar_index`,
    concatenate them into one DataFrame, and keep only rows whose `__key__` is
    present in `question_answer_df`.

    NOTE(review): this excerpt is truncated — the function continues past the
    visible lines (no return statement shown), so the full contract cannot be
    documented here. This copy is a near-duplicate of the version in
    clean_and_create/load_data.py; consider deduplicating into a shared helper.

    Args:
        tar_index: Index of the first tar shard to load; presumably an int — TODO confirm.
        step_size: Number of consecutive shards to load (and shard-group divisor).
        question_answer_df: DataFrame with a `__key__` column used to filter samples.
    """
    # Shard-group number; not used in the visible portion — presumably used
    # further down in the truncated remainder of the function.
    shard_nr = tar_index//step_size
    loaded_datasets = []

    for inner_idx in range(step_size):
        # Build the path of the (tar_index + inner_idx)-th shard from the
        # module-level DATA_PATH / TAR_FILE_PATTERN constants (defined elsewhere).
        tar_file = os.path.join(DATA_PATH, TAR_FILE_PATTERN.format(tar_index+inner_idx))
        try:
            print(f"Loading dataset from: {tar_file}")
            # Load the webdataset shard via HF datasets and convert to pandas.
            hf_dataset = datasets.load_dataset('webdataset', split='train', data_files=tar_file, cache_dir="/fsx/.cache").to_pandas()
            # Coerce the __key__ column to numeric so it can be matched against
            # question_answer_df keys. NOTE(review): attribute-style assignment to
            # a DataFrame column named `__key__` works but is fragile; bracket
            # indexing (hf_dataset['__key__'] = ...) would be safer.
            hf_dataset.__key__ = hf_dataset.__key__.apply(pd.to_numeric)
            loaded_datasets.append(hf_dataset)
        except Exception as e:
            # Best-effort: a shard that fails to load is logged and skipped.
            print(f"Error loading dataset from: {tar_file}")
            print(e)

    # NOTE(review): if every shard failed to load, loaded_datasets is empty and
    # pd.concat raises ValueError ("No objects to concatenate") — confirm whether
    # that is the intended failure mode.
    hf_dataset = pd.concat(loaded_datasets, ignore_index=True)
    print(f"Concatenated datasets with {len(hf_dataset)} samples")

    hf_dataset = hf_dataset[hf_dataset['__key__'].isin(question_answer_df['__key__'].unique())] # Filter samples that are not present in question_answer_df
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



