in src/utils.py [0:0]
def generate_dataset(path: str, samples: int = 1500) -> Dataset:
    """
    Build one dataset from the training, validation and testing HTML splits.

    Each split is parsed into a dataframe with parse_html_to_dataframe, the
    three frames are stacked row-wise under a fresh 0..n-1 index, and the
    combined frame is converted with df_to_dataset.

    @param path: location handed to parse_html_to_dataframe together with
                 each split name (presumably the folder that holds the
                 training/validation/testing data -- confirm against
                 parse_html_to_dataframe)
    @param samples: forwarded unchanged to df_to_dataset; per the original
                    note, the number of negative samples to keep, because
                    of the large positive vs negative imbalance
    @return: Dataset produced by df_to_dataset
    """
    print('Generating dataframe from HTML files...')
    split_names = ('training', 'validation', 'testing')
    # Parse the splits in a fixed order so the row order of the combined
    # frame is deterministic.
    frames = [parse_html_to_dataframe(path, split) for split in split_names]
    # drop=True discards the per-split indices instead of keeping them as a
    # column.
    combined = pd.concat(frames, axis=0).reset_index(drop=True)

    print('Generating huggingface dataset from dataframe...')
    return df_to_dataset(combined, samples)