def generate_dataset()

in src/utils.py [0:0]


def generate_dataset(path: str, samples: int = 1500) -> Dataset:
    """
    Build one dataset from the HTML files of all three splits.

    Each split name ('training', 'validation', 'testing') is handed to
    parse_html_to_dataframe together with ``path``; the resulting frames
    are stacked row-wise, re-indexed from zero, and converted with
    df_to_dataset.

    @param path: folder path of dataset_type (training, validation, testing)
    @param samples: number of negative samples to use for training
                    (this is due to large positive vs negative imbalance);
                    forwarded unchanged to df_to_dataset
    @return: Dataset produced by df_to_dataset from the combined dataframe
    """
    print('Generating dataframe from HTML files...')
    split_names = ('training', 'validation', 'testing')
    frames = [parse_html_to_dataframe(path, split) for split in split_names]

    # Stack the per-split frames vertically, then discard their individual
    # indices so the combined frame has a single fresh 0..n-1 index.
    stacked = pd.concat(frames, axis=0)
    df_combined = stacked.reset_index(drop=True)

    print('Generating huggingface dataset from dataframe...')
    return df_to_dataset(df_combined, samples)