in src/utils.py [0:0]
def df_to_dataset(
    df: pd.DataFrame,
    samples: int,
    output_path: "str | None" = '~/Desktop/dataset.parquet',
) -> "DatasetDict":
    """Build the train/eval/test ``DatasetDict`` from a labelled dataframe.

    Rows are routed to a split by their ``ml_dataset`` value (``'training'``,
    ``'validation'`` or ``'testing'``). Within each split every row whose
    ``labels`` value differs from ``NEGATIVE_LABEL`` is kept, the rows equal to
    ``NEGATIVE_LABEL`` are down-sampled, and the result is shuffled.

    Args:
        df: Source dataframe; must contain the ``ml_dataset`` and ``labels``
            columns.
        samples: Maximum number of negative rows to keep per split. When a
            split holds fewer negatives than this, all of them are kept
            instead of raising.
        output_path: Where the concatenated, sampled dataframe is written as
            parquet. Pass ``None`` to skip writing the file.

    Returns:
        A ``DatasetDict`` with ``'train'``, ``'eval'`` and ``'test'`` entries.
        The ``labels`` column of ``'train'`` is class-encoded and the resulting
        feature is applied to ``'eval'`` and ``'test'`` so that all three
        splits share one label mapping.
    """

    def fetch_df_by_type(df: pd.DataFrame, split: str, samples: int) -> pd.DataFrame:
        """Return the shuffled positives plus sampled negatives of one split."""
        in_split = df['ml_dataset'] == split
        is_negative = df['labels'] == NEGATIVE_LABEL

        positives = df[in_split & ~is_negative]
        negatives = df[in_split & is_negative]
        # DataFrame.sample raises when asked for more rows than exist, so cap
        # the request at what this split actually contains.
        n_negatives = min(samples, len(negatives))
        negatives = negatives.sample(n=n_negatives, random_state=seed)

        combined = pd.concat([positives, negatives], axis=0).reset_index(drop=True)
        # Seed the shuffle as well, otherwise the row order differs between
        # runs even though the negative sampling is reproducible.
        # NOTE(review): the shuffled frame keeps a non-range index, which
        # Dataset.from_pandas stores as an extra `__index_level_0__` column by
        # default. Left as-is in case callers rely on it - confirm.
        return combined.sample(frac=1, random_state=seed)

    df_train = fetch_df_by_type(df, 'training', samples)
    df_eval = fetch_df_by_type(df, 'validation', samples)
    df_test = fetch_df_by_type(df, 'testing', samples)

    if output_path is not None:
        print(f'Saving dataframe to {output_path}...')
        df_combined = pd.concat([df_train, df_eval, df_test]).reset_index(drop=True)
        df_combined.to_parquet(output_path, index=False)

    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(df_train)
    ds['eval'] = Dataset.from_pandas(df_eval)
    ds['test'] = Dataset.from_pandas(df_test)

    # Derive the label feature from the training split only, then reuse it for
    # the other splits so the label mapping is identical everywhere.
    ds['train'] = ds['train'].class_encode_column('labels')
    class_label_feature = ds['train'].features['labels']
    ds['test'] = ds['test'].cast_column('labels', class_label_feature)
    ds['eval'] = ds['eval'].cast_column('labels', class_label_feature)
    return ds