in scripts/decontaminate.py [0:0]
def cleanup(dataset: Dataset) -> Dataset:
initial_size = len(dataset)
contamination_cols = [col for col in dataset.column_names if col.startswith("contaminated_")]
for col in contamination_cols:
if col.startswith("contaminated_"):
size_prior = len(dataset)
dataset = dataset.filter(lambda x: not x[col], num_proc=8)
if len(dataset) < size_prior:
print(f"Removed {size_prior - len(dataset)} samples from '{col.replace('contaminated_', '')}'")
dataset = dataset.remove_columns(contamination_cols)
print(f"Initial size: {initial_size}, Final size: {len(dataset)}")
return dataset