generation/boilerplate_cleanup.py (25 lines of code) (raw):

import re import argparse from datasets import load_dataset patterns = [ # alien stories r"^Hello.*?[.!]\s+", #r"^I'm( so)? excited to.*?[.!]\s+", r"^My name is.*?[.!]\s+", r"^You've just arrived.*?[.!]\s+", # wikihow r"^\*\*Welcome, .*?[.!]\*\*\s+", r"^(\*\*)?Warning:.*?[.!]\s+", r"^We're thrilled.*?[.!]\s+", # middle school r"^Welcome, .*?[.!]\s+", ] patterns = [re.compile(p, flags=re.IGNORECASE|re.MULTILINE) for p in patterns] def clean_text(sample): sample['completion_unfiltered'] = sample['completion'] for pattern in patterns: sample['completion'] = pattern.sub('', sample['completion'].strip()) return sample if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--dataset", type=str, default="HuggingFaceTB/alien_stories_0_1M_llama3") args = parser.parse_args() data = load_dataset(args.dataset, split="train", cache_dir="/scratch/cosmo/cache", num_proc=32) data = data.map(clean_text, num_proc=32) data.push_to_hub(args.dataset, private=True)