filtering/deduplication/suffix_dedup.py (22 lines of code) (raw):

import argparse
import os
import sys

from datasets import load_dataset

# Make the local text-dedup checkout importable.
sys.path.append("/home/piktus_huggingface_co/lumi/text-dedup")
print(sys.path)

from text_dedup.suffix_array import suffix_array


def get_args():
    """
    CLI path, currently disabled (the whole body lives in this docstring):

    parser = argparse.ArgumentParser()
    parser.add_argument('--name', type=str, required=True, help="Path to the dataset you're using on the HF hub. Pass e.g. `csv` or `json` and `data_files=path_on_disk` to load something locally")
    parser.add_argument('--subset', type=str, default=None, help="Subset of the dataset you're using, if needed")
    parser.add_argument('--data_files', type=str, default=None, help="Path to the dataset on disk if using local files")
    parser.add_argument('--path_on_disk', type=str, required=True, help="Path to the Rust dedup implementation on your disk, see https://github.com/google-research/deduplicate-text-datasets")
    parser.add_argument('--cache_dir', type=str, required=True, help="Where all the suffix tree files will get built")
    return parser.parse_args()
    """


def generator_from_dataset(dataset):
    # Yield the raw text of each record for deduplication.
    for item in dataset:
        yield item["text"]


if __name__ == "__main__":
    # args = get_args()
    # dataset = load_dataset(args.name, args.subset, data_files=args.data_files, use_auth_token=True, split="train")
    # corpus = generator_from_dataset(dataset)

    ds = load_dataset("ola13/small-oscar", use_auth_token=os.environ.get("HUGGINGFACE_TOKEN"))

    # Suffix-array deduplication over the "text" column, backed by the Google
    # deduplicate-text-datasets Rust implementation checked out locally.
    deduplicator = suffix_array(
        ds["train"],
        dedup_name="test",
        k=10,
        merge_strategy="overlapping",
        google_repo_path="/home/piktus_huggingface_co/lumi/deduplicate-text-datasets/",
        output_dir="/mnt/disks/looking_glass_storage/dedup",
        column="text",
    )

    # suffix_array(k=10, merge_strategy='overlapping', google_repo_path=args.path_on_disk, cache_dir=args.cache_dir)
    # slices = deduplicator.fit_predict(corpus)
    # for sentence, intervals in zip(corpus, slices):
    #     print(sentence)
    #     print([sentence.encode('utf-8')[s].decode('utf-8', errors='ignore') for s in intervals])
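# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script; the token value, file paths,
# and flag values below are illustrative assumptions). With the hard-coded
# dataset and repo paths above, a run only needs the Hugging Face token in
# the environment:
#
#   HUGGINGFACE_TOKEN=<your_token> python filtering/deduplication/suffix_dedup.py
#
# To restore the CLI path instead, move the argparse code out of the
# get_args() docstring and re-enable the commented `args = get_args()` lines
# in __main__, then invoke e.g.:
#
#   python suffix_dedup.py --name json --data_files /path/to/data.jsonl \
#       --path_on_disk /path/to/deduplicate-text-datasets \
#       --cache_dir /tmp/suffix_dedup_cache
# ---------------------------------------------------------------------------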