filtering/deduplication/suffix_dedup.py
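"""Exact-substring deduplication of a Hugging Face dataset via suffix arrays.

Thin driver around a local `text_dedup` checkout's `suffix_array` entry point,
which wraps Google's Rust deduplicator
(https://github.com/google-research/deduplicate-text-datasets). The repo and
output paths below are machine-specific; `k` is presumably the minimum length
of a duplicated span.
"""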
import argparse
import os
import sys
from datasets import load_dataset

# Make the local text-dedup checkout importable (environment-specific path).
sys.path.append("/home/piktus_huggingface_co/lumi/text-dedup")
# print(sys.path)  # debug: verify that the text-dedup path was picked up

from text_dedup.suffix_array import suffix_array

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", type=str, required=True, help="Path to the dataset you're using on the HF hub. Pass e.g. `csv` or `json` and `data_files=path_on_disk` to load something locally")
    parser.add_argument("--subset", type=str, default=None, help="Subset of the dataset you're using, if needed")
    parser.add_argument("--data_files", type=str, default=None, help="Path to the dataset on disk if using local files")
    parser.add_argument("--path_on_disk", type=str, required=True, help="Path to the Rust dedup implementation on your disk, see https://github.com/google-research/deduplicate-text-datasets")
    parser.add_argument("--cache_dir", type=str, required=True, help="Where all the suffix array files will get built")
    return parser.parse_args()

def generator_from_dataset(dataset):
    """Yield the raw text of each example."""
    for item in dataset:
        yield item["text"]

if __name__ == "__main__":
    # To run on an arbitrary dataset, re-enable the CLI path:
    # args = get_args()
    # dataset = load_dataset(args.name, args.subset, data_files=args.data_files, use_auth_token=True, split="train")
    # corpus = generator_from_dataset(dataset)
    ds = load_dataset("ola13/small-oscar", use_auth_token=os.environ.get("HUGGINGFACE_TOKEN"))
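    # Deduplicate the "train" split with the exact-substring method; `k` is
    # presumably the minimum duplicate span length in bytes, and intermediate
    # suffix-array files land under `output_dir`.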
    deduplicator = suffix_array(
        ds["train"],
        dedup_name="test",
        k=10,
        merge_strategy="overlapping",
        google_repo_path="/home/piktus_huggingface_co/lumi/deduplicate-text-datasets/",
        output_dir="/mnt/disks/looking_glass_storage/dedup",
        column="text",
    )
    # Alternative invocation driven by the CLI arguments:
    # suffix_array(k=10, merge_strategy="overlapping", google_repo_path=args.path_on_disk, cache_dir=args.cache_dir)
    # slices = deduplicator.fit_predict(corpus)
    # for sentence, intervals in zip(corpus, slices):
    #     print(sentence)
    #     print([sentence.encode("utf-8")[s].decode("utf-8", errors="ignore") for s in intervals])
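
# Example (hypothetical) invocation once the argparse path above is re-enabled;
# flags mirror get_args(), and the paths are machine-specific:
#   python filtering/deduplication/suffix_dedup.py \
#       --name ola13/small-oscar \
#       --path_on_disk /home/piktus_huggingface_co/lumi/deduplicate-text-datasets \
#       --cache_dir /mnt/disks/looking_glass_storage/dedup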