filtering/deduplication/dedup_oscar.py
import string

from datasets import load_from_disk

def find_whitespace(text):
    """Yield the index of every whitespace character in `text`."""
    for i, c in enumerate(text):
        if c in string.whitespace:
            yield i

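# Illustrative usage (a minimal sketch, not part of the original script):
#   list(find_whitespace("a b\tc")) == [1, 3]
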
def get_segmentation(text, passage_tokens, overlap_tokens):
    """Split `text` into (start, end) character spans of `passage_tokens`
    whitespace-delimited tokens, with consecutive passages sharing
    `overlap_tokens` tokens."""
    whitespace_idx = [-1] + list(find_whitespace(text))
    # Each passage starts `unique_tokens` tokens after the previous one,
    # so this assumes passage_tokens > overlap_tokens.
    unique_tokens = passage_tokens - overlap_tokens
    passages = []
    for i in range(0, len(whitespace_idx), unique_tokens):
        if i + passage_tokens >= len(whitespace_idx):
            # Final, possibly shorter passage: run to the end of the text.
            passages.append((whitespace_idx[i] + 1, len(text)))
            break
        passages.append((whitespace_idx[i] + 1, whitespace_idx[i + passage_tokens] + 1))
    return passages

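# A minimal usage sketch (toy input; the 3/1 token sizes are illustrative
# assumptions, not values used elsewhere in this script):
#   get_segmentation("a b c d e f", passage_tokens=3, overlap_tokens=1)
#   -> [(0, 6), (4, 10), (8, 11)]
# i.e. character spans covering "a b c ", "c d e " and "e f", with one token
# of overlap between consecutive passages.
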
if __name__ == "__main__":
    oscar = load_from_disk("/home/piktus_huggingface_co/lumi/preprocessed_data/oscar_025")["train"]
    with open("/home/piktus_huggingface_co/lumi/preprocessed_data/oscar_025/queries.txt", "w") as queries:
        for line in oscar:
            text = line["text"]
            whitespace_idx = [-1] + list(find_whitespace(text))
            # Emit a 100-character query starting right after each whitespace
            # character (and at position 0, thanks to the leading -1).
            for i in whitespace_idx:
                if i + 101 < len(text):
                    queries.write(text[i + 1:i + 101] + "\n")
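
# Note on the guard above (an observation from the code itself, not from any
# external spec): documents of 100 characters or fewer yield no queries, since
# no start index i can satisfy i + 101 < len(text).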