in scripts/decontaminate.py [0:0]
def build_ngram_lookup(documents: list[str], ngram_size: int = 8) -> dict[str, set[int]]:
    """Index every n-gram by the documents it occurs in.

    Each document is passed through ``normalize_string`` and then split into
    n-grams by ``word_ngrams``; progress over ``documents`` is reported via
    ``tqdm``.

    Args:
        documents: Texts to index. A document's position in this list is the
            id recorded for it.
        ngram_size: Size forwarded to ``word_ngrams`` (defaults to 8).

    Returns:
        A mapping from n-gram to the set of document positions that produced
        it. The returned object is a ``collections.defaultdict(set)``, so
        subscripting it with an unseen key inserts an empty set; use ``in`` or
        ``.get`` for read-only membership checks.
    """
    index = collections.defaultdict(set)
    for position, raw_text in enumerate(tqdm(documents)):
        # A set per n-gram means repeats within one document collapse to one id.
        for gram in word_ngrams(normalize_string(raw_text), ngram_size):
            index[gram].add(position)
    return index