in decontamination/decontaminate.py [0:0]
def retrieve_ngrams_batch(batch, eval_ngrams, eval_datasets, eval_texts, ngram_len):
    """Collect the completions in a batch that share an n-gram with eval data.

    Each completion is passed through ``tokenize`` and then ``get_ngrams``.
    The resulting n-grams are checked in the order they are produced, and the
    scan of a completion stops at its first hit, so a completion contributes
    at most one row to the result.

    Args:
        batch: Mapping whose ``"completion"`` entry is an iterable of
            completions to check.
        eval_ngrams: Mapping from an n-gram to the index used to look up
            ``eval_datasets`` and ``eval_texts``.
        eval_datasets: Indexable collection; the entry at the matched index
            is reported under ``"bench_name"``.
        eval_texts: Indexable collection; the entry at the matched index is
            reported under ``"bench_text"``.
        ngram_len: Forwarded unchanged to ``get_ngrams``.

    Returns:
        A dict with the keys ``"completion"``, ``"ngram"``, ``"bench_name"``
        and ``"bench_text"``, each holding a list. Entries at the same
        position across the four lists describe one flagged completion.
    """
    flagged = []
    hits = []
    names = []
    texts = []

    for sample in batch["completion"]:
        for gram in get_ngrams(tokenize(sample), ngram_len):
            if gram not in eval_ngrams:
                continue
            source = eval_ngrams[gram]
            flagged.append(sample)
            hits.append(gram)
            names.append(eval_datasets[source])
            texts.append(eval_texts[source])
            # One match is enough to flag this completion; skip the rest.
            break

    return {
        "completion": flagged,
        "ngram": hits,
        "bench_name": names,
        "bench_text": texts,
    }