in evals/translators/nllb.py [0:0]
def translate(texts, tokenizer, model, target, batch_size=10):
    """Translate a sequence of texts into ``target`` using an NLLB model.

    Args:
        texts: Sequence of source-language strings.
        tokenizer: HuggingFace NLLB tokenizer; its ``additional_special_tokens``
            hold the NLLB language codes and ``lang_code_to_id`` maps them to
            token ids.
        model: Seq2seq model exposing ``generate``.
        target: Target language identifier. Resolved through ``LANG_CODE_MAP``
            first; otherwise matched by ISO-639-3 prefix against the
            tokenizer's language tokens.
        batch_size: Number of texts tokenized and generated per batch
            (defaults to 10, the previous hard-coded value).

    Returns:
        List of translated strings, in the same order as ``texts``.

    Raises:
        ValueError: If no NLLB language code matches ``target``, or if the
            ISO-639-3 prefix matches more than one code (ambiguous).
    """
    if target in LANG_CODE_MAP:
        lang_code = LANG_CODE_MAP[target]
    else:
        # Fall back to scanning the tokenizer's language tokens for ones
        # whose prefix is the ISO-639-3 code of `target`.
        iso3 = iso.iso3_code(target)
        matches = [
            lang
            for lang in tokenizer.additional_special_tokens
            if lang.startswith(iso3)
        ]
        # Explicit raises instead of `assert`: these are input-validation
        # checks and must survive `python -O`.
        if len(matches) > 1:
            raise ValueError(
                "Multiple NLLB language codes found for the same language ID, need to disambiguate!"
            )
        if not matches:
            raise ValueError(f"Lang code for {target} was not found")
        lang_code = matches[0]

    # Forcing the BOS token to the target language code is how NLLB selects
    # the output language during generation.
    forced_bos_token_id = tokenizer.lang_code_to_id[lang_code]

    results = []
    # Materialize the partitions so tqdm can show a total.
    for batch in tqdm(list(toolz.partition_all(batch_size, texts))):
        # `device` is a module-level global — presumably the model's device;
        # inputs must live there before generate().
        encoded = tokenizer(list(batch), return_tensors="pt", padding=True).to(device)
        generated = model.generate(
            **encoded, forced_bos_token_id=forced_bos_token_id
        )
        results.extend(tokenizer.batch_decode(generated, skip_special_tokens=True))
    return results