in src/pixparse/data/datasets_utils.py [0:0]
def get_additional_tokens_from_dataset(all_special_tokens:list, dataset=None, dataset_id:str="naver-clova-ix/cord-v2")->list:
"""
This util is made to run a first pass for CORD
with an instantiated tokenizer.
the additional tokens are returned as a list and can then be
added to your tokenizer and saved
to disk.
Usage:
# Instantiate tokenizer for your task
taskcfg = TaskCrullerPretrainCfg(model_name="cruller_base")
tokenizer = TokenizerHF(taskcfg.tokenizer)
all_special_tokens = tokenizer.trunk.all_special_tokens
new_special_tokens = get_additional_tokens_from_dataset(all_special_tokens, dataset_id="naver-clova-ix/cord-v2")
# Now you can add the tokens
newly_added_num = tokenizer.trunk.add_special_tokens(
{"additional_special_tokens": sorted(set(new_special_tokens))}
)
# You can resize the embeddings of your text decoder accordingly
if newly_added_num > 0:
model.text_decoder.trunk.resize_token_embeddings(
len(tokenizer.trunk)
)
# now your tokenizer will parse correctly the dataset.
"""
if dataset_id == "naver-clova-ix/cord-v2":
def collate_fn(batch):
"""
basic collator for PIL images, as returned by rvlcdip dataloader (among others)
"""
text_inputs = [
literal_eval(item["ground_truth"])["gt_parse"] for item in batch
]
return {"label": text_inputs}
cord = load_dataset(dataset_id)
loader = DataLoader(cord["train"], batch_size=32, collate_fn=collate_fn)
new_special_tokens = []
for i, batch in enumerate(loader):
for text in batch['label']:
_, batch_special_tokens = json2token(text, all_special_tokens)
new_special_tokens += batch_special_tokens
new_special_tokens = list(set(new_special_tokens))
return new_special_tokens