in dynalab/handler.py [0:0]
def preprocess_one(self, sample) -> dict:
    """
    Preprocess one sample into a format that the model can do inference on.

    Args:
        sample: mapping that provides the keys ``"sourceText"``,
            ``"sourceLanguage"`` and ``"targetLanguage"``.

    Returns:
        A dict with three entries:

        * ``"src_tokens"``: the source language token, followed by the
          tokenized source text, followed by the vocabulary EOS id.
        * ``"src_length"``: the number of entries in ``"src_tokens"``.
        * ``"tgt_token"``: the target language token.
    """
    # TODO: this doesn't seem to produce good results. wrong EOS / BOS ?
    tokens = self.tokenize(sample["sourceText"])
    src_token = self.lang_token(sample["sourceLanguage"])
    tgt_token = self.lang_token(sample["targetLanguage"])

    # `tokens` must be a list here: it is concatenated with list literals.
    src_tokens = [src_token] + tokens + [self.vocab.eos()]
    return {
        "src_tokens": src_tokens,
        # Derive the length from the assembled sequence so that it always
        # accounts for both the language token and the EOS marker.
        "src_length": len(src_tokens),
        "tgt_token": tgt_token,
    }