in src/data.py [0:0]
# Assumes `import torch` at module level; `get_next_utterance`, the tokenizer,
# and the metadata transformer are provided elsewhere in the repository.
def generate_processed_stream(self):
    """Yield one processed sample dict per raw utterance in the data directory."""
    for utterance in get_next_utterance(self.data_directory):
        # Split the raw utterance into a metadata dict and the plain text.
        md_dict, text = self.md_transformer.parse_raw_input(utterance)

        # Encode the text twice: the input sequence starts with SOS, the
        # target sequence ends with EOS (shifted for next-token prediction).
        input = torch.tensor(self.tokenizer.encode_text(text, add_sos=True))
        output = torch.tensor(self.tokenizer.encode_text(text, add_eos=True))
        text_len = torch.tensor(len(input))

        if md_dict:
            md = {}
            md_len = {}
            for curr_md_transform, curr_md in md_dict.items():
                if not isinstance(curr_md, torch.Tensor):
                    # Raw metadata strings are tokenized and their length recorded.
                    curr_md = torch.tensor(self.tokenizer.encode_text(curr_md))
                    curr_md_len = torch.tensor(len(curr_md))
                else:
                    # Pre-tensorized metadata is treated as a single element.
                    curr_md_len = torch.tensor(1)
                md[curr_md_transform] = curr_md
                md_len[curr_md_transform] = curr_md_len
        else:
            md = None
            md_len = None

        sample = {"input": input,
                  "output": output,
                  "md": md,
                  "text_len": text_len,
                  "md_len": md_len}
        yield sample
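
A minimal sketch of how the yielded samples could be batched downstream; the collate function and the synthetic samples here are illustrative assumptions, not part of src/data.py or its API. It pads the variable-length "input"/"output" tensors and stacks the lengths.

# Illustrative only: a hypothetical collate function for samples shaped like
# the generator's output (assumed, not taken from the repository).
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_samples(samples):
    """Pad variable-length token tensors and stack their lengths into a batch."""
    return {
        "input": pad_sequence([s["input"] for s in samples], batch_first=True),
        "output": pad_sequence([s["output"] for s in samples], batch_first=True),
        "text_len": torch.stack([s["text_len"] for s in samples]),
    }

# Two synthetic samples mimicking the generator's output (md omitted as None).
samples = [
    {"input": torch.tensor([1, 5, 7]), "output": torch.tensor([5, 7, 2]),
     "md": None, "text_len": torch.tensor(3), "md_len": None},
    {"input": torch.tensor([1, 9]), "output": torch.tensor([9, 2]),
     "md": None, "text_len": torch.tensor(2), "md_len": None},
]
print(collate_samples(samples)["input"].shape)  # torch.Size([2, 3])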