def generate_processed_stream()

in src/data.py [0:0]


    def generate_processed_stream(self):
        """Yield tokenized training samples from the data directory.

        For each raw utterance, parses out metadata and text, encodes the
        text twice to build aligned teacher-forcing tensors (SOS-prefixed
        input, EOS-suffixed output), and tensorizes any metadata fields.

        Yields:
            dict with keys:
                "input":    tensor of token ids starting with SOS.
                "output":   tensor of token ids ending with EOS.
                "md":       dict mapping metadata name -> tensor, or None
                            when the utterance carries no metadata.
                "text_len": scalar tensor, length of the input sequence.
                "md_len":   dict mapping metadata name -> scalar length
                            tensor, or None when "md" is None.
        """
        for utterance in get_next_utterance(self.data_directory):
            md_dict, text = self.md_transformer.parse_raw_input(utterance)
            # Renamed from `input` to avoid shadowing the builtin; the
            # yielded dict key "input" is unchanged for callers.
            input_ids = torch.tensor(self.tokenizer.encode_text(text, add_sos=True))
            output_ids = torch.tensor(self.tokenizer.encode_text(text, add_eos=True))
            # input/output differ only by SOS vs. EOS, so one length
            # describes both sequences.
            text_len = torch.tensor(len(input_ids))

            if md_dict:
                md = {}
                md_len = {}
                for md_name, md_value in md_dict.items():
                    if isinstance(md_value, torch.Tensor):
                        # NOTE(review): metadata that arrives pre-tensorized
                        # is assigned a fixed length of 1 regardless of its
                        # actual size — confirm this is intentional for
                        # downstream padding/collation.
                        md_value_len = torch.tensor(1)
                    else:
                        # Non-tensor metadata is assumed to be encodable
                        # text; tokenize it and record its true length.
                        md_value = torch.tensor(self.tokenizer.encode_text(md_value))
                        md_value_len = torch.tensor(len(md_value))

                    md[md_name] = md_value
                    md_len[md_name] = md_value_len
            else:
                md = None
                md_len = None

            yield {
                "input": input_ids,
                "output": output_ids,
                "md": md,
                "text_len": text_len,
                "md_len": md_len,
            }