def _process_data_files()

in src/tokenizer.py


    def _process_data_files(self, dir_path):
        """
        Reads in data in self.data_path and writes out to utterance text to
        files.
        """
        line_count = 0
        file_count = 0

        curr_out_file_name = os.path.join(dir_path, f"processed_{file_count}.txt")
        out_file = open(curr_out_file_name, "w")
        try:
            for utterance in get_next_utterance(self.data_path):
                # Parse the raw utterance; keep only the plain text portion.
                _, text = self.md_transformer.parse_raw_input(utterance)
                out_file.write(text + '\n')
                line_count += 1

                # Roll over to a new output file every 20,000 utterances.
                if line_count == 20_000:
                    line_count = 0
                    file_count += 1
                    curr_out_file_name = os.path.join(
                        dir_path, f"processed_{file_count}.txt")
                    out_file.close()
                    out_file = open(curr_out_file_name, "w")
        finally:
            # Make sure the last (possibly partial) output file is closed.
            out_file.close()
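
For reference, the rolling-file split used above can be exercised in isolation. The sketch below is a standalone, assumption-laden version: the helper name split_into_chunks and its chunk_size parameter are invented for illustration and are not part of tokenizer.py. It writes an iterable of text lines into numbered processed_<n>.txt files, at most chunk_size lines each.

import os
import tempfile

def split_into_chunks(lines, dir_path, chunk_size=20_000):
    # Hypothetical standalone sketch of the rolling-file pattern in
    # _process_data_files: write `lines` into processed_<n>.txt files,
    # at most `chunk_size` lines per file.
    file_count = 0
    line_count = 0
    out_file = open(os.path.join(dir_path, f"processed_{file_count}.txt"), "w")
    try:
        for text in lines:
            out_file.write(text + "\n")
            line_count += 1
            if line_count == chunk_size:
                line_count = 0
                file_count += 1
                out_file.close()
                out_file = open(
                    os.path.join(dir_path, f"processed_{file_count}.txt"), "w")
    finally:
        out_file.close()

# Tiny demonstration: 45 lines with chunk_size=20 produce three files.
with tempfile.TemporaryDirectory() as out_dir:
    split_into_chunks((f"utterance {i}" for i in range(45)), out_dir, chunk_size=20)
    print(sorted(os.listdir(out_dir)))
    # ['processed_0.txt', 'processed_1.txt', 'processed_2.txt']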