in src/tokenizer.py [0:0]
def _process_data_files(self, dir_path):
"""
Reads in data in self.data_path and writes out to utterance text to
files.
"""
line_count = 0
file_count = 0
curr_out_file_name = os.path.join(dir_path, f"processed_{file_count}.txt")
out_file = open(curr_out_file_name, "w")
    try:
        for utterance in get_next_utterance(self.data_path):
            _, text = self.md_transformer.parse_raw_input(utterance)
            line_count += 1
            # Roll over to a new output file every 20,000 utterances.
            if line_count % 20_000 == 0:
                line_count = 0
                file_count += 1
                curr_out_file_name = os.path.join(
                    dir_path, f"processed_{file_count}.txt"
                )
                out_file.close()
                out_file = open(curr_out_file_name, "w")
            out_file.write(text + '\n')
    finally:
        # Ensure the last (possibly partially filled) file is closed.
        out_file.close()