# data_parsing.py

import os
import time
def main(args):
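    """Parse a Wiktionary XML dump into sense, example, and quotation files.

    Streams the dump line by line, collects the contents of each <page>
    block, and hands it to process_page; the resulting senses are split
    apart by post_processing and written as text files under args.save_dir.
    """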
    # load the Wiktionary dump data file
    start_time = time.time()
    # track parsing state within the file
    curr_page = []
    is_page = False
    # accumulate all sense entries for the dataset
    senses = []
    # scan through the file and process <page> blocks sequentially
    with open(args.wiki_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line == '<page>':
                is_page = True
            elif line == '</page>':
                s = process_page(curr_page)
                # process_page returns -1 for pages it cannot parse
                if s != -1:
                    senses.extend(s)
                is_page = False
                curr_page = []
            elif is_page and len(line) > 0:
                # collect non-empty lines inside the current <page> block
                curr_page.append(line)
    # report the number of senses parsed and the elapsed time in seconds
    print(len(senses), '{:.2f}'.format(time.time() - start_time))
    # post-processing: separate quotations and examples into their own lists
    senses, quotations, examples = post_processing(senses)
    # make the save dir if it doesn't exist
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    # save senses
    s_path = os.path.join(args.save_dir, 'senses.txt')
    save_senses(s_path, senses)
    # save examples
    e_path = os.path.join(args.save_dir, 'examples.txt')
    save_examples(e_path, examples)
    # save quotations
    q_path = os.path.join(args.save_dir, 'quotations.txt')
    save_quotations(q_path, quotations)
    return
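

# A minimal sketch of a command-line entry point for main(); the flag names
# --wiki_file and --save_dir are assumptions inferred from the attributes
# accessed above, not confirmed by the original file.
if __name__ == '__main__':
    import argparse  # local import: only this sketch needs it
    parser = argparse.ArgumentParser(
        description='Parse a Wiktionary dump into sense/example/quotation files.')
    parser.add_argument('--wiki_file', required=True,
                        help='path to the Wiktionary XML dump (assumed flag name)')
    parser.add_argument('--save_dir', required=True,
                        help='directory for output files (assumed flag name)')
    main(parser.parse_args())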