in preprocess/evaluate_hypo.py [0:0]
def _resolve_unlikelihood_layout(args):
    """Map ``args.score_type`` to the index file, metric key and output dirs.

    Returns:
        Tuple ``(index_file_name, metric_choice, out_dir, target_dir,
        untarget_dir)``. When ``args.separate_target_untarget`` is set,
        ``out_dir`` is ``None`` and the two separate directories are filled in;
        otherwise only ``out_dir`` is set.

    Raises:
        Exception: for an unknown ``score_type``, or when separate
            target/untarget output is requested for ``'rouge'``.
        AssertionError: when ``score_type == 'lm'`` and ``args.metric_choice``
            is neither ``'ans_lm'`` nor ``'eval_ns-ns'``.
    """
    def in_sub_dir(name):
        return os.path.join(args.base_dir, args.sub_dir, name)

    def pct(ratio):
        # Ratios are rendered as truncated integer percentages in dir names.
        return int(ratio * 100)

    out_dir = target_dir = untarget_dir = None

    if args.score_type in ('lm', 'f1'):
        if args.score_type == 'lm':
            index_file_name = 'untarget.index'
            target_tpl, untarget_tpl = 'target-{}', 'untarget-{}'
            both_tpl, single_tpl = 'lm-{}-{}', 'lm-{}'
        else:
            index_file_name = 'untarget_f1.index'
            target_tpl, untarget_tpl = 'f1-target-{}', 'f1-untarget-{}'
            both_tpl, single_tpl = 'f1-{}-{}', 'f1-{}'

        # The ratios are only evaluated in the branch that needs them, so an
        # explicit output_dir never touches target_select_ratio.
        if args.separate_target_untarget:
            target_dir = in_sub_dir(target_tpl.format(pct(args.target_select_ratio)))
            untarget_dir = in_sub_dir(untarget_tpl.format(pct(args.unlike_select_ratio)))
        elif args.output_dir:
            out_dir = args.output_dir
        elif args.target_select_ratio > 0.0:
            out_dir = in_sub_dir(both_tpl.format(pct(args.unlike_select_ratio),
                                                 pct(args.target_select_ratio)))
        else:
            out_dir = in_sub_dir(single_tpl.format(pct(args.unlike_select_ratio)))

        if args.score_type == 'lm':
            assert args.metric_choice in ('ans_lm', 'eval_ns-ns'), \
                "metric_choice must be 'ans_lm' or 'eval_ns-ns' for score_type 'lm'"
            metric_choice = args.metric_choice
        else:
            metric_choice = 'lm_f1'
    elif args.score_type == 'rouge':
        index_file_name = 'untarget_rouge.index'
        metric_choice = 'rouge'
        if args.separate_target_untarget:
            raise Exception("We do not support seperate target-untarget dataset for rouge!")
        elif args.output_dir:
            out_dir = args.output_dir
        else:
            # NOTE(review): unlike_select_ratio is formatted twice here, whereas
            # the 'lm'/'f1' layouts put target_select_ratio second. Kept as-is
            # so existing output paths do not move -- confirm the intent.
            out_dir = in_sub_dir('rouge-{}-{}'.format(pct(args.unlike_select_ratio),
                                                      pct(args.unlike_select_ratio)))
    else:
        raise Exception("Please specify score_type!")

    print("Making unlikelihood dataset using {}".format(metric_choice))
    return index_file_name, metric_choice, out_dir, target_dir, untarget_dir


def _read_index_file(index_path, metric_choice, with_hypo_ind=True):
    """Read a JSON-lines index file.

    Each line is parsed as JSON. ``record['avg_value'][metric_choice]`` is
    collected as the score and, when ``with_hypo_ind`` is true,
    ``record['avg'][metric_choice]`` is collected alongside it.

    Returns:
        Tuple ``(scores, hypo_ind)``; ``hypo_ind`` is empty when
        ``with_hypo_ind`` is false.
    """
    scores = []
    hypo_ind = []
    with open(index_path, 'r') as ind_f:
        for index_line in ind_f:
            index_dict = json.loads(index_line.strip())
            scores.append(index_dict['avg_value'][metric_choice])
            if with_hypo_ind:
                hypo_ind.append(index_dict['avg'][metric_choice])
    return scores, hypo_ind


def _rank_and_truncate(values, ratio, sort_key):
    """Return the first ``round(ratio * len(values))`` indices of ``values``.

    Indices are ordered by ``sort_key``, which is applied to
    ``(index, value)`` pairs.
    """
    ranked = [pair[0] for pair in sorted(enumerate(values), key=sort_key)]
    return ranked[:round(ratio * len(values))]


def _write_lines(path, lines):
    """Write an iterable of already newline-terminated strings to ``path``."""
    with open(path, 'w') as out_f:
        out_f.writelines(lines)


def make_unlikelihood_dataset(args):
    """Build the ``train.source`` / ``train.untarget`` (/ ``train.target``) files.

    Steps:
      1. Pick the index file name, metric key and output directories from
         ``args.score_type`` (``'lm'``, ``'f1'`` or ``'rouge'``).
      2. Read per-example scores from ``<base_dir>/<sub_dir>/<index file>``,
         optionally from ``args.additional_index_file`` and
         ``args.target_index_file`` as well.
      3. Load hypotheses through ``_load_hypos``. If additional scores were
         read, keep per example whichever hypothesis has the strictly better
         score (higher when ``args.select_highest``, lower otherwise; ties go
         to the additional hypothesis).
      4. Keep the ``unlike_select_ratio`` fraction of examples, lowest scores
         first unless ``args.select_highest``. If target scores were read, keep
         the ``target_select_ratio`` fraction with the highest target scores.
         In combined (non-separate) mode, intersect the two selections.
      5. Write the selected lines of ``<base_dir>/train.source``, the selected
         hypotheses and, unless ``args.make_only_untarget``, the selected lines
         of ``<base_dir>/train.target``.

    Args:
        args: namespace providing ``unlike_select_ratio``, ``target_select_ratio``,
            ``score_type``, ``metric_choice``, ``separate_target_untarget``,
            ``output_dir``, ``base_dir``, ``sub_dir``, ``pattern``,
            ``target_index_file``, ``additional_index_file``,
            ``additional_eval_patterns``, ``select_highest`` and
            ``make_only_untarget``.

    Raises:
        AssertionError: if ``unlike_select_ratio`` is outside ``(0, 1]``, an
            index file is missing, or the primary/additional scores and
            hypotheses differ in length.
        Exception: for an unsupported ``score_type`` configuration.
    """
    print('Entering make_unlikelihood_dataset')
    assert args.unlike_select_ratio > 0.0 and args.unlike_select_ratio <= 1.0, \
        "unlike_select_ratio must be in (0, 1]"

    index_file_name, metric_choice, out_dir, target_dir, untarget_dir = \
        _resolve_unlikelihood_layout(args)

    # read and sort scores from index file:
    target_scores = []
    if args.target_index_file:
        assert os.path.exists(args.target_index_file), \
            "target index file not found: {}".format(args.target_index_file)
        target_scores, _ = _read_index_file(args.target_index_file, metric_choice,
                                            with_hypo_ind=False)
        # +/-1e5 is presumably a "no valid score" sentinel written upstream
        # (TODO confirm); here it is only counted for the log line.
        count_1e5 = sum(1 for score in target_scores if score == 1e5 or score == -1e5)
        print("Read {} lines from {}, {} scores are 1e5".format(len(target_scores), args.target_index_file,
                                                                count_1e5))

    index_file = os.path.join(args.base_dir, args.sub_dir, index_file_name)
    assert os.path.exists(index_file), "index file not found: {}".format(index_file)
    scores, select_hypo_ind = _read_index_file(index_file, metric_choice)
    print("Read {} lines from {}".format(len(scores), index_file))

    additional_scores = []
    additional_select_hypo_ind = []
    if args.additional_index_file:
        additional_scores, additional_select_hypo_ind = _read_index_file(
            args.additional_index_file, metric_choice)
        print("Read {} lines from {}".format(len(additional_scores), args.additional_index_file))

    # load hypos
    lm_files = sorted(glob.glob(os.path.join(args.base_dir, args.sub_dir, args.pattern)))
    hypos = _load_hypos(lm_files, select_hypo_ind)
    if additional_scores:
        assert args.additional_eval_patterns, "need to supply additional source_eval file patterns!"
        print("Loading additional hypos: {}".format(args.additional_eval_patterns))
        additional_hypos = _load_hypos(sorted(glob.glob(args.additional_eval_patterns)),
                                       additional_select_hypo_ind)
        assert len(scores) == len(additional_scores) == len(hypos) == len(additional_hypos), \
            "primary and additional scores/hypos must all have the same length"
        merged_scores = []
        merged_hypos = []
        for s, a_s, h, a_h in zip(scores, additional_scores, hypos, additional_hypos):
            # Strict comparison: on a tie the additional hypothesis is kept.
            primary_is_better = (s > a_s) if args.select_highest else (s < a_s)
            if primary_is_better:
                merged_hypos.append(h)
                merged_scores.append(s)
            else:
                merged_hypos.append(a_h)
                merged_scores.append(a_s)
    else:
        merged_scores = scores
        merged_hypos = hypos

    # A ratio of exactly 1.0 keeps every example in file order (no sorting).
    if args.unlike_select_ratio == 1.0:
        selected_example_ind = list(range(len(merged_scores)))
    else:
        selected_example_ind = _rank_and_truncate(
            merged_scores, args.unlike_select_ratio,
            (lambda pair: -pair[1]) if args.select_highest else (lambda pair: pair[1]))

    selected_target_ind = []
    if target_scores:
        if args.target_select_ratio == 1.0:
            selected_target_ind = list(range(len(target_scores)))
        else:
            # Highest target score first; both +1e5 and -1e5 map to key 1e5,
            # which places them after every score greater than -1e5.
            selected_target_ind = _rank_and_truncate(
                target_scores, args.target_select_ratio,
                lambda pair: -pair[1] if pair[1] != 1e5 else pair[1])
        print("top 5 selected target indices:", [i for i in selected_target_ind[:5]])
        print("top 5 selected target scores:", [target_scores[i] for i in selected_target_ind[:5]])
        print("bottom 5 selected target indices:", [i for i in selected_target_ind[-5:]])
        print("bottom 5 selected target scores:", [target_scores[i] for i in selected_target_ind[-5:]])
        if not args.separate_target_untarget:
            # Combined output keeps only examples chosen by both criteria.
            # The resulting order is set iteration order, not score order.
            selected_example_ind = list(set(selected_example_ind).intersection(set(selected_target_ind)))

    # write output files
    if args.separate_target_untarget:
        os.makedirs(target_dir, exist_ok=True)
        os.makedirs(untarget_dir, exist_ok=True)
        output_untarget = os.path.join(untarget_dir, 'train.untarget')
        output_target = os.path.join(target_dir, 'train.target')
        output_source_untarget = os.path.join(untarget_dir, 'train.source')
        output_source_target = os.path.join(target_dir, 'train.source')
    else:
        os.makedirs(out_dir, exist_ok=True)
        output_untarget = os.path.join(out_dir, 'train.untarget')
        output_target = os.path.join(out_dir, 'train.target')
        output_source_untarget = os.path.join(out_dir, 'train.source')

    # Read every input before any output file is opened for writing.
    source_file = os.path.join(args.base_dir, 'train.source')
    with open(source_file, 'r') as f:
        source_lines = f.readlines()
    target_lines = None
    if not args.make_only_untarget:
        target_file = os.path.join(args.base_dir, 'train.target')
        with open(target_file, 'r') as f:
            target_lines = f.readlines()

    # Source lines and hypotheses for the selected examples are always written.
    _write_lines(output_source_untarget, (source_lines[ind] for ind in selected_example_ind))
    _write_lines(output_untarget, (merged_hypos[ind] + '\n' for ind in selected_example_ind))

    if target_lines is not None and not args.separate_target_untarget:
        # Combined mode: targets are aligned line-by-line with the untargets.
        _write_lines(output_target, (target_lines[ind] for ind in selected_example_ind))
        print('Wrote {} examples to {} and {} and {}'.format(len(selected_example_ind), output_untarget,
                                                             output_target, output_source_untarget))
        return

    print('Wrote {} examples to {} and {}'.format(len(selected_example_ind), output_untarget,
                                                  output_source_untarget))
    if target_lines is not None:
        # Separate mode: the target set has its own selection and directory.
        _write_lines(output_source_target, (source_lines[ind] for ind in selected_target_ind))
        _write_lines(output_target, (target_lines[ind] for ind in selected_target_ind))
        print('Wrote {} examples to {} and {}'.format(len(selected_target_ind), output_target,
                                                      output_source_target))