# def make_unlikelihood_dataset()
#
# in preprocess/evaluate_hypo.py [0:0]

def make_unlikelihood_dataset(args):
    """Build an unlikelihood-training dataset from scored hypothesis files.

    Reads per-example scores from an index file (chosen by ``args.score_type``:
    'lm', 'f1', or 'rouge') under ``args.base_dir/args.sub_dir``, optionally
    merges them with a second set of scores/hypotheses, selects the
    ``args.unlike_select_ratio`` fraction of examples with the lowest (or, with
    ``args.select_highest``, highest) scores, and writes ``train.source`` /
    ``train.untarget`` (and, unless ``args.make_only_untarget``, a matching
    ``train.target``) into the resolved output director(ies).

    Args:
        args: argparse.Namespace providing (at least) base_dir, sub_dir,
            pattern, score_type, metric_choice, unlike_select_ratio,
            target_select_ratio, select_highest, separate_target_untarget,
            output_dir, target_index_file, additional_index_file,
            additional_eval_patterns, make_only_untarget.

    Raises:
        Exception: if ``args.score_type`` is not one of 'lm'/'f1'/'rouge', or
            if a separate target/untarget dataset is requested for 'rouge'.
        AssertionError: if ``args.unlike_select_ratio`` is outside (0, 1], the
            metric choice is invalid for 'lm', index files are missing, or the
            merged score/hypo lists disagree in length.
    """
    print('Entering make_unlikelihood_dataset')
    assert args.unlike_select_ratio > 0.0 and args.unlike_select_ratio <= 1.0

    # --- resolve index file name, metric, and output director(ies) ---
    if args.score_type == 'lm':
        index_file_name = 'untarget.index'
        if args.separate_target_untarget:
            # Two separate output dirs: one for kept targets, one for
            # selected untargets (directory names embed the percent ratios).
            target_dir = os.path.join(args.base_dir, args.sub_dir, 'target-{}'.format(
                int(args.target_select_ratio * 100)))
            untarget_dir = os.path.join(args.base_dir, args.sub_dir, 'untarget-{}'.format(
                int(args.unlike_select_ratio * 100)))
        elif args.output_dir:
            out_dir = args.output_dir
        elif args.target_select_ratio > 0.0:
            out_dir = os.path.join(args.base_dir, args.sub_dir, 'lm-{}-{}'.format(int(args.unlike_select_ratio * 100),
                                                                                  int(args.target_select_ratio * 100)))
        else:
            out_dir = os.path.join(args.base_dir, args.sub_dir, 'lm-{}'.format(int(args.unlike_select_ratio * 100)))
        assert args.metric_choice == 'ans_lm' or args.metric_choice == 'eval_ns-ns'
        metric_choice = args.metric_choice
        print("Making unlikelihood dataset using {}".format(metric_choice))

    elif args.score_type == 'f1':
        index_file_name = 'untarget_f1.index'
        metric_choice = 'lm_f1'
        if args.separate_target_untarget:
            target_dir = os.path.join(args.base_dir, args.sub_dir, 'f1-target-{}'.format(
                int(args.target_select_ratio * 100)))
            untarget_dir = os.path.join(args.base_dir, args.sub_dir, 'f1-untarget-{}'.format(
                int(args.unlike_select_ratio * 100)))
        elif args.output_dir:
            out_dir = args.output_dir
        elif args.target_select_ratio > 0.0:
            out_dir = os.path.join(args.base_dir, args.sub_dir, 'f1-{}-{}'.format(int(args.unlike_select_ratio * 100),
                                                                                  int(args.target_select_ratio * 100)))
        else:
            out_dir = os.path.join(args.base_dir, args.sub_dir, 'f1-{}'.format(int(args.unlike_select_ratio * 100)))
        print("Making unlikelihood dataset using {}".format(metric_choice))
    elif args.score_type == 'rouge':
        index_file_name = 'untarget_rouge.index'
        metric_choice = 'rouge'
        if args.separate_target_untarget:
            raise Exception("We do not support separate target-untarget dataset for rouge!")
        elif args.output_dir:
            out_dir = args.output_dir
        else:
            # NOTE(review): both format slots use unlike_select_ratio here,
            # while the 'lm'/'f1' branches put target_select_ratio in the
            # second slot — confirm this duplication is intentional before
            # relying on the directory name.
            out_dir = os.path.join(args.base_dir, args.sub_dir, 'rouge-{}-{}'.format(int(args.unlike_select_ratio * 100),
                                                                                  int(args.unlike_select_ratio * 100)))
        print("Making unlikelihood dataset using {}".format(metric_choice))
    else:
        raise Exception("Please specify score_type!")

    # --- read (optional) target scores used to additionally filter targets ---
    target_scores = []
    if args.target_index_file:
        assert os.path.exists(args.target_index_file)
        count_1e5 = 0  # counts sentinel scores (either +1e5 or -1e5)
        with open(args.target_index_file, 'r') as ind_f:
            for index_line in ind_f:
                index_dict = json.loads(index_line.strip())
                target_scores.append(index_dict['avg_value'][metric_choice])
                if target_scores[-1] == 1e5 or target_scores[-1] == -1e5:
                    count_1e5 += 1
        print("Read {} lines from {}, {} scores are 1e5".format(len(target_scores), args.target_index_file,
                                                                count_1e5))

    # --- read per-example scores and the per-example hypothesis index ---
    index_file = os.path.join(args.base_dir, args.sub_dir, index_file_name)
    assert os.path.exists(index_file)
    scores = []
    select_hypo_ind = []
    with open(index_file, 'r') as ind_f:
        for index_line in ind_f:
            index_dict = json.loads(index_line.strip())
            scores.append(index_dict['avg_value'][metric_choice])
            select_hypo_ind.append(index_dict['avg'][metric_choice])
    print("Read {} lines from {}".format(len(scores), index_file))
    additional_scores = []
    additional_select_hypo_ind = []
    if args.additional_index_file:
        with open(args.additional_index_file, 'r') as ind_f:
            for index_line in ind_f:
                index_dict = json.loads(index_line.strip())
                additional_scores.append(index_dict['avg_value'][metric_choice])
                additional_select_hypo_ind.append(index_dict['avg'][metric_choice])
        print("Read {} lines from {}".format(len(additional_scores), args.additional_index_file))

    # --- load hypothesis texts selected by the index files ---
    lm_files = sorted(list(
        glob.glob(os.path.join(args.base_dir, args.sub_dir, args.pattern))))
    hypos = _load_hypos(lm_files, select_hypo_ind)

    if additional_scores:
        # Merge the two hypothesis sets example-by-example, keeping whichever
        # side wins per args.select_highest.
        assert args.additional_eval_patterns, "need to supply additional source_eval file patterns!"
        print("Loading additional hypos: {}".format(args.additional_eval_patterns))
        additional_hypos = _load_hypos(sorted(list(glob.glob(args.additional_eval_patterns))),
                                       additional_select_hypo_ind)
        merged_scores = []
        merged_hypos = []
        assert len(scores) == len(additional_scores) == len(hypos) == len(additional_hypos)
        if args.select_highest:
            for s, a_s, h, a_h in zip(scores, additional_scores, hypos, additional_hypos):
                if s > a_s:
                    merged_hypos.append(h)
                    merged_scores.append(s)
                else:
                    merged_hypos.append(a_h)
                    merged_scores.append(a_s)
        else:
            for s, a_s, h, a_h in zip(scores, additional_scores, hypos, additional_hypos):
                if s < a_s:
                    merged_hypos.append(h)
                    merged_scores.append(s)
                else:
                    merged_hypos.append(a_h)
                    merged_scores.append(a_s)
    else:
        merged_scores = scores
        merged_hypos = hypos

    # --- pick the unlike_select_ratio fraction of examples by score ---
    if args.unlike_select_ratio == 1.0:
        selected_example_ind = list(range(len(merged_scores)))
    else:
        if not args.select_highest:
            sorted_example_ind = [i[0] for i in sorted(enumerate(merged_scores), key=lambda x: x[1])]
        else:
            sorted_example_ind = [i[0] for i in sorted(enumerate(merged_scores), key=lambda x: -x[1])]
        selected_example_ind = sorted_example_ind[:round(args.unlike_select_ratio * len(merged_scores))]

    selected_target_ind = []
    if target_scores:
        if args.target_select_ratio == 1.0:
            selected_target_ind = list(range(len(target_scores)))
        else:
            # Sort descending, except the +1e5 sentinel keeps its positive key
            # so sentinel-scored examples sink to the end of the ranking.
            sorted_target_ind = [i[0] for i in sorted(enumerate(target_scores), key=lambda x: -x[1] if x[1] != 1e5 else x[1])]
            selected_target_ind = sorted_target_ind[:round(args.target_select_ratio * len(target_scores))]
        print("top 5 selected target indices:", [i for i in selected_target_ind[:5]])
        print("top 5 selected target scores:", [target_scores[i] for i in selected_target_ind[:5]])
        print("bottom 5 selected target indices:", [i for i in selected_target_ind[-5:]])
        print("bottom 5 selected target scores:", [target_scores[i] for i in selected_target_ind[-5:]])
        if not args.separate_target_untarget:
            # Joint output: keep only examples that pass BOTH selections.
            selected_example_ind = list(set(selected_example_ind).intersection(set(selected_target_ind)))

    # --- resolve output file paths and create the directories ---
    output_source_target = ""
    if args.separate_target_untarget:
        # makedirs(exist_ok=True) avoids the check-then-create race and also
        # creates any missing parent directories.
        os.makedirs(target_dir, exist_ok=True)
        os.makedirs(untarget_dir, exist_ok=True)
        output_untarget = os.path.join(untarget_dir, 'train.untarget')
        output_target = os.path.join(target_dir, 'train.target')
        output_source_untarget = os.path.join(untarget_dir, 'train.source')
        output_source_target = os.path.join(target_dir, 'train.source')
    else:
        os.makedirs(out_dir, exist_ok=True)
        output_untarget = os.path.join(out_dir, 'train.untarget')
        output_target = os.path.join(out_dir, 'train.target')
        output_source_untarget = os.path.join(out_dir, 'train.source')

    # --- write the selected examples ---
    source_file = os.path.join(args.base_dir, 'train.source')
    with open(source_file, 'r') as f:
        source_lines = f.readlines()
    if not args.make_only_untarget:
        target_file = os.path.join(args.base_dir, 'train.target')
        with open(target_file, 'r') as f:
            target_lines = f.readlines()

        if args.separate_target_untarget:
            with open(output_untarget, 'w') as out_untarget_f, \
                    open(output_source_untarget, 'w') as out_source_f:
                for ind in selected_example_ind:
                    out_source_f.write(source_lines[ind])
                    out_untarget_f.write(merged_hypos[ind] + '\n')
            print('Wrote {} examples to {} and {}'.format(len(selected_example_ind), output_untarget,
                                                          output_source_untarget))

            with open(output_target, 'w') as out_target_f, \
                    open(output_source_target, 'w') as out_source_f:
                for ind in selected_target_ind:
                    out_source_f.write(source_lines[ind])
                    out_target_f.write(target_lines[ind])
            print('Wrote {} examples to {} and {}'.format(len(selected_target_ind), output_target,
                                                          output_source_target))
        else:
            with open(output_untarget, 'w') as out_untarget_f, \
                open(output_target, 'w') as out_target_f, \
                open(output_source_untarget, 'w') as out_source_f:
                for ind in selected_example_ind:
                    out_target_f.write(target_lines[ind])
                    out_source_f.write(source_lines[ind])
                    out_untarget_f.write(merged_hypos[ind] + '\n')
            print('Wrote {} examples to {} and {} and {}'.format(len(selected_example_ind), output_untarget,
                                                          output_target, output_source_untarget))
    else:
        with open(output_untarget, 'w') as out_untarget_f, \
            open(output_source_untarget, 'w') as out_source_f:
            for ind in selected_example_ind:
                out_source_f.write(source_lines[ind])
                out_untarget_f.write(merged_hypos[ind] + '\n')

        print('Wrote {} examples to {} and {}'.format(len(selected_example_ind), output_untarget,
                                                      output_source_untarget))