def prep_dpr_ubuntuv2()

in dpr_scale/data_prep/prep_conv_datasets.py [0:0]


def prep_dpr_ubuntuv2(infile, outfile):
    """Convert a conversational CSV file into DPR-style JSON lines.

    Each CSV row becomes one JSON object written on its own line with the
    keys ``question``, ``answers`` (always an empty list), ``positive_ctxs``
    and ``hard_negative_ctxs``.

    The columns read depend on the split, which is derived from the file
    name of ``infile``:

    * file stem ``train``: ``Context`` and ``Utterance``; no hard negatives
      are emitted.
    * any other file: ``Context``, ``Ground Truth Utterance`` and
      ``Distractor_0`` .. ``Distractor_8``; the nine distractors become the
      hard negatives.

    Args:
        infile: Path to the input CSV file.
        outfile: Path of the JSON-lines file to write (overwritten).

    Returns:
        The number of samples (rows) written to ``outfile``.

    Raises:
        KeyError: If a column required for the detected split is missing.
    """
    num_samples = 0
    df = pd.read_csv(infile)
    # Compare the file stem exactly. ``str.rstrip(".csv")`` strips any
    # trailing run of the characters ".", "c", "s", "v" rather than the
    # suffix, so a name like "trains.csv" would wrongly count as train.
    file_stem = os.path.splitext(os.path.basename(infile))[0]
    is_train = file_stem == "train"
    with open(outfile, "w") as fout:
        # Non-train columns:
        # Context,Ground Truth Utterance,Distractor_0,Distractor_1,...,Distractor_8
        for _, row in tqdm(df.iterrows(), total=len(df)):
            question = row["Context"]
            if is_train:  # train only has +ve samples.
                pos_ctxs = [get_ctx(row["Utterance"])]
                neg_ctxs = []
            else:
                pos_ctxs = [get_ctx(row["Ground Truth Utterance"])]
                # A missing distractor column raises KeyError here, so no
                # separate length check is needed.
                neg_ctxs = [get_ctx(row[f"Distractor_{k}"]) for k in range(9)]
            out_json = ujson.dumps(
                {
                    "question": question,
                    "answers": [],
                    "positive_ctxs": pos_ctxs,
                    "hard_negative_ctxs": neg_ctxs,
                }
            )
            fout.write(f"{out_json}\n")
            num_samples += 1
    return num_samples