in dpr_scale/data_prep/prep_conv_datasets.py [0:0]
def prep_dpr_ubuntuv2(infile, outfile):
    """Convert an Ubuntu Dialogue Corpus v2 CSV into DPR-style JSON lines.

    One JSON object is written per CSV row, with the keys ``question``,
    ``answers`` (always empty), ``positive_ctxs`` and ``hard_negative_ctxs``.

    The split is detected from the file name: a file named ``train`` or
    ``train.csv`` is treated as the training split, which only carries
    positive samples (columns ``Context`` and ``Utterance``). Any other file
    is expected to have the columns ``Context``, ``Ground Truth Utterance``
    and ``Distractor_0`` .. ``Distractor_8``.

    Args:
        infile: Path of the CSV file to read.
        outfile: Path of the JSON-lines file to write (overwritten).

    Returns:
        The number of samples (rows) written to ``outfile``.

    Raises:
        KeyError: If an expected column is missing from the CSV.
    """
    num_distractors = 9
    num_samples = 0
    df = pd.read_csv(infile)

    # NOTE: str.rstrip(".csv") strips any trailing run of the characters
    # '.', 'c', 's', 'v' rather than the suffix, so e.g. "trains.csv" would
    # also collapse to "train". Remove the extension explicitly instead.
    base_name = os.path.basename(infile)
    if base_name.endswith(".csv"):
        base_name = base_name[: -len(".csv")]
    is_train = base_name == "train"

    with open(outfile, "w") as fout:
        # Context,Ground Truth Utterance,Distractor_0,Distractor_1,...,Distractor_8
        for _, row in tqdm(df.iterrows(), total=len(df)):
            question = row["Context"]
            if is_train:  # train only has +ve samples.
                pos_ctxs = [get_ctx(row["Utterance"])]
                neg_ctxs = []
            else:
                pos_ctxs = [get_ctx(row["Ground Truth Utterance"])]
                # A missing distractor column raises KeyError here, so the
                # list always has exactly ``num_distractors`` entries.
                neg_ctxs = [
                    get_ctx(row[f"Distractor_{k}"])
                    for k in range(num_distractors)
                ]
            out_json = ujson.dumps(
                {
                    "question": question,
                    "answers": [],
                    "positive_ctxs": pos_ctxs,
                    "hard_negative_ctxs": neg_ctxs,
                }
            )
            fout.write(f"{out_json}\n")
            num_samples += 1
    return num_samples