in dpr/data/tables.py [0:0]
def convert_train_jsonl_to_ctxmatch(path: str, out_file: str, shards_sz: int = 3000):
    """Convert a training JSONL file into sharded question/context JSONL files.

    Every record of ``path`` that has at least one positive context
    contributes one (question, table string) pair, built from the record's
    "question" value and its first "positive_ctxs" entry. The pairs are then
    written, in input order, to files named ``<out_file>.shard_<k>`` holding at
    most ``shards_sz`` records each. Each written record has the keys "id"
    (position of the pair in the overall list, so ids keep counting across
    shards), "question" and "context".

    Args:
        path: JSONL file to read.
        out_file: Prefix of the shard files; ".shard_<k>" is appended to it.
        shards_sz: Maximum number of records per shard file. Defaults to 3000,
            the value that used to be hard-coded.

    Raises:
        ValueError: If ``shards_sz`` is not positive.
    """
    if shards_sz <= 0:
        # A zero step makes range() fail with an unrelated message and a
        # negative one would silently write nothing, so reject both up front.
        raise ValueError("shards_sz must be a positive integer, got {}".format(shards_sz))

    def get_table_string_for_ctx_match(table: dict) -> str:
        """Flatten a table dict into one string, prefixed with its caption.

        Cell values within a row are joined with " . ".
        """
        # NOTE(review): consecutive rows are concatenated with no separator
        # between them and " . " is appended only once, after the last row.
        # This mirrors the existing output format -- confirm it is intended.
        rows_text = "".join(
            " . ".join(cell["value"] for cell in row["columns"]) for row in table["rows"]
        )
        return table["caption"] + " . " + rows_text + " . "

    results = []
    with jsonlines.open(path, mode="r") as jsonl_reader:
        for jline in jsonl_reader:
            positive_ctxs = jline["positive_ctxs"]
            if not positive_ctxs:
                # Nothing to match against for this question.
                continue
            # Only the first positive context is used.
            results.append((jline["question"], get_table_string_for_ctx_match(positive_ctxs[0])))
            if len(results) % 1000 == 0:
                logger.info("results %d", len(results))

    for shard, start in enumerate(range(0, len(results), shards_sz)):
        shard_file = "{}.shard_{}".format(out_file, shard)
        with jsonlines.open(shard_file, mode="w") as writer:
            logger.info("Saving to %s", shard_file)
            chunk = results[start : start + shards_sz]
            # Number records from `start` so ids are global, not per-shard.
            for record_id, (question, context) in enumerate(chunk, start=start):
                writer.write({"id": record_id, "question": question, "context": context})