# convert_train_jsonl_to_ctxmatch()
#
# in dpr/data/tables.py [0:0]


def convert_train_jsonl_to_ctxmatch(path: str, out_file: str):
    """Convert a training JSONL file into sharded (question, context) JSONL files.

    For every input record that has at least one entry in "positive_ctxs", the
    record's "question" is paired with the first positive context flattened
    into a single string. The pairs are then written to files named
    ``<out_file>.shard_<k>`` (k = 0, 1, ...), each holding at most 3000 entries
    of the form ``{"id": ..., "question": ..., "context": ...}``, where "id" is
    the pair's index across all shards.

    Args:
        path: JSONL file to read.
        out_file: prefix of the shard files to write.
    """

    def get_table_string_for_ctx_match(table: dict) -> str:
        """Flatten a table dict into "<caption> . <cell> . <cell> . ... . "."""
        row_texts = [" . ".join(c["value"] for c in r["columns"]) for r in table["rows"]]
        # Rows are joined with the same " . " separator used between cells, so the
        # last cell of one row is never fused with the first cell of the next row.
        return table["caption"] + " . " + " . ".join(row_texts) + " . "

    results = []
    with jsonlines.open(path, mode="r") as jsonl_reader:
        for jline in jsonl_reader:
            positive_ctxs = jline["positive_ctxs"]
            if not positive_ctxs:
                # Nothing to match this question against.
                continue
            # Only the first positive context is used.
            table_str = get_table_string_for_ctx_match(positive_ctxs[0])
            results.append((jline["question"], table_str))

            if len(results) % 1000 == 0:
                logger.info("results %d", len(results))

    shards_sz = 3000

    for shard, start in enumerate(range(0, len(results), shards_sz)):
        shard_file = out_file + ".shard_{}".format(shard)
        with jsonlines.open(shard_file, mode="w") as writer:
            logger.info("Saving to %s", shard_file)
            # Ids keep counting across shards: they are offsets into `results`.
            for pair_id, (question, context) in enumerate(results[start : start + shards_sz], start=start):
                writer.write({"id": pair_id, "question": question, "context": context})