def calc_questions_overlap()

in dpr/data/tables.py [0:0]


def _read_question_set(path):
    """Return the set of questions stored in the file at ``path``.

    A path ending in ``.csv`` is handed to ``parse_qa_csv_file`` and the first
    element of every record it yields is taken as the question. Any other path
    is loaded as a single UTF-8 JSON document whose items each expose a
    ``"question"`` key.
    """
    if path.endswith(".csv"):
        # No "Reading file" message here: the CSV branch never logged one.
        return {qa[0] for qa in parse_qa_csv_file(path)}

    with open(path, "r", encoding="utf-8") as f:
        logger.info("Reading file %s", path)
        return {item["question"] for item in json.load(f)}


def calc_questions_overlap(tables_file, regular_file, dev_file):
    """Log how many questions the tables file shares with the regular files.

    Questions are compared as exact values (set membership); no normalization
    is applied before comparing.

    Args:
        tables_file: Path to a JSON-lines file; the ``"question"`` field of
            every record is collected.
        regular_file: Path to a ``.csv`` or JSON file of questions.
        dev_file: Optional second ``.csv`` or JSON file whose questions are
            merged with those of ``regular_file``. Skipped when falsy.

    Returns:
        None. The two set sizes and the size of their intersection are
        reported through ``logger.info``.
    """
    with jsonlines.open(tables_file, mode="r") as jsonl_reader:
        logger.info("Reading file %s", tables_file)
        tab_questions = {jline["question"] for jline in jsonl_reader}

    reg_questions = _read_question_set(regular_file)
    if dev_file:
        # Dev questions are pooled with the regular ones, not reported apart.
        reg_questions |= _read_question_set(dev_file)

    logger.info("tab_questions %d", len(tab_questions))
    logger.info("reg_questions %d", len(reg_questions))
    logger.info("overlap %d", len(tab_questions & reg_questions))