in dpr/data/tables.py [0:0]
def _read_question_set(path):
    """Return the set of distinct questions found in a single QA file.

    Args:
        path: Path string of the file to read. A name ending in ".csv" is
            handed to ``parse_qa_csv_file`` and the first element of every
            row it yields is taken as the question. Any other name is read
            as UTF-8 JSON whose top-level value is iterated, taking the
            "question" entry of each item.

    Returns:
        A ``set`` with one entry per distinct question.
    """
    if path.endswith(".csv"):
        return {qa[0] for qa in parse_qa_csv_file(path)}

    with open(path, "r", encoding="utf-8") as f:
        logger.info("Reading file %s", path)
        data = json.load(f)
    return {item["question"] for item in data}


def calc_questions_overlap(tables_file, regular_file, dev_file):
    """Log how many table questions also occur in the regular QA files.

    Three INFO messages are emitted: the number of distinct table questions,
    the number of distinct regular questions (``regular_file`` and
    ``dev_file`` combined), and the size of their intersection. Questions are
    compared by exact equality; no normalisation is applied.

    Args:
        tables_file: Path of a JSON Lines file; every record must provide a
            "question" entry.
        regular_file: Path of a QA file, either ".csv" or JSON (see
            ``_read_question_set`` for the accepted layouts).
        dev_file: Optional second QA file in the same formats as
            ``regular_file``. A falsy value (``None``, ``""``) skips it.

    Returns:
        None. The counts are only written to the log.
    """
    with jsonlines.open(tables_file, mode="r") as jsonl_reader:
        logger.info("Reading file %s", tables_file)
        tab_questions = {jline["question"] for jline in jsonl_reader}

    # Both regular files feed one pool, so a question counts once even when
    # it appears in both of them.
    reg_questions = _read_question_set(regular_file)
    if dev_file:
        reg_questions |= _read_question_set(dev_file)

    logger.info("tab_questions %d", len(tab_questions))
    logger.info("reg_questions %d", len(reg_questions))
    logger.info("overlap %d", len(tab_questions & reg_questions))