in mdr/retrieval/data/data_utils.py [0:0]
import collections
import csv
import json

from tqdm import tqdm


def combine_corpus():
    # Load the HotpotQA abstracts (one JSON object per line) and index them by title.
    hotpot_abstracts = [json.loads(l) for l in open("/private/home/xwhan/data/hotpot/tfidf/abstracts.txt").readlines()]
    hotpot_title2doc = {doc["title"]: doc["text"] for doc in hotpot_abstracts}

    # Group the DPR 100-word Wikipedia splits by page title.
    nq_title2docs = collections.defaultdict(list)
    dpr_count = 0
    with open("/private/home/xwhan/code/DPR/data/wikipedia_split/psgs_w100.tsv") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if row[0] != 'id':  # skip the header row
                id_, text, title = row[0], row[1], row[2]
                dpr_count += 1
                nq_title2docs[title].append(text)
    # Merge: for titles covered by HotpotQA, emit the abstract first, then all DPR splits.
    merged = []
    for title, passages in tqdm(nq_title2docs.items()):
        if title in hotpot_title2doc:
            # Use the HotpotQA abstract as an introductory passage for this title;
            # note the first DPR split is also flagged as intro below.
            abstract = hotpot_title2doc[title].strip()
            merged.append({
                "title": title,
                "text": abstract[:-1] if abstract.endswith(".") else abstract,
                "intro": True
            })
        for idx, p in enumerate(passages):
            p = p.strip()
            merged.append({
                "title": title,
                "text": p[:-1] if p.endswith(".") else p,
                "intro": idx == 0
            })
    # Add titles that only appear in the HotpotQA abstracts.
    for title, doc in hotpot_title2doc.items():
        if title not in nq_title2docs:
            if doc.endswith("."):
                doc = doc[:-1]
            merged.append({
                "title": title,
                "text": doc,
                "intro": True
            })

    print(f"Merged corpus size {len(merged)}")
    # Write the merged corpus, one JSON object per line; trailing periods have
    # been stripped, hence the "no_period" suffix in the output filename.
    with open("/private/home/xwhan/data/combined/corpus/merged_no_period.txt", "w") as g:
        for item in merged:
            g.write(json.dumps(item) + "\n")
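
A minimal usage sketch, assuming the hardcoded /private/home/xwhan/... input files exist on the local machine; the __main__ guard is illustrative and not part of the original module.

if __name__ == "__main__":
    # Hypothetical entry point: merge the HotpotQA abstracts with the DPR
    # Wikipedia splits and write the combined corpus as JSON lines.
    combine_corpus()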