def combine_corpus()

in mdr/retrieval/data/data_utils.py
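
This utility merges two retrieval corpora into a single JSONL file: the HotpotQA abstract collection and the 100-word DPR Wikipedia splits (psgs_w100.tsv). Passages are grouped by article title and trailing periods are stripped (hence the merged_no_period output name). Every record carries an "intro" flag marking an article's first passage; titles present in both corpora keep the HotpotQA abstract alongside their DPR splits, while Hotpot-only titles contribute just their abstract.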


import collections
import csv
import json

from tqdm import tqdm


def combine_corpus():
    # HotpotQA abstracts: one JSON record per line with "title" and "text".
    with open("/private/home/xwhan/data/hotpot/tfidf/abstracts.txt") as f:
        hotpot_abstracts = [json.loads(l) for l in f]
    hotpot_title2doc = {doc["title"]: doc["text"] for doc in hotpot_abstracts}

    # DPR Wikipedia splits: TSV rows of (id, text, title); group the
    # 100-word passages by article title, skipping the header row.
    nq_title2docs = collections.defaultdict(list)
    dpr_count = 0
    with open("/private/home/xwhan/code/DPR/data/wikipedia_split/psgs_w100.tsv") as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for row in reader:
            if row[0] != 'id':
                id_, text, title = row[0], row[1], row[2]
                dpr_count += 1
                nq_title2docs[title].append(text)

    merged = []
    # Titles in both corpora get the HotpotQA abstract plus every DPR split.
    for title, passages in tqdm(nq_title2docs.items()):
        if title in hotpot_title2doc:
            # Keep the HotpotQA abstract as an introductory passage,
            # dropping a trailing period to match the DPR splits below.
            abstract = hotpot_title2doc[title].strip()
            merged.append({
                "title": title,
                "text": abstract[:-1] if abstract.endswith(".") else abstract,
                "intro": True
            })

        # Add every DPR split; only an article's first split is its intro.
        for idx, p in enumerate(passages):
            p = p.strip()
            merged.append({
                "title": title,
                "text": p[:-1] if p.endswith(".") else p,
                "intro": idx == 0
            })

    # Titles that appear only in HotpotQA contribute just their abstract.
    for title, doc in hotpot_title2doc.items():
        if title not in nq_title2docs:
            if doc.endswith("."):
                doc = doc[:-1]
            merged.append({
                "title": title,
                "text": doc,
                "intro": True
            })

    print(f"Read {dpr_count} DPR passages; merged corpus size {len(merged)}")
    # Write the merged corpus as JSONL, one passage record per line.
    with open("/private/home/xwhan/data/combined/corpus/merged_no_period.txt", "w") as g:
        for item in merged:
            g.write(json.dumps(item) + "\n")
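
A minimal sketch of reading the merged corpus back, assuming the output path written above; the field names match the records emitted by combine_corpus():

import json

# Each line of the merged file is one passage record.
corpus_path = "/private/home/xwhan/data/combined/corpus/merged_no_period.txt"
with open(corpus_path) as f:
    corpus = [json.loads(line) for line in f]

# "intro" is True for an article's first passage (or its HotpotQA abstract).
intro_passages = [p for p in corpus if p["intro"]]
print(f"{len(intro_passages)} intro passages out of {len(corpus)} total")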