in dpr/data/reader_data.py [0:0]
import json
import logging
from typing import Dict, Tuple

logger = logging.getLogger(__name__)

# Note: ReaderPassage is defined earlier in this module.

def _get_gold_ctx_dict(file: str) -> Tuple[Dict[str, ReaderPassage], Dict[str, str]]:
    gold_passage_infos = {}  # question|question_tokens -> ReaderPassage (with title and gold ctx)

    # The original NQ dataset has two forms of the same question: original and tokenized.
    # The tokenized form is not fully consistent with the original question when tokenized
    # by some encoder tokenizers; specifically, this is the case for the BERT tokenizer.
    # Depending on which form was used for retriever training and results generation, it may be
    # useful to convert all questions to the canonical original representation.
    original_questions = {}  # question from tokens -> original question (NQ only)

    with open(file, "r", encoding="utf-8") as f:
        logger.info("Reading file %s", file)
        data = json.load(f)["data"]

    for sample in data:
        question = sample["question"]
        question_from_tokens = sample.get("question_tokens", question)
        original_questions[question_from_tokens] = question
        title = sample["title"].lower()
        context = sample["context"]  # Note: the context is cased
        rp = ReaderPassage(sample["example_id"], text=context, title=title)

        if question in gold_passage_infos:
            # Log both versions of the gold info so conflicting annotations are visible.
            logger.info("Duplicate question %s", question)
            rp_exist = gold_passage_infos[question]
            logger.info(
                "Duplicate question gold info: new title=%s | old title=%s",
                title,
                rp_exist.title,
            )
            logger.info("Duplicate question gold info: new ctx=%s", context)
            logger.info("Duplicate question gold info: old ctx=%s", rp_exist.passage_text)

        # Index the gold passage under both question forms so lookups succeed
        # regardless of which representation the retriever produced.
        gold_passage_infos[question] = rp
        gold_passage_infos[question_from_tokens] = rp

    return gold_passage_infos, original_questions
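
A minimal usage sketch follows. The input schema is inferred from the fields the loop reads ("example_id", "question", "question_tokens", "title", "context"); the sample values and the temp-file setup are hypothetical, and importing the module-private _get_gold_ctx_dict is done here purely for illustration.

import json
import tempfile

from dpr.data.reader_data import _get_gold_ctx_dict

# Build a tiny gold-passages file with the fields the function expects.
# All values below are made up for this sketch.
sample_data = {
    "data": [
        {
            "example_id": 1,
            "question": "Who wrote Hamlet?",
            "question_tokens": "who wrote hamlet ?",
            "title": "Hamlet",
            "context": "Hamlet is a tragedy written by William Shakespeare.",
        }
    ]
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
    json.dump(sample_data, tmp)
    gold_file = tmp.name

gold_infos, original_questions = _get_gold_ctx_dict(gold_file)

# Both the original and the tokenized question index the same gold passage,
# and the tokenized form can be mapped back to the canonical original one.
assert gold_infos["Who wrote Hamlet?"] is gold_infos["who wrote hamlet ?"]
assert original_questions["who wrote hamlet ?"] == "Who wrote Hamlet?"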