in dataset-construction/src/ndb_data/construction/make_questions.py [0:0]
def generate_derivations(hypotheses_facts, facts):
    """Build question/answer derivations for every templated triple.

    For each ``(subject, relation, object)`` key of ``hypotheses_facts`` whose
    relation has an entry in ``final_templates``, one template is sampled per
    question type (every template key except ``"fact"``, ``"_subject"`` and
    ``"_object"``). The ``$s`` / ``$o`` placeholders are filled with the
    names looked up through ``wiki`` and the result is recorded together with
    the facts stored under that triple.

    Args:
        hypotheses_facts: Mapping keyed by ``(s, r, o)`` triples. The value
            stored under a triple is attached unchanged to its derivations,
            and the whole mapping is forwarded to ``generate_joins`` and
            ``generate_negative_bool``.
        facts: Forwarded untouched to ``generate_joins``.

    Returns:
        A list holding, per generated question, the tuple
        ``(qid, q_type, filled_template, hypotheses_facts[triple])``,
        followed by everything ``generate_joins`` produced for it and, for
        ``"bool"`` questions, the result of ``generate_negative_bool`` when
        that is not ``None``.

    Generation is best effort: an exception raised while handling a single
    question type is logged at debug level and that question type is skipped.
    Anything appended before the failure is kept.
    """
    import logging

    logger = logging.getLogger(__name__)
    derivations = []

    # For each S,R,O triple in the DB, generate a question/answer derivation.
    for hyp in tqdm(hypotheses_facts):
        s, r, o = hyp

        # Skip relations that we have no templates for.
        if r not in final_templates:
            continue
        templates = final_templates[r]

        # Get the canonical subject/object names. Objects that do not start
        # with "Q" are used verbatim instead of being looked up.
        subject_name = wiki.get_by_id_or_uri(s)["english_name"]
        object_name = (
            o if not o.startswith("Q") else wiki.get_by_id_or_uri(o)["english_name"]
        )

        question_types = set(templates).difference({"fact", "_subject", "_object"})

        # For all question types (bool, set, min, max etc.)
        for q_type in question_types:
            try:
                # Sample a question template (a sequence of strings).
                question = random.choice(templates[q_type])

                # Swap the subject and object names into every template part.
                out = [
                    q.replace("$s", subject_name).replace("$o", object_name)
                    for q in question
                ]

                # Make the question ID: it records which of the subject and
                # object appear in the first template part.
                subj_in_q = f"_{s}" if "$s" in question[0] else ""
                obj_in_q = f"_{o}" if "$o" in question[0] else ""

                # The second template part is inspected for every relation,
                # including P47, so a template lacking it is skipped either way.
                has_placeholder = "$" in question[1]
                s_key = (
                    question[1].split("[SEP]")[0].strip() if has_placeholder else ""
                )
                if r == "P47":
                    # P47 always uses this fixed sort-key suffix.
                    sort_key = "_$both"
                else:
                    sort_key = f"_{s_key}" if has_placeholder else ""

                qid = f"{q_type}_{r}{subj_in_q}{obj_in_q}{sort_key}"

                # Add this to the derivations.
                derivations.append((qid, q_type, out, hypotheses_facts[hyp]))

                # Materialise the joins first so a failure part-way through
                # does not leave a partial set of joins in the output.
                joins = list(
                    generate_joins(hypotheses_facts, facts, qid, question, s, r, o)
                )
                derivations.extend(joins)

                # If the question is boolean, generate a negatively sampled
                # false one for it.
                if q_type == "bool":
                    generated = generate_negative_bool(hypotheses_facts, question, hyp)
                    if generated is not None:
                        derivations.append(generated)
            except Exception:
                # Best effort: skip this question type, but leave a trace so
                # that broken templates or helpers can be diagnosed.
                logger.debug(
                    "Skipping %r question for triple %r",
                    q_type,
                    hyp,
                    exc_info=True,
                )

    return derivations