in dataset-construction/src/ndb_data/construction/make_questions.py [0:0]
def build_questions_for_db(database):
    """Build positive and height-truncated question entries for one database.

    The facts are read from ``database["metadata"]["raw"]``, passed through
    ``map_triples_to_facts`` and expanded with ``generate_derivations``. The
    derivations are grouped by their first element (used as the question id).
    For every group one entry is generated from all of its derivations, and,
    when a truncation height can be sampled, a second entry is generated from
    only those derivations whose facts all lie below that height.

    Groups for which ``generate_positive_question`` raises ``TypeError`` or
    ``AssertionError`` are skipped.

    Returns:
        A list of 7-tuples: question id, generated question, generated
        answer, the ``[0][1]`` element of each derivation entry used, the
        non-empty fact groups, the set of fact ids involved, and the height
        chosen for the entry.
    """
    facts = database["metadata"]["raw"]
    hypotheses_facts = map_triples_to_facts(facts)
    derivations = generate_derivations(hypotheses_facts, facts)

    # Index every derivation by question id: one table with the (q/a, facts)
    # pairs and one with just the fact heights.
    by_qid = defaultdict(list)
    q_heights = defaultdict(list)
    for derivation in derivations:
        key = derivation[0]
        by_qid[key].append((derivation[2], derivation[3]))
        q_heights[key].append(derivation[3])

    final_questions = []
    for qid, qs in by_qid.items():
        try:
            _, qh, generated_question, generated_answer = generate_positive_question(
                qid, qs, q_heights
            )
        except (TypeError, AssertionError):
            continue

        # Collect the ids of all facts taking part in this question, keeping
        # only the non-empty fact groups.
        ids = set()
        qh1_filtered = []
        for fact_group in qh:
            members = list(fact_group)
            ids.update(members)
            if members:
                qh1_filtered.append(members)

        # Coin flip: use the whole database as context, or a random height
        # from the largest participating fact id up to the end.
        if random.randint(0, 1):
            question_height = len(facts)
        else:
            question_height = random.choice(range(max(ids), len(facts)))

        assert len(qh1_filtered) == len(qs)
        final_questions.append(
            (
                qid,
                generated_question,
                generated_answer,
                [entry[0][1] for entry in qs],
                qh1_filtered,
                ids,
                question_height,
            )
        )

        # Sample a truncation height. With several derivations, a height
        # between the smallest and largest fact id should change the answer;
        # with a single one, a height at or below its smallest id is drawn.
        lowest, highest = min(ids), max(ids)
        if len(qs) > 1:
            start = 0 if lowest == highest else lowest
            sample_choices = range(start, highest)
            if not sample_choices:
                continue
            max_height = random.choice(sample_choices)
        else:
            max_height = random.choice(range(0, lowest + 1))

        # Regenerate the answer from only the derivations whose facts all sit
        # strictly below the sampled height.
        qs_negative = [
            entry
            for entry in qs
            if all(fact_id < max_height for fact_id in entry[1])
        ]
        try:
            _, qh2, _, negative_generated_answer = generate_positive_question(
                qid, qs_negative, q_heights, max_height
            )
        except (TypeError, AssertionError):
            continue

        # NOTE(review): the indentation of the original loop was lost. This
        # assumes every fact is gathered into its group, only ids below
        # max_height are recorded, and a group is kept only when all of its
        # facts are below max_height -- confirm against the original file.
        qh2_filtered = []
        new_ids = set()
        for fact_group in qh2:
            members = list(fact_group)
            new_ids.update(
                fact_id for fact_id in members if fact_id < max_height
            )
            if members and all(fact_id < max_height for fact_id in members):
                qh2_filtered.append(members)

        assert len(qh2_filtered) == len(qs_negative), (qh2_filtered, qs_negative)
        final_questions.append(
            (
                qid,
                generated_question,
                negative_generated_answer,
                [entry[0][1] for entry in qs_negative],
                qh2_filtered,
                new_ids,
                max_height,
            )
        )

        # TODO If question is a join then sampling between a joined fact
        # should generate zero output

    return final_questions