def build_questions_for

def build_questions_for_db()

in dataset-construction/src/ndb_data/construction/make_questions.py [0:0]
86 lines of code
28 McCabe index (conditional complexity)

def build_questions_for_db(database):
    """Build question tuples (full and truncated context) for one database.

    The raw facts are read from ``database["metadata"]["raw"]``, derivations
    are generated for them and grouped by question id (element ``0`` of each
    derivation).  For every question id, ``generate_positive_question`` is
    asked for a question/answer pair; ids for which it raises ``TypeError``
    or ``AssertionError`` are skipped.

    For each surviving id one entry is always appended, and a second entry
    (answer regenerated using only derivations whose facts all lie strictly
    below a randomly sampled cut-off) is appended when a cut-off can be
    sampled and the regeneration succeeds.

    Every entry is a 7-tuple::

        (qid, question, answer, per_derivation_values, fact_groups,
         fact_ids, height)

    where ``per_derivation_values`` holds ``item[0][1]`` for each grouped
    derivation, ``fact_groups`` is a list of non-empty fact-id lists,
    ``fact_ids`` is the set of ids involved and ``height`` is the sampled
    context size / cut-off.

    Args:
        database: mapping exposing the raw facts under
            ``["metadata"]["raw"]``.

    Returns:
        list: the question tuples, in question-id insertion order.

    Raises:
        AssertionError: if the number of fact groups returned by
            ``generate_positive_question`` does not match the number of
            derivations passed in.
    """
    raw_facts = database["metadata"]["raw"]
    fact_hypotheses = map_triples_to_facts(raw_facts)

    # Every derivation that can be produced from these facts.
    all_derivations = generate_derivations(fact_hypotheses, raw_facts)

    # Lookup tables keyed by question id: the (q/a, height) pairs and the
    # bare heights.
    grouped = defaultdict(list)
    heights_by_qid = defaultdict(list)
    for derivation in all_derivations:
        key, height = derivation[0], derivation[3]
        grouped[key].append((derivation[2], height))
        heights_by_qid[key].append(height)

    results = []
    for qid, entries in grouped.items():
        # The unpacking stays inside the try: a non-iterable return value
        # surfaces as TypeError and must be skipped like any other failure.
        try:
            _, fact_groups, question_text, answer = generate_positive_question(
                qid, entries, heights_by_qid
            )
        except (TypeError, AssertionError):
            continue

        # Copy each group of participating fact ids, dropping empty groups,
        # and collect the ids that take part in this Q/A.
        kept_groups = [
            group for group in (list(g) for g in fact_groups) if group
        ]
        ids = {fact_id for group in kept_groups for fact_id in group}

        # Coin flip: use the whole database as context, or a random size
        # between the largest participating id and the database size.
        # NOTE(review): the lower bound is max(ids) itself, whereas the
        # truncated case below compares with a strict "<"; confirm whether
        # heights are meant to be inclusive or exclusive.
        if random.randint(0, 1):
            question_height = len(raw_facts)
        else:
            question_height = random.choice(range(max(ids), len(raw_facts)))

        assert len(kept_groups) == len(entries)
        results.append(
            (
                qid,
                question_text,
                answer,
                [entry[0][1] for entry in entries],
                kept_groups,
                ids,
                question_height,
            )
        )

        # With several derivations, a cut-off between min(ids) and max(ids)
        # should change the generated answer; with a single one, a cut-off
        # at or below min(ids) should leave no answer at all.
        if len(entries) > 1:
            lowest, highest = min(ids), max(ids)
            start = lowest if lowest != highest else 0
            candidates = range(start, highest)
            if not candidates:
                continue
            max_height = random.choice(candidates)
        else:
            max_height = random.choice(range(min(ids) + 1))

        # Keep only the derivations whose facts all sit below the cut-off.
        truncated_entries = [
            entry
            for entry in entries
            if all(subfact < max_height for subfact in entry[1])
        ]

        try:
            _, truncated_groups, _, truncated_answer = generate_positive_question(
                qid, truncated_entries, heights_by_qid, max_height
            )
        except (TypeError, AssertionError):
            continue

        copied_groups = [list(g) for g in truncated_groups]
        # Ids below the cut-off are recorded even when their group as a
        # whole is rejected below.
        visible_ids = {
            fact_id
            for group in copied_groups
            for fact_id in group
            if fact_id < max_height
        }
        visible_groups = [
            group
            for group in copied_groups
            if group and all(fact_id < max_height for fact_id in group)
        ]

        assert len(visible_groups) == len(truncated_entries), (
            visible_groups,
            truncated_entries,
        )

        results.append(
            (
                qid,
                question_text,
                truncated_answer,
                [entry[0][1] for entry in truncated_entries],
                visible_groups,
                visible_ids,
                max_height,
            )
        )

        # TODO If question is a join then sampling between a joined fact should generate zero output

    return results