in tools/scripts/gqa/convert_gqa_to_vqa.py [0:0]
def convert_gqa_to_vqa(gqa_dir, out_dir):
    """Convert a GQA dataset directory into VQA format.

    Assumes the GQA directory is laid out as::

        gqa_dir/
            images/
                images/
                objects/
                spatial/
            questions/
            scenegraphs/

    Args:
        gqa_dir: Root of the GQA dataset (layout above).
        out_dir: Destination directory for the per-split ``imdb_*.npy``
            files and the question/answer vocabulary text files.
    """
    extract_image_features(os.path.join(gqa_dir, "images"), out_dir)

    questions_dir = os.path.join(gqa_dir, "questions")
    merged_train = os.path.join(questions_dir, "train_all_questions.json")
    if os.path.isfile(merged_train):
        print("Using previously generated train_all_questions.json file")
    else:
        # GQA ships the train questions as multiple shard files; merge
        # them once into a single train_all_questions.json.
        merge_train(os.path.join(gqa_dir, "questions", "train_all_questions"))

    _save_imdbs(questions_dir, out_dir)
    _build_vocabularies(questions_dir, out_dir)


def _save_imdbs(questions_dir, out_dir):
    """Convert every question split ("all" and "balanced") to an imdb .npy."""
    split_mapping = {
        "test": "test_all_questions.json",
        "val": "val_all_questions.json",
        "challenge": "challenge_all_questions.json",
        "train": "train_all_questions.json",
    }
    for split, all_filename in split_mapping.items():
        for balance_type in ("balanced", "all"):
            if balance_type == "balanced":
                # Balanced splits follow the same naming scheme with
                # "_balanced" substituted for "_all".
                filename = all_filename.replace("_all", "_balanced")
                csplit = split + "_balanced"
            else:
                filename = all_filename
                csplit = split
            imdb = get_imdb(os.path.join(questions_dir, filename))
            np.save(os.path.join(out_dir, f"imdb_{csplit}.npy"), imdb)


def _build_vocabularies(questions_dir, out_dir):
    """Build and save question-token and answer vocabularies.

    Scans the val/train splits (both balanced and all variants), writing
    ``vocabulary_gqa.txt`` and ``answers_gqa.txt`` into *out_dir*, each
    with ``<unk>`` as the first entry.
    """
    global_answer = Counter()
    global_q = Counter()
    question_len = Counter()
    for split in ("val", "train"):
        for balance_type in ("balanced", "all"):
            questions_json = os.path.join(
                questions_dir, f"{split}_{balance_type}_questions.json"
            )
            # Fix: the original used a bare open() inside json.load and
            # never closed the file handle.
            with open(questions_json) as f:
                questions = json.load(f)
            print(f"Processing split {split}_{balance_type}")
            for q in tqdm.tqdm(questions.values()):
                tokens = tokenize(q["question"])
                global_q.update(tokens)
                global_answer.update([q["answer"].lower()])
                question_len.update([len(tokens)])

    print("N_unique answers :", len(global_answer))
    print("N unique q tokens:", len(global_q))
    # Iterating a Counter yields its keys (the observed lengths).
    print("Min Q length", min(question_len))
    print("Max Q length", max(question_len))
    print("Q length distribution", question_len)

    # Save question vocabulary, sorted, with <unk> at index 0.
    q_vocabulary = ["<unk>"] + sorted(global_q)
    with open(os.path.join(out_dir, "vocabulary_gqa.txt"), "w") as f:
        f.writelines(w + "\n" for w in q_vocabulary)

    # Save answer vocabulary: preprocess, drop empties, dedupe (distinct
    # raw answers may normalize to the same string), and sort.
    normalized = {preprocess_answer(ans).strip() for ans in global_answer}
    answer_list = sorted(t for t in normalized if t)
    if "<unk>" not in answer_list:
        answer_list = ["<unk>"] + answer_list
    with open(os.path.join(out_dir, "answers_gqa.txt"), "w") as fp:
        fp.writelines(w + "\n" for w in answer_list)