in tools/scripts/gqa/convert_gqa_to_vqa.py [0:0]
def convert_gqa_to_vqa(gqa_dir, out_dir):
    """Convert a GQA dataset directory into VQA format.

    Assumes the GQA directory is laid out as::

        gqa_dir/
            images/
                images/
                objects/
                spatial/
            questions/
            scenegraphs/

    Args:
        gqa_dir: Root of the GQA dataset (layout above).
        out_dir: Destination directory for the per-split ``imdb_*.npy``
            files and the question/answer vocabulary text files.
    """
    extract_image_features(os.path.join(gqa_dir, "images"), out_dir)

    questions_dir = os.path.join(gqa_dir, "questions")
    merged_train = os.path.join(questions_dir, "train_all_questions.json")
    if os.path.isfile(merged_train):
        print("Using previously generated train_all_questions.json file")
    else:
        # GQA ships the train questions as multiple shard files; merge
        # them once into a single train_all_questions.json.
        merge_train(os.path.join(gqa_dir, "questions", "train_all_questions"))

    _save_imdbs(questions_dir, out_dir)
    _build_vocabularies(questions_dir, out_dir)


def _save_imdbs(questions_dir, out_dir):
    """Convert every question split ("all" and "balanced") to an imdb .npy."""
    split_mapping = {
        "test": "test_all_questions.json",
        "val": "val_all_questions.json",
        "challenge": "challenge_all_questions.json",
        "train": "train_all_questions.json",
    }
    for split, all_filename in split_mapping.items():
        for balance_type in ("balanced", "all"):
            if balance_type == "balanced":
                # Balanced splits follow the same naming scheme with
                # "_balanced" substituted for "_all".
                filename = all_filename.replace("_all", "_balanced")
                csplit = split + "_balanced"
            else:
                filename = all_filename
                csplit = split
            imdb = get_imdb(os.path.join(questions_dir, filename))
            np.save(os.path.join(out_dir, f"imdb_{csplit}.npy"), imdb)


def _build_vocabularies(questions_dir, out_dir):
    """Build and save question-token and answer vocabularies.

    Scans the val/train splits (both balanced and all variants), writing
    ``vocabulary_gqa.txt`` and ``answers_gqa.txt`` into *out_dir*, each
    with ``<unk>`` as the first entry.
    """
    global_answer = Counter()
    global_q = Counter()
    question_len = Counter()
    for split in ("val", "train"):
        for balance_type in ("balanced", "all"):
            questions_json = os.path.join(
                questions_dir, f"{split}_{balance_type}_questions.json"
            )
            # Fix: the original used a bare open() inside json.load and
            # never closed the file handle.
            with open(questions_json) as f:
                questions = json.load(f)
            print(f"Processing split {split}_{balance_type}")
            for q in tqdm.tqdm(questions.values()):
                tokens = tokenize(q["question"])
                global_q.update(tokens)
                global_answer.update([q["answer"].lower()])
                question_len.update([len(tokens)])

    print("N_unique answers :", len(global_answer))
    print("N unique q tokens:", len(global_q))
    # Iterating a Counter yields its keys (the observed lengths).
    print("Min Q length", min(question_len))
    print("Max Q length", max(question_len))
    print("Q length distribution", question_len)

    # Save question vocabulary, sorted, with <unk> at index 0.
    q_vocabulary = ["<unk>"] + sorted(global_q)
    with open(os.path.join(out_dir, "vocabulary_gqa.txt"), "w") as f:
        f.writelines(w + "\n" for w in q_vocabulary)

    # Save answer vocabulary: preprocess, drop empties, dedupe (distinct
    # raw answers may normalize to the same string), and sort.
    normalized = {preprocess_answer(ans).strip() for ans in global_answer}
    answer_list = sorted(t for t in normalized if t)
    if "<unk>" not in answer_list:
        answer_list = ["<unk>"] + answer_list
    with open(os.path.join(out_dir, "answers_gqa.txt"), "w") as fp:
        fp.writelines(w + "\n" for w in answer_list)