in paper/experiments/viggo/preprocess.py [0:0]
def _linearize_mr(parsed):
    """Render a parsed meaning representation as the tagged ``new_mr`` string.

    ``parsed`` is the mapping returned by ``parse_mr``; only its ``"intro"``,
    ``"keys"`` and ``"values"`` entries are read here.
    """
    slots = [
        f"<{key}> {key.replace('_', ' ')}: [ {value} ]"
        for key, value in zip(parsed["keys"], parsed["values"])
    ]
    intro = parsed["intro"]
    # NOTE(review): the ">" right before the closing " )" looks like a typo, but
    # it is part of the emitted data format, so it is reproduced unchanged.
    return f"<{intro}> {intro.replace('_', ' ')} ( {', '.join(slots)}> )"


def _pick_other_record(records, current, can_differ):
    """Return a random element of ``records``, avoiding ones equal to ``current``.

    When ``can_differ`` is false (no record differs from ``current``), the first
    draw is returned as-is so the resampling loop cannot spin forever.
    """
    choice = random.choice(records)
    while can_differ and choice == current:
        choice = random.choice(records)
    return choice


def preprocess(in_folder, out_folder, classification_dir):
    """Convert the ViGGO CSV splits into JSON files with distractor references.

    For each of ``viggo-test.csv``, ``viggo-train.csv`` and ``viggo-valid.csv``
    found in ``in_folder``, every row gets a ``new_mr`` field (the tagged,
    linearized form of its ``mr`` column) and its ``ref`` field is replaced by
    a list made of the distractors returned by ``get_distractors`` followed by
    the original reference. The rows are written to ``test.json``,
    ``train.json`` and ``validation.json`` in ``out_folder``, and the
    classification items collected for the split are handed to
    ``write_classification_data``. Finally ``generate_from_json`` and
    ``fix_text_in_dir`` are run on ``out_folder``.

    Args:
        in_folder: Directory containing the three ViGGO CSV files.
        out_folder: Directory for the JSON output; created if missing.
        classification_dir: Passed through to ``write_classification_data``.
    """
    in_folder = Path(in_folder)
    out_folder = Path(out_folder)
    out_folder.mkdir(parents=True, exist_ok=True)
    splits = {"viggo-test.csv": "test.json", "viggo-train.csv": "train.json", "viggo-valid.csv": "validation.json"}
    for csv_name, json_name in splits.items():
        df = pd.read_csv(in_folder / csv_name)
        data = df.to_dict(orient="records")
        # Pristine snapshot: the dicts in `data` are mutated below ("new_mr" is
        # added, "ref" becomes a list), so random texts are drawn from here.
        original_data = deepcopy(data)
        # If any record differs from the first one, then every record has at
        # least one record that differs from it, so resampling terminates.
        can_differ = any(record != original_data[0] for record in original_data)
        classification_data = []
        for original_item, item in zip(original_data, data):
            parsed = parse_mr(item["mr"])
            new_mr = _linearize_mr(parsed)
            item["new_mr"] = new_mr
            valid_values = [x for x in parsed["values"] if x]
            swapping_candidates = [valid_values]
            cutting_candidates = [valid_values]
            # Compare against the untouched snapshot of this row: `item` already
            # carries "new_mr", so comparing with it could never detect that the
            # row itself was drawn.
            rand_item = _pick_other_record(original_data, original_item, can_differ)
            random_text = rand_item["ref"]
            distractors, classification_items = get_distractors(
                new_mr,
                item["ref"],
                swapping_candidates,
                cutting_candidates,
                random_text,
                num_candidates=10,
                max_per_operation=10,
            )
            classification_data.extend(classification_items)
            item["ref"] = distractors + [item["ref"]]
        # Context manager so the output file is flushed and closed promptly.
        with open(out_folder / json_name, "w") as handle:
            json.dump(data, handle, indent=2)
        write_classification_data(classification_data, classification_dir, json_name.replace(".json", ""))
    generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"})
    fix_text_in_dir(out_folder)