def preprocess()

in paper/experiments/viggo/preprocess.py [0:0]


def preprocess(in_folder, out_folder, classification_dir):
    """Convert the ViGGO CSV splits into JSON files with rewritten MRs and distractors.

    For each of ``viggo-test.csv``, ``viggo-train.csv`` and ``viggo-valid.csv``
    found in ``in_folder``, every record:

    * gets a ``new_mr`` field built from the output of ``parse_mr`` applied to
      its ``mr`` column, and
    * has its ``ref`` field replaced by a list holding the distractors returned
      by ``get_distractors`` followed by the original reference text.

    The records are written to ``test.json`` / ``train.json`` /
    ``validation.json`` in ``out_folder``; the classification items returned by
    ``get_distractors`` are handed to ``write_classification_data``. Finally
    ``generate_from_json`` and ``fix_text_in_dir`` are run on ``out_folder``.

    Args:
        in_folder: Directory containing the three ViGGO CSV files. Each file
            must provide at least the ``mr`` and ``ref`` columns.
        out_folder: Directory receiving the JSON files and
            ``special_tokens.txt``; created (with parents) if missing.
        classification_dir: Passed through unchanged to
            ``write_classification_data``.
    """
    in_folder = Path(in_folder)
    out_folder = Path(out_folder)

    out_folder.mkdir(parents=True, exist_ok=True)

    splits = {"viggo-test.csv": "test.json", "viggo-train.csv": "train.json", "viggo-valid.csv": "validation.json"}
    for csv_name, json_name in splits.items():
        df = pd.read_csv(in_folder / csv_name)
        data = df.to_dict(orient="records")
        # Pristine snapshot: `data` is mutated below ("new_mr" is added and
        # "ref" becomes a list), so random texts are drawn from this copy.
        original_data = deepcopy(data)
        # If at least one record differs from the first one, then every record
        # has some record that differs from it, so the rejection loop below is
        # guaranteed to terminate. Evaluates to False for an empty split.
        has_alternative = any(record != original_data[0] for record in original_data)
        classification_data = []

        for item, original_item in zip(data, original_data):
            mr = item["mr"]
            parsed = parse_mr(mr)
            new_params = [
                f"<{key}> {key.replace('_', ' ')}: [ {value} ]" for key, value in zip(parsed["keys"], parsed["values"])
            ]
            # NOTE(review): the ">" right before " )" looks like a typo, but it
            # is kept verbatim because previously generated data (and anything
            # trained on it) may depend on this exact format - confirm before
            # changing.
            new_mr = f"<{parsed['intro']}> {parsed['intro'].replace('_', ' ')} ( {', '.join(new_params)}> )"

            item["new_mr"] = new_mr

            valid_values = [x for x in parsed["values"] if x]
            swapping_candidates = [valid_values]
            cutting_candidates = [valid_values]

            # Draw a record other than the current one. The comparison must use
            # the unmodified copy of the current record: `item` already carries
            # the extra "new_mr" key and would never compare equal to anything
            # in `original_data`, which silently disabled this exclusion.
            rand_item = random.choice(original_data)
            while has_alternative and rand_item == original_item:
                rand_item = random.choice(original_data)
            random_text = rand_item["ref"]

            distractors, classification_items = get_distractors(
                new_mr,
                item["ref"],
                swapping_candidates,
                cutting_candidates,
                random_text,
                num_candidates=10,
                max_per_operation=10,
            )
            classification_data.extend(classification_items)

            item["ref"] = distractors + [item["ref"]]

        # Context manager so the output file is flushed and closed deterministically.
        with open(out_folder / json_name, "w", encoding="utf-8") as out_file:
            json.dump(data, out_file, indent=2)
        write_classification_data(classification_data, classification_dir, json_name.replace(".json", ""))

    generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"new_mr": "amr"})
    fix_text_in_dir(out_folder)