def preprocess()

in paper/experiments/ldc/preprocess.py [0:0]


def preprocess(in_folder, out_folder, classification_dir, num_candidates=10, max_per_operation=10):
    """Linearize the data already processed into surface texts and AMRs into our format.

    For each of the ``test``, ``dev`` and ``train`` splits this reads
    ``<in_folder>/<split>/nodes.pp.txt``, ``<in_folder>/<split>/surface.pp.txt``
    and ``<in_folder>/../tmp_amr/<split>/graphs.txt``, pairs their lines up
    positionally and drops every pair whose AMR or surface line is empty.
    Each remaining item is passed to ``get_distractors`` together with the
    surface text of a different, randomly chosen item of the same split.
    The item's ``answer_text`` then becomes a list: the returned distractors
    followed by the original surface text as the last element.

    The items of a split are written to ``<out_folder>/<name>.json`` where
    ``<name>`` is ``test``, ``validation`` (for ``dev``) or ``train``; the
    classification items are handed to ``write_classification_data``. Once all
    splits are written, ``generate_from_json`` and ``fix_text_in_dir`` are run
    on ``out_folder``.

    Args:
        in_folder: Directory holding one sub-directory per split.
        out_folder: Directory the JSON files are written to; created
            (including parents) if it does not exist.
        classification_dir: Forwarded unchanged to ``write_classification_data``.
        num_candidates: Forwarded unchanged to ``get_distractors``.
        max_per_operation: Forwarded unchanged to ``get_distractors``.

    Raises:
        ValueError: If a non-empty split does not contain two distinct items,
            because no "other" item could ever be drawn for the random text.
    """
    # Input split name -> name used for the output files.
    splits = {"test": "test", "dev": "validation", "train": "train"}

    in_folder = Path(in_folder)
    out_folder = Path(out_folder)
    out_folder.mkdir(parents=True, exist_ok=True)

    for split, out_name in splits.items():
        amrs = (in_folder / split / "nodes.pp.txt").read_text().split("\n")
        surfaces = (in_folder / split / "surface.pp.txt").read_text().split("\n")
        raw_amrs = (in_folder / ".." / "tmp_amr" / split / "graphs.txt").read_text().split("\n")
        # NOTE(review): zip() stops at the shortest file, so the three files are
        # assumed to be line-aligned - confirm against the upstream pipeline.
        items = [
            {"linearized_amr": amr, "answer_text": surface, "raw_amr": raw_amr}
            for amr, surface, raw_amr in zip(amrs, surfaces, raw_amrs)
            if amr and surface
        ]

        # Snapshot taken before the loop below turns each "answer_text" into a
        # list, so random texts are always drawn from the original strings.
        original_items = deepcopy(items)

        # The sampling loop below retries until it draws an item different from
        # the current one. That never terminates when every item is identical
        # (which includes the single-item case), so fail loudly up front.
        if items and all(other == original_items[0] for other in original_items[1:]):
            raise ValueError(
                f"Split {split!r} needs at least two distinct items to pick a random "
                f"text from another item, but its {len(items)} item(s) are all identical."
            )

        classification_data = []
        for item in items:
            entities = get_entities(item["linearized_amr"])

            swapping_candidates = [entities]
            cutting_candidates = [entities]

            # Rejection sampling: redraw until the pick differs from the current item.
            rand_item = random.choice(original_items)
            while rand_item == item:
                rand_item = random.choice(original_items)

            random_text = rand_item["answer_text"]

            distractors, classification_items = get_distractors(
                item["linearized_amr"],
                item["answer_text"],
                swapping_candidates,
                cutting_candidates,
                random_text,
                num_candidates=num_candidates,
                max_per_operation=max_per_operation,
            )
            classification_data.extend(classification_items)

            # The original surface text is kept as the last entry of the list.
            item["answer_text"] = distractors + [item["answer_text"]]

        # Close the file explicitly so the data is flushed before the
        # post-processing steps at the end of this function run.
        with open(out_folder / (out_name + ".json"), "w") as out_file:
            json.dump(items, out_file, indent=2)
        write_classification_data(classification_data, classification_dir, out_name)

    generate_from_json(out_folder, out_folder / "special_tokens.txt", fields={"linearized_amr": "amr"})
    fix_text_in_dir(out_folder)