def parse()

in paper/experiments/webnlg/preprocess.py


import xml.etree.ElementTree as ET
from pathlib import Path


def parse(in_file, classification_data, num_candidates=5, max_per_operation=2):
    """Parse the given file and update `classification_data` with the parsed data"""

    tree = ET.parse(in_file)
    root = tree.getroot()

    entries = list(root.find("entries"))
    items = []
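    # Walk every <entry>; the index e is used further down to pull text from
    # a nearby entry as a source of unrelated distractor material.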
    for e, entry in enumerate(entries):

        # Collect both <mtriple> and <otriple> children of <modifiedtripleset>.
        mtripleset = entry.find("modifiedtripleset")
        tripletsets = list(mtripleset.findall("mtriple")) + list(mtripleset.findall("otriple"))
        tripletsets = [process_tripleset(x) for x in tripletsets]

        modifiedtripleset = sorted(x["text"] for x in tripletsets)

        # Keep the raw "subject | predicate | object" strings joined by " ||| ";
        # modtripleset itself is not used in the returned items.
        modtripleset = []
        raw_tripleset = ""
        for mtriple in mtripleset:
            e1, pred, e2 = mtriple.text.split(" | ")
            raw_tripleset += mtriple.text + " ||| "
            modtripleset.append(Triple(cleanup(e1), pred, cleanup(e2)))

        all_lex = entry.findall("lex")
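        # Each <lex> holds one human-written verbalisation of the triple set.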
        for lex in all_lex:

            # Flatten the sentence-level triple ordering annotation into one string.
            sortedtripleset = ""
            for sent in lex.find("sortedtripleset").findall("sentence"):
                for x in sent.findall("striple"):
                    sortedtripleset += process_tripleset(x)["text"] + ", "

            # cleanup is called with plain strings elsewhere, so pass the
            # element text rather than the Element itself.
            references_elem = lex.find("references")
            if references_elem is not None and references_elem.text:
                references = cleanup(references_elem.text)
            else:
                references = ""

            # Skip lex entries without usable <text> or <template> content.
            text_elem = lex.find("text")
            if text_elem is None or not text_elem.text:
                print("skipping lex entry: missing or empty <text>")
                continue
            text = text_elem.text

            template_elem = lex.find("template")
            if template_elem is None or not template_elem.text:
                print("skipping lex entry: missing or empty <template>")
                continue
            template = cleanup(template_elem.text)

            # Entity strings drawn from the triples, used to corrupt the
            # reference text; predicates are extracted but unused below.
            subjects = [x["dict"]["subject"] for x in tripletsets]
            objects = [x["dict"]["object"] for x in tripletsets]
            predicates = [x["dict"]["predicate"] for x in tripletsets]

            # Each candidate pool is wrapped in an outer list, matching the
            # shape get_distractors appears to expect.
            swapping_candidates = [subjects + objects]
            cutting_candidates = [subjects + objects]

            # Text borrowed from a nearby entry serves as an unrelated
            # "random" distractor source.
            random_text = get_nearby_text(entries, e)

            tripletset_str = " ; ".join(modifiedtripleset)

            # Generate corrupted variants of the reference text, plus the
            # labelled examples that feed classification_data.
            distractors, classification_items = get_distractors(
                tripletset_str,
                text,
                swapping_candidates,
                cutting_candidates,
                random_text,
                num_candidates=num_candidates,
                max_per_operation=max_per_operation,
            )

            classification_data.extend(classification_items)

            # The directory layout encodes the triple count in the parent
            # directory and the category in the file name (e.g.
            # 1triples/Airport.xml); the gold text goes last, after the
            # distractors.
            item = {
                "raw_modifiedtripleset": raw_tripleset,
                "modifiedtripleset": " ; ".join(modifiedtripleset),
                "sortedtripleset": sortedtripleset,
                "references": references,
                "template": template,
                "text": distractors + [text],
                "num_triples": Path(in_file).parent.name,
                "category": Path(in_file).name,
                "category_type": "seen" if Path(in_file).name in seen_categories else "unseen",
            }
            items.append(item)

    return items
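
The call sites above pin down the shape process_tripleset must return: a flat "text" string plus a "dict" of subject/predicate/object fields. A minimal sketch of that assumed shape (the real helper lives elsewhere in preprocess.py and may normalise the strings differently):

def process_tripleset(triple_elem):
    # Hypothetical reconstruction inferred from the call sites above;
    # not the actual helper from preprocess.py.
    subject, predicate, obj = triple_elem.text.split(" | ")
    return {
        "text": triple_elem.text,
        "dict": {"subject": subject, "predicate": predicate, "object": obj},
    }

And a minimal driver sketch, assuming the enriched WebNLG layout of <split>/<N>triples/<Category>.xml files; the data root and the summary print are illustrative only:

from pathlib import Path

classification_data = []
items = []
for xml_file in sorted(Path("webnlg/train").glob("*/*.xml")):  # assumed data root
    items.extend(parse(str(xml_file), classification_data))
print(f"parsed {len(items)} lex items, {len(classification_data)} classification examples")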