def preprocess()

in preprocess/utils.py [0:0]


def preprocess(dataset, line, config):
    """Convert one raw ``(input, output)`` pair into a JSON-encoded example.

    Args:
        dataset: Dataset name. It selects dataset-specific handling
            (``[SEP]`` splitting, option-marker format, quarel clean-ups,
            the numbered-option fallback for ``ai2_arc``) and is written to
            the ``"task"`` field of the result.
        line: A two-item sequence ``(input, output)``. ``input`` must be a
            string; ``output`` is converted with ``str()``.
        config: Mapping with a ``"task_type"`` key. For
            ``"classification"`` it must also provide ``"options"`` with at
            least two entries. For ``"multi-choice"`` the options are parsed
            out of the input text. Any other task type yields an empty
            option list.

    Returns:
        A JSON string with keys ``task``, ``input``, ``output`` and
        ``options``, or ``None`` for ``superglue-multirc`` examples whose
        answer is ``"NO ANSWER!"`` or that have only a single option.

    Raises:
        AssertionError: If a dataset that requires a ``[SEP]`` marker does
            not contain exactly one, if fewer than two options are found,
            if an option still contains ``[SEP]``, or if the answer is not
            one of the parsed options.
    """
    input_, output_ = line
    # The raw text carries literal backslash-n sequences; flatten to spaces.
    input_ = input_.strip().replace("\\n", " ")
    # Only the text before the first tab is kept as the answer.
    output_ = str(output_).split("\t")[0].strip()

    if dataset == "superglue-multirc" and output_ == "NO ANSWER!":
        return None

    do_handle_sep = dataset.startswith("race-") or dataset in [
        "sciq", "social_i_qa", "wiqa", "quail", "superglue-multirc"
    ]

    if do_handle_sep:
        assert input_.count("[SEP]") == 1
        # The part after [SEP] is set aside and re-attached just before
        # returning, so option parsing only sees the part before it.
        input_, context = input_.split("[SEP]")

    # quail/quarel write option markers without surrounding spaces.
    if dataset in ["quail", "quarel"]:
        alphabet_options = ["(" + letter + ")" for letter in string.ascii_uppercase]
    else:
        alphabet_options = [" (" + letter + ") " for letter in string.ascii_uppercase]

    def get_sentences(text, markers):
        """Split *text* on consecutive markers, stopping at the first one missing."""
        sentences = []
        for marker in markers:
            if marker not in text:
                break
            head, text = text.split(marker, 1)
            sentences.append(head)
        sentences.append(text)
        return sentences

    options = []
    if config["task_type"] == "multi-choice":
        sentences = get_sentences(input_, alphabet_options)

        if len(sentences) > 1:
            sentences = [s.strip() for s in sentences]
            input_ = sentences[0]
            options = sentences[1:]

            if dataset == "quarel":
                # Drop the trailing " or" connective or a final period from
                # each option (an option cannot end with both).
                for i, option in enumerate(options):
                    if option.endswith(" or"):
                        options[i] = option[:-3]
                    elif option.endswith("."):
                        options[i] = option[:-1]

                # Strip a single trailing period from the answer, mirroring
                # the single strip applied to each option above. Doing this
                # once (not once per option) keeps answers that end in
                # several periods aligned with their option text.
                if output_.endswith("."):
                    output_ = output_[:-1]

                # Dataset-specific repairs for answers that do not match any
                # option verbatim.
                if output_ not in options and output_ + "or" in options:
                    output_ = output_ + "or"

                if (output_ not in options and output_.endswith(" or")
                        and output_[:-3] in options):
                    output_ = output_[:-3]

                # Three known answers are missing their first letter.
                if output_ not in options and output_ == "ext to construction site":
                    output_ = "n" + output_

                if output_ not in options and output_ == "oilet paper":
                    output_ = "t" + output_

                if output_ not in options and output_ == "aster":
                    output_ = "f" + output_

            if dataset == "superglue-multirc" and len(options) == 1:
                return None

            assert len(options) >= 2, (dataset, line)
            assert not any("[SEP]" in option for option in options), (dataset, line)
            assert output_ in options, (dataset, options, line)

        # ai2_arc sometimes labels options with numbers instead of letters.
        if len(options) == 0 and dataset == "ai2_arc":
            numbered_options = [" (" + str(i) + ") " for i in range(1, 100)]
            sentences = get_sentences(input_, numbered_options)

            if len(sentences) > 1:
                sentences = [s.strip() for s in sentences]
                input_ = sentences[0]
                options = sentences[1:]
                assert len(options) >= 2, (dataset, line)
                # Check for the full "[SEP]" marker, as in the lettered
                # branch; a bare "SEP" substring would reject legitimate
                # options such as "SEPTEMBER".
                assert not any("[SEP]" in option for option in options), (dataset, line)
                assert output_ in options, (dataset, line)

    elif config["task_type"] == "classification":
        assert len(config["options"]) >= 2
        options = config["options"]

    if do_handle_sep:
        # Re-attach the text that followed [SEP] in front of the question.
        # NOTE(review): no separator is inserted between the two parts —
        # confirm this is intended.
        input_ = context + input_

    return json.dumps({"task": dataset, "input": input_, "output": output_, "options": options})