in preprocess/utils.py [0:0]
def preprocess(dataset, line, config):
    """Convert one raw ``(input, output)`` pair into a JSON-encoded example.

    The input text is cleaned, split into a question and its answer options
    (for multi-choice tasks), and serialized together with the gold output.

    Args:
        dataset: Dataset name. Several names trigger dataset-specific
            handling (see the inline comments).
        line: Two-item sequence ``(input, output)``. ``input`` must be a
            string; ``output`` is passed through ``str()``.
        config: Mapping with a ``"task_type"`` key. When it equals
            ``"classification"``, ``config["options"]`` supplies the label
            set. A task type other than ``"multi-choice"`` or
            ``"classification"`` yields an empty option list.

    Returns:
        A JSON string with the keys ``task``, ``input``, ``output`` and
        ``options``, or ``None`` for ``superglue-multirc`` examples whose
        output is ``"NO ANSWER!"`` or that parse to a single option.

    Raises:
        AssertionError: If a sanity check fails: a separator-based dataset
            without exactly one ``[SEP]``, fewer than two options, an option
            that still contains ``[SEP]``, or an output that is not one of
            the options.
    """

    def require(condition, detail):
        # Raise explicitly instead of using `assert`, so that validation is
        # not stripped when Python runs with -O. The exception type is the
        # one `assert` would have produced.
        if not condition:
            raise AssertionError(detail)

    def split_on_markers(text, markers):
        # Cut `text` at each marker in order, stopping at the first marker
        # that is absent. Returns [question, option_1, ..., option_k]; a
        # single-element list means no marker was found.
        pieces = []
        for marker in markers:
            head, found, tail = text.partition(marker)
            if not found:
                break
            pieces.append(head)
            text = tail
        pieces.append(text)
        return pieces

    input_, output_ = line
    # Replace literal backslash-n sequences with a space.
    input_ = input_.strip().replace("\\n", " ")
    # Keep only the text before the first tab of the output.
    output_ = str(output_).split("\t")[0].strip()

    if dataset == "superglue-multirc" and output_ == "NO ANSWER!":
        return None

    # For these datasets the input is "<question + options>[SEP]<context>".
    do_handle_sep = dataset.startswith("race-") or dataset in (
        "sciq", "social_i_qa", "wiqa", "quail", "superglue-multirc")
    if do_handle_sep:
        require(input_.count("[SEP]") == 1, (dataset, line))
        input_, context = input_.split("[SEP]")

    # quail/quarel mark options as "(A)"; all other datasets as " (A) ".
    if dataset in ("quail", "quarel"):
        letter_markers = ["(" + letter + ")" for letter in string.ascii_uppercase]
    else:
        letter_markers = [" (" + letter + ") " for letter in string.ascii_uppercase]

    options = []
    if config["task_type"] == "multi-choice":
        pieces = split_on_markers(input_, letter_markers)
        if len(pieces) > 1:
            pieces = [piece.strip() for piece in pieces]
            input_ = pieces[0]
            options = pieces[1:]

            if dataset == "quarel":
                # Drop the " or" connective or the final period that trails
                # an option; the two suffixes are mutually exclusive.
                for i, option in enumerate(options):
                    if option.endswith(" or"):
                        options[i] = option[:-3]
                    elif option.endswith("."):
                        options[i] = option[:-1]
                if output_.endswith("."):
                    output_ = output_[:-1]

            # Repairs for gold labels that do not match any parsed option.
            # Presumably these undo over-eager trimming in the raw data;
            # each one only fires while the label is still unmatched.
            if output_ not in options and output_ + "or" in options:
                output_ = output_ + "or"
            if (output_ not in options and output_.endswith(" or")
                    and output_[:-3] in options):
                output_ = output_[:-3]
            if output_ not in options and output_ == "ext to construction site":
                output_ = "n" + output_
            if output_ not in options and output_ == "oilet paper":
                output_ = "t" + output_
            if output_ not in options and output_ == "aster":
                output_ = "f" + output_

            if dataset == "superglue-multirc" and len(options) == 1:
                return None
            require(len(options) >= 2, (dataset, line))
            require(not any("[SEP]" in option for option in options),
                    (dataset, line))
            require(output_ in options, (dataset, options, line))

        if not options and dataset == "ai2_arc":
            # Fallback for ai2_arc when no lettered option was found: try
            # numeric markers " (1) " through " (99) ".
            pieces = split_on_markers(
                input_, [" (" + str(i) + ") " for i in range(1, 100)])
            if len(pieces) > 1:
                pieces = [piece.strip() for piece in pieces]
                input_ = pieces[0]
                options = pieces[1:]
                require(len(options) >= 2, (dataset, line))
                # Look for the separator token itself, matching the lettered
                # branch; a bare "SEP" substring rejected valid options.
                require(not any("[SEP]" in option for option in options),
                        (dataset, line))
                require(output_ in options, (dataset, line))

    elif config["task_type"] == "classification":
        require(len(config["options"]) >= 2, (dataset, config["options"]))
        options = config["options"]

    if do_handle_sep:
        # The context (text after [SEP]) is placed in front of the question.
        input_ = context + input_

    return json.dumps({"task": dataset, "input": input_, "output": output_, "options": options})