in TransferQA/data_loader.py [0:0]
def _load_sgd_dialogues(split_dir):
    """Concatenate every ``dialogues_*`` JSON file found in *split_dir*."""
    dialogues = []
    # os.listdir() yields names in arbitrary order; sorting keeps the
    # dialogue order (and therefore the generated "ID" values) reproducible.
    for filename in sorted(os.listdir(split_dir)):
        if filename.startswith("dialogues_"):
            with open(os.path.join(split_dir, filename), encoding="utf-8") as f:
                dialogues += json.load(f)
    return dialogues


def _build_sgd_schema(services):
    """Map service name -> {slot name: (question, possible_values)}.

    Only slots listed as required or optional by at least one intent of the
    service are kept.
    """
    check_list = ("what", "how", "whether", "which")
    schema = {}
    for service in services:
        # Slots referenced by any intent of this service.
        used_slots = set()
        for intent in service["intents"]:
            used_slots.update(intent["required_slots"])
            used_slots.update(intent["optional_slots"].keys())

        questions = {}
        for slot in service["slots"]:
            description = slot["description"].lower()
            # Descriptions that already contain a question word only get a
            # question mark; the others are wrapped as "what is the ...?".
            if any(c_l in description for c_l in check_list):
                description = f"{description}?"
            else:
                description = f"what is the {description}?"
            if slot["name"] in used_slots:
                questions[slot["name"]] = (description, slot["possible_values"])
        schema[service["service_name"]] = questions
    return schema


def _make_sgd_example(example_id, dialogue_id, domain, turn_id, frame_id,
                      input_text, slot, value, question_type):
    """Build one example record; key names and key order match the original."""
    return {
        "ID": example_id,
        "dialogue_id": dialogue_id,
        "domains": domain,
        "turn_id": turn_id,
        "frame_id": frame_id,
        # NOTE(review): the key is misspelled ("intput_text"); it is kept
        # as-is because consumers presumably read it under this name.
        "intput_text": input_text,
        "output_text": "dummy",
        "slot_text": slot,
        "value_text": value,
        "question_type": question_type,
    }


def read_SGD(args, path_name, tokenizer, dataset="test"):
    """Turn the dialogues of one data split into question-answering examples.

    Reads every ``dialogues_*`` file and ``schema.json`` from
    ``<path_name>/<dataset>``. For each user turn, each frame, and each schema
    slot of the frame's service, one "extractive" example is emitted. A second
    "multi-choice" example is emitted when the slot has possible values and
    the frame state holds a value for it.

    Args:
        args: Not used by this function.
        path_name: Directory that contains one sub-directory per split.
        tokenizer: Not used by this function.
        dataset: Name of the split sub-directory to read.

    Returns:
        A tuple ``(p_data, all_data)``: the list of example dicts and the list
        of raw dialogues as loaded from the JSON files.

    Raises:
        AssertionError: if user and non-user turns do not strictly alternate,
            starting with a user turn.
    """
    choice_token = " <extra_id_0> "
    split_dir = os.path.join(path_name, dataset)

    all_data = _load_sgd_dialogues(split_dir)
    with open(os.path.join(split_dir, "schema.json"), encoding="utf-8") as f:
        services = json.load(f)
    schema = adjust_sgd_questions(_build_sgd_schema(services))

    p_data = []
    for ID, dial in enumerate(all_data):
        dialog_history = ""
        for idx, turn in enumerate(dial["turns"]):
            utterance = fix_number(turn["utterance"])

            if turn["speaker"] != "USER":
                # Non-user turns only extend the running history.
                assert idx % 2 == 1
                dialog_history += " Speaker: " + utterance
                continue

            # The user opens the conversation, so user turns sit at even indices.
            assert idx % 2 == 0
            dialog_history += " User: " + utterance

            for fid, frame in enumerate(turn["frames"]):
                service_name = frame["service"]
                for k in schema[service_name]:
                    slot_info = schema[service_name][k]
                    question = slot_info[0]
                    possible_values = slot_info[1]
                    # Slots absent from the frame state are labelled "none";
                    # otherwise the first listed value is used.
                    value_text = frame["state"]["slot_values"].get(k, ["none"])[0]

                    input_text = (
                        f"extractive question: {question} context: {dialog_history}"
                    ).strip().lower()
                    p_data.append(_make_sgd_example(
                        ID, dial["dialogue_id"], service_name, idx, fid,
                        input_text, k, value_text, "extractive",
                    ))

                    if len(possible_values) > 0 and value_text != "none":
                        choices = (choice_token + choice_token.join(possible_values)).lower()
                        input_text = (
                            f"multi-choice question: {question} choices: {choices} "
                            f"context: {dialog_history}"
                        ).strip().lower()
                        p_data.append(_make_sgd_example(
                            ID, dial["dialogue_id"], service_name, idx, fid,
                            input_text, k, value_text, "multi-choice",
                        ))

    # Preview of the first examples; slicing avoids an IndexError when fewer
    # than 13 examples were produced.
    for example in p_data[:13]:
        print(example)
    return p_data, all_data