in TransferQA/data_loader.py [0:0]
def _slot_filter(args, dataset):
    """Return the slot-name predicate for the active domain setting, or None when no filtering applies."""
    except_domain = args["except_domain"]
    only_domain = args["only_domain"]
    if except_domain != "none":
        if dataset == "train" or dataset == "dev":
            # Train/dev splits drop every slot whose name mentions the held-out domain.
            return lambda slot: except_domain not in slot
        # Any other split keeps only the held-out domain's slots.
        return lambda slot: except_domain in slot
    if only_domain != "none":
        return lambda slot: only_domain in slot
    return None


def _make_example(dial_dict, turn_id, dialog_history, turn_belief,
                  input_text, output_text, slot, value_text, question_type):
    """Assemble one example record for a (turn, slot, question type) triple."""
    return {
        "ID": dial_dict["dial_id"],
        "domains": dial_dict["domains"],
        "turn_id": turn_id,
        "dialog_history": dialog_history,
        "turn_belief": turn_belief,
        # NOTE(review): the key is misspelled ("intput_text"); it is kept as-is
        # because code outside this block presumably reads it under this name.
        "intput_text": input_text,
        "output_text": output_text,
        "slot_text": slot,
        "value_text": value_text,
        "question_type": question_type,
    }


def read_data(args, path_name, SLOTS, tokenizer, description, dataset=None):
    """Read a dialogue JSON file and expand it into question-answering examples.

    Every kept turn produces, for each slot in the domain-dependent slot list,
    one "extractive" example and, when the slot description lists candidate
    values and the slot has a value other than "none", one additional
    "multi-choice" example.

    Args:
        args: mapping with the keys "fewshot", "seed", "only_domain",
            "except_domain" and "gold_slots".
        path_name: path of a JSON file holding a list of dialogues. Each
            dialogue provides "dial_id", "domains" and "turns"; each turn
            provides "system", "user" and state -> "slot_values".
        SLOTS: all slot names; the domain is the part before the first "-".
        tokenizer: object exposing ``eos_token``, appended to every target.
        description: per-slot mapping with a "question" string and a "values"
            list of candidate answers.
        dataset: split name. "train" enables few-shot subsampling; "train" and
            "dev" exclude the ``except_domain`` slots while any other value
            keeps only those slots.

    Returns:
        A ``(data, slot_temp)`` tuple: the list of example dicts and the slot
        list that was used for this split (``SLOTS`` itself when no domain
        filter is active).
    """
    choice_token = " <extra_id_0> "
    print(("Reading all files from {}".format(path_name)))
    data = []
    domain_counter = {}

    # Only the load needs the file handle; release it before the long pass.
    with open(path_name) as f:
        dials = json.load(f)

    if dataset == "train" and args["fewshot"] > 0:
        # Seeded shuffle, then keep the leading fraction of the dialogues.
        random.Random(args["seed"]).shuffle(dials)
        dials = dials[:int(len(dials) * args["fewshot"])]

    only_domain = args["only_domain"]
    except_domain = args["except_domain"]

    # The slot list depends only on args/dataset/SLOTS, so build it once.
    # Doing so up front also keeps the return value defined when every
    # dialogue is filtered out or the file holds no turns.
    keep_slot = _slot_filter(args, dataset)
    slot_temp = SLOTS if keep_slot is None else [k for k in SLOTS if keep_slot(k)]

    for dial_dict in dials:
        domains = dial_dict["domains"]
        dialog_history = ""

        # Domain statistics are collected before filtering, so they describe
        # the (possibly few-shot subsampled) input rather than the output.
        for domain in domains:
            if domain in EXPERIMENT_DOMAINS:
                domain_counter[domain] = domain_counter.get(domain, 0) + 1

        # Unseen-domain setting.
        if only_domain != "none" and only_domain not in domains:
            continue
        if except_domain != "none":
            if dataset == "test":
                # The test split keeps only dialogues touching the held-out domain.
                if except_domain not in domains:
                    continue
            elif [except_domain] == domains:
                # Other splits drop dialogues that are exclusively about it.
                continue

        for turn_id, turn in enumerate(dial_dict["turns"]):
            # Accumulate dialogue utterances; each example sees the history so far.
            dialog_history += (" system: " + turn["system"] + " user: " + turn["user"])
            slot_values = fix_general_label_error(turn["state"]["slot_values"], SLOTS)
            if keep_slot is not None:
                slot_values = OrderedDict(
                    (k, v) for k, v in slot_values.items() if keep_slot(k)
                )

            turn_belief_list = [
                str(k) + '-' + str(v) for k, v in slot_values.items() if v != "none"
            ]

            for slot in slot_temp:
                # Skip slots of domains this dialogue does not cover
                # (out-of-domain setting, non-test splits only).
                if (except_domain != "none" and dataset != "test"
                        and slot.split("-")[0] not in domains):
                    continue

                slot_lang = description[slot]["question"]
                value_text = slot_values.get(slot, 'none').strip()

                if args["gold_slots"] and value_text == "none":
                    continue

                # input: question + dialogue history, output: value
                input_text = f"extractive question: {slot_lang} context: {dialog_history}".lower()
                output_text = value_text + f" {tokenizer.eos_token}"
                data.append(_make_example(
                    dial_dict, turn_id, dialog_history, turn_belief_list,
                    input_text, output_text, slot, value_text, "extractive",
                ))

                if len(description[slot]["values"]) > 0 and value_text != "none":
                    # Every candidate value is prefixed with the choice token.
                    choices = (choice_token + choice_token.join(description[slot]["values"])).lower()
                    input_text = f"multi-choice question: {slot_lang} choices: {choices} context: {dialog_history}".lower()
                    output_text = (value_text + f" {tokenizer.eos_token}").lower()
                    data.append(_make_example(
                        dial_dict, turn_id, dialog_history, turn_belief_list,
                        input_text, output_text, slot, value_text, "multi-choice",
                    ))

    # Preview at most ten examples; slicing tolerates shorter datasets.
    for example in data[:10]:
        print(example)
    print("domain_counter", domain_counter)
    return data, slot_temp