def read_SGD()

in TransferQA/data_loader.py [0:0]
78 lines of code
21 McCabe index (conditional complexity)

def read_SGD(args, path_name, tokenizer, dataset="test"):
    """Turn one split of an SGD-style dataset into QA-formatted examples.

    For every user turn, every frame in that turn and every slot kept in the
    schema of the frame's service, one "extractive" example is emitted. When
    the slot lists possible values and the turn's value is not "none", a
    second "multi-choice" example is emitted right after it.

    Args:
        args: Unused here; kept so existing callers keep working.
        path_name: Root directory of the dataset.
        tokenizer: Unused here; kept so existing callers keep working.
        dataset: Sub-directory of ``path_name`` holding the ``dialogues_*``
            files and ``schema.json`` (default ``"test"``).

    Returns:
        A ``(p_data, all_data)`` tuple. ``p_data`` is the list of example
        dicts; ``all_data`` is the list of raw dialogues as loaded from disk.
        The ``"ID"`` field of an example is the index of its dialogue in
        ``all_data``.

    Raises:
        AssertionError: If user turns are not at even positions or any other
            turn is not at an odd position within a dialogue.
    """
    choice_token = " <extra_id_0> "
    split_dir = os.path.join(path_name, dataset)

    all_data = _load_sgd_dialogues(split_dir)
    # adjust_sgd_questions is defined elsewhere in this module; it receives
    # the {service: {slot: (question, possible_values)}} mapping built below.
    schema = adjust_sgd_questions(_load_sgd_schema(split_dir))

    p_data = []
    for ID, dial in enumerate(all_data):
        # Running transcript of the dialogue up to and including this turn.
        dialog_history = ""

        for idx, turn in enumerate(dial["turns"]):
            # fix_number is defined elsewhere in this module.
            utterance = fix_number(turn["utterance"])

            if turn["speaker"] != "USER":
                # Non-user (system) turns only extend the history.
                assert idx % 2 == 1
                dialog_history += " Speaker: " + utterance
                continue

            # The user opens the conversation, so user turns sit at even idx.
            assert idx % 2 == 0
            dialog_history += " User: " + utterance

            for fid, frame in enumerate(turn["frames"]):
                service = frame["service"]
                service_schema = schema[service]

                for slot_name in service_schema:
                    # Slots absent from the annotated state are labelled
                    # "none"; otherwise the first listed value is used.
                    value_text = frame["state"]["slot_values"].get(slot_name, ["none"])[0]
                    question = service_schema[slot_name][0]
                    possible_values = service_schema[slot_name][1]

                    common = {
                        "ID": ID,
                        "dialogue_id": dial["dialogue_id"],
                        "domains": service,
                        "turn_id": idx,
                        "frame_id": fid,
                    }

                    input_text = f"extractive question: {question} context: {dialog_history}".strip().lower()
                    # NOTE: the key is spelled "intput_text" on purpose; it is
                    # part of the emitted record format and must not change.
                    p_data.append({
                        **common,
                        "intput_text": input_text,
                        "output_text": "dummy",
                        "slot_text": slot_name,
                        "value_text": value_text,
                        "question_type": "extractive",
                    })

                    if len(possible_values) > 0 and value_text != "none":
                        # Every candidate is prefixed with the choice token.
                        choices = (choice_token + choice_token.join(possible_values)).lower()
                        input_text = f"multi-choice question: {question} choices: {choices} context: {dialog_history}".strip().lower()
                        p_data.append({
                            **common,
                            "intput_text": input_text,
                            "output_text": "dummy",
                            "slot_text": slot_name,
                            "value_text": value_text,
                            "question_type": "multi-choice",
                        })

    # Preview the first few examples; slicing avoids an IndexError when the
    # split yields fewer than 13 of them.
    for sample in p_data[:13]:
        print(sample)
    return p_data, all_data


def _load_sgd_dialogues(split_dir):
    """Concatenate the contents of every ``dialogues_*`` file in ``split_dir``."""
    dialogues = []
    # os.listdir returns names in arbitrary order; sorting keeps the dialogue
    # order (and therefore the example IDs) reproducible across machines.
    for filename in sorted(os.listdir(split_dir)):
        if not filename.startswith("dialogues_"):
            continue
        with open(os.path.join(split_dir, filename), encoding="utf-8") as f:
            dialogues.extend(json.load(f))
    return dialogues


def _load_sgd_schema(split_dir):
    """Read ``schema.json`` into ``{service: {slot: (question, possible_values)}}``."""
    with open(os.path.join(split_dir, "schema.json"), encoding="utf-8") as f:
        services = json.load(f)

    question_words = ("what", "how", "whether", "which")
    schema = {}
    for service in services:
        # Only slots that some intent lists as required or optional are kept.
        used_slots = set()
        for intent in service["intents"]:
            used_slots.update(intent["required_slots"])
            used_slots.update(intent["optional_slots"])

        slot_questions = {}
        for slot in service["slots"]:
            if slot["name"] not in used_slots:
                continue
            description = slot["description"].lower()
            # Descriptions that already contain a question word only get a
            # question mark; the others are wrapped into a "what is the"
            # question. This is a substring check, not a whole-word match.
            if any(word in description for word in question_words):
                question = f"{description}?"
            else:
                question = f"what is the {description}?"
            slot_questions[slot["name"]] = (question, slot["possible_values"])

        schema[service["service_name"]] = slot_questions
    return schema