data/multiwoz/utlis/postprocessing_dataset.py [5:61]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    text = re.sub(',', '', text)
    text = ' '.join(text.split()).strip()
    return text

def _strip_bs_markers(text):
    # Drop one leading '<sos_b>' and one trailing '<eos_b>' as whole tokens.
    # str.strip('<sos_b>') would instead treat the argument as a set of
    # characters and could eat the ends of real content (e.g. 'chinese' -> 'chin').
    text = text.strip()
    if text.startswith('<sos_b>'):
        text = text[len('<sos_b>'):]
    if text.endswith('<eos_b>'):
        text = text[:-len('<eos_b>')]
    return text.strip()


def split_bs_text_by_domain(bs_text, bsdx_text):
    """Split a belief-state string and its delexicalised twin into per-domain pieces.

    Domain markers are the whitespace-separated tokens of ``bsdx_text`` that
    start with '[' and end with ']'. Both inputs may optionally be wrapped in
    '<sos_b>' / '<eos_b>'; the wrappers are removed before splitting.

    Args:
        bs_text: belief-state text containing domain markers, slots and values.
        bsdx_text: belief-state text containing domain markers and slot names.

    Returns:
        A pair ``(bs_list, bsdx_list)`` of equally long lists. Entry ``i`` of
        each list is ``'<domain marker> <content>'`` for the i-th domain marker
        found in ``bsdx_text``. Both lists are empty when no marker is found.

    Raises:
        IndexError: if there are several domain markers and one of them does
            not occur in ``bs_text``.
    """
    bs_text = _strip_bs_markers(bs_text)
    bsdx_text = _strip_bs_markers(bsdx_text)
    domain_list = [
        token for token in bsdx_text.split()
        if token.startswith('[') and token.endswith(']')
    ]
    if len(domain_list) == 1:  # only one domain: nothing to split
        return [bs_text], [bsdx_text]

    bs_list, bsdx_list = [], []
    last_idx = len(domain_list) - 1
    for idx, curr_domain in enumerate(domain_list):
        # Everything after the current marker ...
        bs_rest = bs_text.split(curr_domain)[1]
        bsdx_rest = bsdx_text.split(curr_domain)[1]
        if idx != last_idx:
            # ... cut off where the next domain begins.
            next_domain = domain_list[idx + 1]
            bs_rest = bs_rest.split(next_domain)[0]
            bsdx_rest = bsdx_rest.split(next_domain)[0]
        # Strip the remainder first so every snippet has exactly one space
        # after the marker; the outer strip covers a domain with no content.
        bs_list.append((curr_domain + ' ' + bs_rest.strip()).strip())
        bsdx_list.append((curr_domain + ' ' + bsdx_rest.strip()).strip())
    return bs_list, bsdx_list

def parse_bs_bsdx(bs_text, bsdx_text):
    # this function handles the belief state of a single domain
    # we assume there are no repeated slots in the bsdx text
    dx_token_list = bsdx_text.split()

    bs_name_list = bsdx_text.split()
    token_num = len(bs_name_list)
    map_dict = {}
    res_bs_text = ''
    res_bsdx_text = ''
    for idx in range(token_num):
        curr_slot = bs_name_list[idx]
        if curr_slot.startswith('[') and curr_slot.endswith(']'):
            continue
        else:
            if idx == token_num - 1:
                #curr_value = bs_text.split(' ' + curr_slot + ' ')[-1].strip()
                curr_value = bs_text.split(' ' + curr_slot + ' ')[-1].strip()
            else:
                next_slot = bs_name_list[idx+1]
                curr_value = bs_text.split(' ' + curr_slot + ' ')[1].split(' ' + next_slot)[0].strip()
            map_dict[curr_slot] = curr_value
    for curr_slot in bs_name_list:
        if curr_slot.startswith('[') and curr_slot.endswith(']'):
            res_bs_text += curr_slot + ' '
            res_bsdx_text += curr_slot + ' '
        else:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


data/multiwoz/utlis/processing_funcs.py [4:60]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    text = re.sub(',', '', text)
    text = ' '.join(text.split()).strip()
    return text

def _strip_bs_markers(text):
    # Drop one leading '<sos_b>' and one trailing '<eos_b>' as whole tokens.
    # str.strip('<sos_b>') would instead treat the argument as a set of
    # characters and could eat the ends of real content (e.g. 'chinese' -> 'chin').
    text = text.strip()
    if text.startswith('<sos_b>'):
        text = text[len('<sos_b>'):]
    if text.endswith('<eos_b>'):
        text = text[:-len('<eos_b>')]
    return text.strip()


def split_bs_text_by_domain(bs_text, bsdx_text):
    """Split a belief-state string and its delexicalised twin into per-domain pieces.

    Domain markers are the whitespace-separated tokens of ``bsdx_text`` that
    start with '[' and end with ']'. Both inputs may optionally be wrapped in
    '<sos_b>' / '<eos_b>'; the wrappers are removed before splitting.

    Args:
        bs_text: belief-state text containing domain markers, slots and values.
        bsdx_text: belief-state text containing domain markers and slot names.

    Returns:
        A pair ``(bs_list, bsdx_list)`` of equally long lists. Entry ``i`` of
        each list is ``'<domain marker> <content>'`` for the i-th domain marker
        found in ``bsdx_text``. Both lists are empty when no marker is found.

    Raises:
        IndexError: if there are several domain markers and one of them does
            not occur in ``bs_text``.
    """
    bs_text = _strip_bs_markers(bs_text)
    bsdx_text = _strip_bs_markers(bsdx_text)
    domain_list = [
        token for token in bsdx_text.split()
        if token.startswith('[') and token.endswith(']')
    ]
    if len(domain_list) == 1:  # only one domain: nothing to split
        return [bs_text], [bsdx_text]

    bs_list, bsdx_list = [], []
    last_idx = len(domain_list) - 1
    for idx, curr_domain in enumerate(domain_list):
        # Everything after the current marker ...
        bs_rest = bs_text.split(curr_domain)[1]
        bsdx_rest = bsdx_text.split(curr_domain)[1]
        if idx != last_idx:
            # ... cut off where the next domain begins.
            next_domain = domain_list[idx + 1]
            bs_rest = bs_rest.split(next_domain)[0]
            bsdx_rest = bsdx_rest.split(next_domain)[0]
        # Strip the remainder first so every snippet has exactly one space
        # after the marker; the outer strip covers a domain with no content.
        bs_list.append((curr_domain + ' ' + bs_rest.strip()).strip())
        bsdx_list.append((curr_domain + ' ' + bsdx_rest.strip()).strip())
    return bs_list, bsdx_list

def parse_bs_bsdx(bs_text, bsdx_text):
    # this function handles the belief state of a single domain
    # we assume there are no repeated slots in the bsdx text
    dx_token_list = bsdx_text.split()

    bs_name_list = bsdx_text.split()
    token_num = len(bs_name_list)
    map_dict = {}
    res_bs_text = ''
    res_bsdx_text = ''
    for idx in range(token_num):
        curr_slot = bs_name_list[idx]
        if curr_slot.startswith('[') and curr_slot.endswith(']'):
            continue
        else:
            if idx == token_num - 1:
                #curr_value = bs_text.split(' ' + curr_slot + ' ')[-1].strip()
                curr_value = bs_text.split(' ' + curr_slot + ' ')[-1].strip()
            else:
                next_slot = bs_name_list[idx+1]
                curr_value = bs_text.split(' ' + curr_slot + ' ')[1].split(' ' + next_slot)[0].strip()
            map_dict[curr_slot] = curr_value
    for curr_slot in bs_name_list:
        if curr_slot.startswith('[') and curr_slot.endswith(']'):
            res_bs_text += curr_slot + ' '
            res_bsdx_text += curr_slot + ' '
        else:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -