def industry_classification_result_process()

in utils/compute_score.py [0:0]


# Translation table applied to every raw prediction: full-width CJK punctuation
# becomes its ASCII counterpart and spaces are dropped, so the ASCII-only
# patterns used below can match. Code points are written as escapes so the
# table survives Unicode normalisation of this source file.
_PREDICT_CHAR_TABLE = str.maketrans({
    '\uff1a': ':',   # full-width colon
    '\uff0c': ',',   # full-width comma
    '\uff1b': ';',   # full-width semicolon
    '\u3002': '.',   # ideographic full stop
    ' ': None,
    '\u3000': None,  # ideographic space
})


def _regex_sentiment_pairs(text, require_sep=True):
    """Last-resort extraction of ``{name: sentiment_char}`` pairs by regex.

    A pair is a run of word characters, then colons/whitespace, then one of
    the sentiment characters 正/负/中/无 followed by at least one more word
    character. With ``require_sep=False`` the colon/whitespace run is optional.
    """
    quantifier = '+' if require_sep else '*'
    # The class lists the four characters only: a '|' inside [...] is a
    # literal pipe, not alternation.
    pattern = r'(\w+)[:\s]' + quantifier + r'([正负中无])\w+'
    return {name: label for name, label in re.findall(pattern, text)}


def _parse_labelled_prediction(ind_info, class_info):
    """Parse the two captured parts of a '行业:...情感分类:...' prediction.

    ``get_num_infos`` is a project helper; from its use here it returns a
    sequence of dicts carrying a ``'sep'`` key (ordering semantics are not
    visible from this function -- TODO confirm).
    """
    ind_info = ind_info.strip('\n.;')
    class_info = class_info.strip('\n.;')
    class_seps = get_num_infos(class_info)

    if len(class_seps) == 0:
        # A single sentiment value: apply it to every listed industry.
        ind_seps = get_num_infos(ind_info)
        if len(ind_seps) > 0:
            industries = ind_info.split(ind_seps[0]['sep'])
            return dict(zip(industries, [class_info] * len(industries)))
        return {ind_info: class_info}

    class_items = class_info.split(class_seps[0]['sep'])
    try:
        if len(class_seps) > 1:
            # Two-level form: every item is itself 'industry:sentiment'.
            # Items that do not split into exactly two parts raise ValueError
            # and are handled by the regex fallback below.
            return dict(item.split(':') for item in class_items)

        # One-level form: industries and sentiments are parallel lists.
        ind_seps = get_num_infos(ind_info)
        if len(ind_seps) == 0:
            # No industry list can be built for this row; use the regex
            # fallback explicitly instead of relying on an undefined (or
            # stale, from an earlier row) industry list.
            return _regex_sentiment_pairs(class_info)
        industries = ind_info.split(ind_seps[0]['sep'])
        # NOTE(review): the second pattern's nested group captures the whole
        # item; confirm whether literal (full-width) parentheses were intended.
        labels = extract_re_exprs(class_items, ['.*[为是](.*)', '.*((.*))'])
        if None not in labels:
            return dict(zip(industries, labels))
        max_len = max(len(industries), len(class_items))
        class_items = pad_list(class_items, max_len, '无')
        return dict(zip(industries, class_items))
    except Exception:
        # Best effort: the project helpers are opaque here, so any failure
        # degrades to the regex extraction rather than aborting the batch.
        return _regex_sentiment_pairs(class_info)


def _parse_extraction_result(content):
    """Parse a prediction that contains the marker '抽取结果'."""
    outer = re.match(r'抽取结果.*?:(.*)', content, re.DOTALL)
    if outer is None:
        return _regex_sentiment_pairs(content)

    body = strip(outer.group(1))
    class_match = re.match(r'.*情感分类结果:(.*)', body, re.DOTALL)
    if class_match is not None:
        class_info = class_match.group(1)
        ind_match = re.match(r'(.*)情感分类结果', body, re.DOTALL)
        class_seps = get_num_infos(class_info)
        if ind_match is None or len(class_seps) == 0:
            # Nothing usable; the caller applies its final regex fallback.
            return {}
        class_items = class_info.split(class_seps[0]['sep'])
        ind_info = ind_match.group(1).strip()
        # Separators for the industry list are detected on the industry text
        # itself, matching the '行业:' path; without one the text is kept as a
        # single industry.
        ind_seps = get_num_infos(ind_info)
        if len(ind_seps) > 0:
            industries = ind_info.split(ind_seps[0]['sep'])
        else:
            industries = [ind_info]
        if len(class_items) != len(industries):
            max_len = max(len(industries), len(class_items))
            industries = pad_list(industries, max_len, '无')
            class_items = pad_list(class_items, max_len, '无')
        return dict(zip(industries, class_items))

    try:
        seps = get_num_infos(body)
        if len(seps) == 0:
            return {body: '无'}
        items = body.split(seps[0]['sep'])
        if len(seps) > 1:
            inner_sep = seps[1]['sep']
            # Raises ValueError when an item is not exactly 'key<sep>value'.
            return dict(item.split(inner_sep) for item in items)
        return dict.fromkeys(items, '无')
    except Exception:
        # Best effort, as in _parse_labelled_prediction.
        return _regex_sentiment_pairs(body)


def industry_classification_result_process(sub_df):
    """Data processing for industry sentiment information extraction.

    Each value of the ``predict`` column is normalised (surrounding
    whitespace stripped, full-width punctuation mapped to ASCII, spaces
    removed, a comma directly before a newline dropped) and parsed into a
    dict. Three layouts are tried in order:

    1. text starting with '行业:' and containing '情感分类:';
    2. text containing '抽取结果';
    3. anything else, delegated to the project helper ``decode_re_content``.

    When a layout yields nothing, a permissive regex is applied to the whole
    text; if that is empty as well the text is printed for inspection.

    Args:
        sub_df: pandas DataFrame with a ``predict`` column of strings.

    Returns:
        A list with one dict per row, in row order.
    """
    contents_adj = []
    for i, raw in enumerate(sub_df['predict']):
        print(f'=============={i}==============')
        content = raw.strip().translate(_PREDICT_CHAR_TABLE).replace(',\n', '\n')

        labelled = re.match(r'行业:(.*)情感分类:(.*)', content, re.DOTALL)
        if labelled is not None:
            class_info_dict = _parse_labelled_prediction(*labelled.groups())
        elif '抽取结果' in content:
            class_info_dict = _parse_extraction_result(content)
        else:
            # Layout such as '<...>结果<...>:xxx:xxx,xxx:xxx'. A second
            # start-anchored '行业:...情感分类:...' attempt would be pointless
            # here: that exact match has already failed above.
            tail = re.match(r'.*结果.*?:(.*)', content, re.DOTALL)
            if tail is not None:
                class_info_dict = decode_re_content(strip(tail.group(1)))
            else:
                class_info_dict = decode_re_content(content)

        if not class_info_dict:
            class_info_dict = _regex_sentiment_pairs(content, require_sep=False)
            if not class_info_dict:
                # Surface predictions that could not be parsed at all.
                print(content)
        contents_adj.append(class_info_dict)
    return contents_adj