in utils/compute_score.py [0:0]
# Punctuation folding applied to every prediction before it is parsed.
# The full-width forms are written as escapes so that re-encoding or Unicode
# normalisation of this file cannot silently turn the mapping into a no-op
# (a chain of ``replace(':', ':')``-style calls does nothing at all).
_PUNCT_TABLE = str.maketrans({
    '\uff1a': ':',  # full-width colon
    '\uff0c': ',',  # full-width comma
    '\uff1b': ';',  # full-width semicolon
    '\u3002': '.',  # ideographic full stop
    ' ': None,      # spaces are dropped entirely
})

# Last-resort patterns: a word, optional/required separators, then the first
# character of a sentiment word (正/负/中/无) followed by more word characters.
# Raw strings avoid invalid-escape warnings; the character class deliberately
# contains no '|', which would otherwise be accepted as a "sentiment".
_STRICT_PAIR_PATTERN = r'(\w+)[:\s]+([正负中无])\w+'
_RELAXED_PAIR_PATTERN = r'(\w+)[:\s]*([正负中无])\w+'


def _regex_pairs(text: str, pattern: str = _STRICT_PAIR_PATTERN) -> dict:
    """Return ``{name: sentiment_char}`` for every *pattern* hit in *text*."""
    return dict(re.findall(pattern, text))


def _parse_labelled(ind_info: str, class_info: str) -> dict:
    """Parse the two captured parts of a ``行业:...情感分类:...`` prediction.

    ``get_num_infos``, ``extract_re_exprs`` and ``pad_list`` are helpers
    defined elsewhere in this module.  From their use here, ``get_num_infos``
    returns a list of dicts that each carry a ``'sep'`` key; an empty list is
    treated as "no separator found".
    """
    ind_info = ind_info.strip('\n.;')
    class_info = class_info.strip('\n.;')
    class_seps = get_num_infos(class_info)

    if not class_seps:
        # A single sentiment: it applies to every listed industry.
        ind_seps = get_num_infos(ind_info)
        if ind_seps:
            return dict.fromkeys(ind_info.split(ind_seps[0]['sep']), class_info)
        return {ind_info: class_info}

    # The sentiment part lists several items.
    class_info_list = class_info.split(class_seps[0]['sep'])
    try:
        if len(class_seps) > 1:
            # Two-level form: every item is itself "industry:sentiment".
            # NOTE(review): the inner split uses a literal ':' rather than the
            # second detected separator; kept as-is -- confirm this is intended.
            return dict([tuple(item.split(':')) for item in class_info_list])

        # One-level form: industries and sentiments are two parallel lists.
        ind_seps = get_num_infos(ind_info)
        if not ind_seps:
            # No industry list to align with.  Fall back to the regex scan
            # instead of reusing a list left over from a previous row.
            return _regex_pairs(class_info)
        ind_info_list = ind_info.split(ind_seps[0]['sep'])
        # NOTE(review): the second pattern may originally have used full-width
        # parentheses; it is reproduced exactly as found -- confirm.
        labels = extract_re_exprs(class_info_list, ['.*[为是](.*)', '.*((.*))'])
        if None not in labels:
            return dict(zip(ind_info_list, labels))
        # Some label could not be extracted: use the raw items, padded with
        # '无' so that every industry still receives a value.
        max_len = max(len(ind_info_list), len(class_info_list))
        return dict(zip(ind_info_list, pad_list(class_info_list, max_len, '无')))
    except Exception:
        # Best effort: malformed items (e.g. a split that does not yield
        # exactly two parts) degrade to the regex scan instead of aborting.
        return _regex_pairs(class_info)


def _parse_extraction_result(content: str) -> dict:
    """Parse a prediction that mentions ``抽取结果`` (extraction result).

    ``strip``, ``get_num_infos`` and ``pad_list`` are helpers defined
    elsewhere in this module.
    """
    match_result = re.match('抽取结果.*?:(.*)', content, re.DOTALL)
    if match_result is None:
        # The marker is present but not as a leading "抽取结果...:" header.
        return _regex_pairs(content)

    body = strip(match_result.group(1))
    class_match = re.match('.*情感分类结果:(.*)', body, re.DOTALL)

    if class_match is None:
        # No separate sentiment section: items are either "name<sep>label"
        # pairs or bare names, which get the placeholder label '无'.
        try:
            seps = get_num_infos(body)
            if not seps:
                return {body: '无'}
            items = body.split(seps[0]['sep'])
            if len(seps) > 1:
                inner_sep = seps[1]['sep']
                return dict([tuple(item.split(inner_sep)) for item in items])
            return dict.fromkeys(items, '无')
        except Exception:
            return _regex_pairs(body)

    class_info = class_match.group(1)
    # Everything before the (last) "情感分类结果" marker names the industries.
    # This pattern always matches when the one above did; the guard only keeps
    # the code safe if either pattern is edited later.
    ind_match = re.match('(.*)情感分类结果', body, re.DOTALL)
    ind_info = ind_match.group(1).strip() if ind_match is not None else ''

    # NOTE(review): the separator is detected on the sentiment part and reused
    # for the industry part, exactly as before -- confirm this is intended.
    seps = get_num_infos(class_info)
    if seps:
        sep = seps[0]['sep']
        class_info_list = class_info.split(sep)
        ind_info_list = ind_info.split(sep)
    else:
        # Single item on each side; previously these lists were left
        # undefined (NameError) or stale from an earlier row.
        class_info_list = [class_info]
        ind_info_list = [ind_info]

    if len(class_info_list) != len(ind_info_list):
        max_len = max(len(ind_info_list), len(class_info_list))
        ind_info_list = pad_list(ind_info_list, max_len, '无')
        class_info_list = pad_list(class_info_list, max_len, '无')
    return dict(zip(ind_info_list, class_info_list))


def _parse_free_form(content: str):
    """Parse a prediction with neither the labelled nor the ``抽取结果`` form.

    Handles text such as "prediction results as follows: xxx:xxx,xxx:xxx" by
    handing whatever follows the "...结果...:" header (or the whole text when
    there is no such header) to ``decode_re_content``, a helper defined
    elsewhere in this module.  Its return value is passed through unchanged
    and may be ``None`` or empty; the caller handles both.

    A second attempt at ``^行业:(.*)情感分类:(.*)`` used to sit here.  It could
    never match, because ``re.match`` with the same pattern has already failed
    on the same string by the time this function is called, so it was removed.
    """
    header = re.search('^.*结果.*?:(.*)', content, re.DOTALL)
    if header is not None:
        return decode_re_content(strip(header.group(1)))
    return decode_re_content(content)


def industry_classification_result_process(sub_df):
    """Post-process industry sentiment extraction predictions.

    For each row of *sub_df*, the text in its ``predict`` column is
    normalised (outer whitespace stripped, full-width punctuation folded to
    ASCII, spaces removed, a comma directly before a newline dropped) and
    then parsed into a ``{industry: sentiment}`` dict.  Three layouts are
    tried in order:

    1. ``行业:<industries>情感分类:<sentiments>``
    2. text containing ``抽取结果``
    3. anything else, delegated to ``decode_re_content``

    If the chosen parser yields nothing, a relaxed regex scan over the whole
    text is used as a final fallback.

    Args:
        sub_df: pandas DataFrame with a ``predict`` column of strings.

    Returns:
        A list with one dict per row, in row order.  A dict is empty when
        nothing could be extracted from that row.

    Side effects:
        Prints a separator line with the row position for every row, and
        prints the normalised text of every row that produced no result.
    """
    contents_adj = []
    for i, raw_prediction in enumerate(sub_df['predict']):
        print(f'=============={i}==============')
        content = raw_prediction.strip().translate(_PUNCT_TABLE).replace(',\n', '\n')

        match_result = re.match('行业:(.*)情感分类:(.*)', content, re.DOTALL)
        if match_result is not None:
            class_info_dict = _parse_labelled(*match_result.groups())
        elif '抽取结果' in content:
            class_info_dict = _parse_extraction_result(content)
        else:
            class_info_dict = _parse_free_form(content)

        if not class_info_dict:
            # Covers both None and empty results from the parsers above.
            class_info_dict = _regex_pairs(content, _RELAXED_PAIR_PATTERN)
        if not class_info_dict:
            # Surface rows that could not be parsed at all.
            print(content)
        contents_adj.append(class_info_dict)
    return contents_adj