exec_application_eval.py:

import argparse
import os

from tqdm import tqdm

from evaluator.evaluator import load_models_tokenizer, load_llama_models_tokenizer
from utils.dataset import load_dataset
from utils.compute_score import *  # provides the compute_* metric helpers used below


def eval_application(args):
    # Load model & tokenizer (models whose name contains 'llama' use the LLaMA loader)
    if 'llama' in args.model_name:
        model, tokenizer = load_llama_models_tokenizer(args)
    else:
        model, tokenizer = load_models_tokenizer(args)

    # Load the evaluation set
    dataset = load_dataset(args.eval_type)

    # Run model inference on each instruction and record the answers
    responses = []
    for _, record in tqdm(dataset.iterrows()):
        prompt = record['instruction']
        model_response, _ = model.chat(
            tokenizer,
            prompt,
            history=None,
        )
        responses.append(model_response)

    result_path = os.path.join(args.save_result_dir,
                               f"{args.model_name}_application_result.json")
    if args.save_result_dir:
        dataset["model_response"] = responses
        os.makedirs(args.save_result_dir, exist_ok=True)
        dataset.to_json(result_path, orient='records', force_ascii=False)

    # Compute the application scores
    get_application_score(args)


def get_application_score(args):
    _path = args.save_result_dir
    file_path = f'{_path}/{args.model_name}_application_result.json'
    result = {}
    print(f'Model: {args.model_name}')

    # QA: question answering (ROUGE-L and BERTScore)
    rouge_l, qa_bert = compute_finqa(file_path)
    result['QA'] = {'rouge-L': rouge_l, 'Bert': qa_bert}

    # TG: text generation (ROUGE-L and BERTScore)
    rouge_l_tg, _, tg_bert, _ = compute_text_generation(file_path)
    result['TG'] = {'rouge-L': rouge_l_tg, 'Bert': tg_bert}

    # MT-e2zh: English-to-Chinese machine translation (BLEU and COMET)
    bleu, comet = compute_nmt_en2zh(file_path)
    result['MT-e2zh'] = {'BLEU': bleu, 'COMET': comet}

    # MT-zh2e: Chinese-to-English machine translation (BLEU and COMET)
    bleu, comet = compute_nmt_zh2en(file_path)
    result['MT-zh2e'] = {'BLEU': bleu, 'COMET': comet}

    # TC: text classification (accuracy)
    acc, _ = compute_text_classification(file_path)
    result['TC'] = {'ACC': acc}

    # RE: extraction (F1)
    f1, _ = compute_extraction(file_path)
    result['RE'] = {'F1-score': f1}

    return result  # hand the aggregated scores back to the caller
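
# ---------------------------------------------------------------------------
# Hypothetical CLI entry point: a minimal sketch, not part of the original
# file. The module imports argparse, and eval_application() reads
# args.model_name, args.eval_type, and args.save_result_dir, so a parser
# along these lines is assumed; the flag names and defaults below are
# illustrative guesses, not the project's actual interface.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run the application-level evaluation.')
    parser.add_argument('--model_name', type=str, required=True,
                        help="Model identifier; names containing 'llama' use the LLaMA loader.")
    parser.add_argument('--eval_type', type=str, default='application',
                        help='Evaluation split passed to load_dataset().')
    parser.add_argument('--save_result_dir', type=str, default='results',
                        help='Directory for the per-model result JSON.')
    eval_application(parser.parse_args())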