# evaluation/utils.py

import json
import os
import random
from pathlib import Path
from typing import Any, Iterable, Union

import numpy as np

from examples import get_examples


def set_seed(seed: int = 42) -> None:
    """Seed NumPy's and Python's global RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value passed to ``np.random.seed`` and ``random.seed`` and
            written (as a string) to ``os.environ["PYTHONHASHSEED"]``.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at startup, so this affects
    # processes that inherit the environment, not the hashing of this one.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


def load_jsonl(file: Union[str, Path]) -> Iterable[Any]:
    """Lazily yield one decoded JSON value per line of ``file``.

    Args:
        file: Path of a UTF-8 encoded JSON-lines file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        SystemExit: With exit status 1 if a line is not valid JSON; the
            offending line is printed first.
    """
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            # Only the decode sits inside the ``try``: catching everything
            # around the ``yield`` would also swallow the GeneratorExit that
            # is raised when a consumer stops iterating early.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                print("Error in loading:", line)
                # ``exit()`` comes from ``site`` (not always available) and
                # would report success; fail with a non-zero status instead.
                raise SystemExit(1) from err
            yield record


def save_jsonl(samples: Iterable[Any], save_path: Union[str, Path]) -> None:
    """Write ``samples`` to ``save_path`` as UTF-8 JSON lines.

    The parent directory is created when missing. Non-ASCII characters are
    written verbatim (``ensure_ascii=False``).

    Args:
        samples: Iterable of JSON-serializable objects, one per output line.
        save_path: Destination file; an existing file is overwritten.
    """
    folder = os.path.dirname(save_path)
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(save_path, "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
    print("Saved to", save_path)


def lower_keys(example: dict) -> dict:
    """Return a copy of ``example`` with every key lower-cased.

    Values are kept as-is. If two keys differ only by case, the one that
    comes later in iteration order wins.
    """
    return {key.lower(): value for key, value in example.items()}


EXAMPLES = get_examples()

# Datasets that share the "gaokao" few-shot demonstrations and that
# ``construct_prompt`` treats specially when ``args.adapt_few_shot`` is set.
_GAOKAO_STYLE_DATASETS = (
    "gaokao2024_I",
    "gaokao2024_II",
    "gaokao_math_qa",
    "gaokao2024_mix",
    "cn_middle_school",
)

# Dataset name -> key in EXAMPLES whose demonstrations it reuses.
_DEMO_SOURCE = {
    **dict.fromkeys(["gsm_hard", "svamp", "tabmwp", "asdiv", "mawps"], "gsm8k"),
    **dict.fromkeys(
        ["math_oai", "hungarian_exam", "math-oai", "aime24", "amc23"], "math"
    ),
    "sat_math": "mmlu_stem",
    **dict.fromkeys(_GAOKAO_STYLE_DATASETS, "gaokao"),
}


def load_prompt(data_name: str, prompt_type: str, num_shots: int) -> list:
    """Return the first ``num_shots`` few-shot demonstrations for a dataset.

    Args:
        data_name: Dataset name; names listed in ``_DEMO_SOURCE`` are mapped
            to the dataset whose demonstrations they reuse.
        prompt_type: Accepted for interface compatibility; it does not
            influence which demonstrations are returned.
        num_shots: Number of demonstrations wanted; a falsy value yields ``[]``.

    Returns:
        A slice of ``EXAMPLES[<resolved name>]``.

    Raises:
        KeyError: If the resolved dataset name has no entry in ``EXAMPLES``.
    """
    if not num_shots:
        return []
    source = _DEMO_SOURCE.get(data_name, data_name)
    return EXAMPLES[source][:num_shots]


# prompt type -> (input template, output template, separator between demos)
PROMPT_TEMPLATES = {
    "direct": ("Question: {input}\nAnswer: ", "{output}", "\n\n"),
    "cot": ("Question: {input}\nAnswer: ", "{output}", "\n\n\n"),
    "pal": ("Question: {input}\n\n", "{output}", "\n---\n"),
    "tool-integrated": ("Question: {input}\n\nSolution:\n", "{output}", "\n---\n"),
    "self-instruct": ("<|user|>\n{input}\n<|assistant|>\n", "{output}", "\n"),
    "tora": ("<|user|>\n{input}\n<|assistant|>\n", "{output}", "\n"),
    "wizard_zs": (
        "### Instruction:\n{input}\n\n### Response: Let's think step by step.",
        "{output}",
        "\n\n\n",
    ),
    "platypus_fs": (
        "### Instruction:\n{input}\n\n### Response:\n",
        "{output}",
        "\n\n\n",
    ),
    "deepseek-math": (
        "User: {input}\nPlease reason step by step, "
        "and put your final answer within \\boxed{{}}.\n\nAssistant:",
        "{output}",
        "\n\n\n",
    ),
    "kpmath": (
        "User: Please reason step by step and put your final answer at the end "
        'with "The answer is: ".\n\n{input}\n\nAssistant:',
        "{output}",
        # This entry used to have only two elements, which made
        # construct_prompt fail on the separator lookup. The separator is
        # copied from the similar "deepseek-math" template.
        # NOTE(review): confirm the intended few-shot separator for kpmath.
        "\n\n\n",
    ),
    "jiuzhang": (
        "## Question\n{input}\n\n## Solution\n",
        "{output}",
        "\n\n\n",
    ),
    "jiuzhang_tora": (
        "## Question\n{input}\n\n## Code Solution\n",
        "{output}",
        "\n\n\n",
    ),
    "jiuzhang_nl": (
        "## Question\n{input}\n\n## Natural Language Solution\n",
        "{output}",
        "\n\n\n",
    ),
    "mmiqc": (
        'Please solve the following problem and put your answer at the end with "The answer is: ".\n\n{input}\n\n',
        "{output}",
        "\n\n\n",
    ),
    "abel": (
        "Question:\n{input}\nAnswer:\nLet's think step by step.\n",
        "{output}",
        "\n\n",
    ),
    "shepherd": ("{input}\n", "{output}", "\n\n\n"),
    "qwen-boxed": (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        "<|im_start|>user\n{input}\nPlease reason step by step, and put your final answer within \\boxed{{}}.<|im_end|>\n"
        "<|im_start|>assistant\n",
        "{output}",
        "\n\n",
    ),
    "qwen25-math-cot": (
        "<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{{}}.<|im_end|>\n"
        "<|im_start|>user\n{input}<|im_end|>\n"
        "<|im_start|>assistant\n",
        "{output}",
        "\n\n",
    ),
    "mathstral": (
        "{input}\nPlease reason step by step, and put your final answer within \\boxed{{}}.",
        "{output}",
        "\n\n",
    ),
    "internlm-math-fs": ("Question:{input}\nAnswer:", "{output}", "\n"),
    "internlm-math-chat": (
        "<|im_start|>user\n{input}<|im_end|>\n" "<|im_start|>assistant\n",
        "{output}",
        "\n\n",
    ),
    "mistral": (
        "[INST] {input}[/INST]",
        "{output}",
        "\n\n",
    ),
    "numina": ("### Problem: {input}\n### Solution:", " {output}", "\n\n"),
}

# Instruction block prepended to "tora" / "tool-integrated" prompts.
# The backslashes are escaped on purpose: in a plain literal "\boxed" is a
# backspace character followed by "oxed", and "\pi" is an invalid escape.
# NOTE(review): the line breaks of this text were reconstructed from its
# bullet-list structure; confirm them against the reference prompt.
_TORA_PREAMBLE = (
    "Integrate step-by-step reasoning and Python code to solve math problems "
    "using the following guidelines:\n"
    "\n"
    "- Analyze the question and write functions to solve the problem; "
    "the function should not take any arguments.\n"
    "- Present the final result in LaTeX using a `\\boxed{}` without any units.\n"
    "- Utilize the `pi` symbol and `Rational`` from Sympy for $\\pi$ and fractions, "
    "and simplify all fractions and square roots without converting them to "
    "decimal values.\n"
    "\n"
    "Here are some examples you may refer to:\n"
    "\n"
    "---\n"
    "\n"
)

_PLATYPUS_WRAPPER = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)


def construct_prompt(example: dict, data_name: str, args: Any) -> str:
    """Build the full (optionally few-shot) prompt for one example.

    Args:
        example: Must contain ``"question"``; ``"gt_ans"`` is also read when
            ``args.adapt_few_shot`` is truthy and demonstrations exist.
        data_name: Dataset name, used to pick the demonstrations.
        args: Object exposing ``prompt_type`` (a key of ``PROMPT_TEMPLATES``),
            ``num_shots`` and ``adapt_few_shot``.

    Returns:
        The prompt text with leading/trailing spaces (only spaces, not
        newlines) stripped.

    Raises:
        KeyError: If ``args.prompt_type`` is not in ``PROMPT_TEMPLATES``.
    """
    # With adapt_few_shot, gaokao-style datasets always use 5 demonstrations.
    if args.adapt_few_shot and data_name in _GAOKAO_STYLE_DATASETS:
        num_shots = 5
    else:
        num_shots = args.num_shots
    demos = load_prompt(data_name, args.prompt_type, num_shots)

    input_template, output_template, splitter = PROMPT_TEMPLATES[args.prompt_type]
    single_turn = args.prompt_type == "qwen25-math-cot"

    if single_turn:
        # Hotfix: put all demos into a single user turn instead of
        # rendering each one through the chat template.
        demo_prompt = splitter.join([q + "\n" + a for q, a in demos])
    else:
        demo_prompt = splitter.join(
            [
                input_template.format(input=q) + output_template.format(output=a)
                for q, a in demos
            ]
        )

    context = input_template.format(input=example["question"])
    # Zero-shot when there are no demos, or when adapt_few_shot is on and the
    # reference answer is not a multiple-choice letter.
    if not demo_prompt or (
        args.adapt_few_shot and example["gt_ans"] not in ["A", "B", "C", "D", "E"]
    ):
        full_prompt = context
    elif single_turn:
        # Demos and the question share one turn, so the template is applied
        # once around the combined text.
        full_prompt = input_template.format(
            input=demo_prompt + splitter + example["question"]
        )
    else:
        full_prompt = demo_prompt + splitter + context

    if args.prompt_type == "platypus_fs":
        full_prompt = _PLATYPUS_WRAPPER.format(instruction=full_prompt)

    # "tool-integrated" is treated as "tora" for the instruction preamble.
    if args.prompt_type in ("tora", "tool-integrated"):
        full_prompt = _TORA_PREAMBLE + full_prompt

    return full_prompt.strip(" ")  # important!


# Sample key -> label printed by show_sample.
key_map = {
    "gt": "Ground Truth",
    "pred": "Prediction",
    "gt_cot": "Reference CoT",
    "score": "Score",
}


def show_sample(sample: dict, print_all_preds: bool = False) -> None:
    """Pretty-print one evaluated sample to stdout.

    Args:
        sample: Must contain ``"question"``. The optional keys ``idx``,
            ``type``, ``level``, ``dataset``, ``code``, ``report``, ``pred``,
            ``gt``, ``score``, ``unit`` and ``gt_cot`` are printed when
            present (``report`` is read whenever ``code`` is present).
        print_all_preds: Print every entry of ``sample["code"]`` instead of
            only the first one.
    """
    print("==" * 20)
    for key in ["idx", "type", "level", "dataset"]:
        if key in sample:
            # Upper-case only the first character of the label.
            print(f"{key[0].upper() + key[1:]}: {sample[key]}")
    print("Question:", repr(sample["question"]))
    if "code" in sample:
        if print_all_preds:
            for code in sample["code"]:
                print("-" * 20)
                print("code:", code)
                # The whole report object is printed after each code entry.
                print("Execution:", sample["report"])
        else:
            print("Solution:\n", sample["code"][0])
            print("Execution:", sample["report"][0])
    if "pred" in sample:
        print("Prediction:", repr(sample["pred"][0]))
    for key in ["gt", "score", "unit", "gt_cot"]:
        if key in sample:
            label = key_map.get(key, key)
            print(f"{label}: {sample[key]!r}")
    print()

# evaluation/utils.py (228 lines of code) (raw)