# simple_evals.py
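# Illustrative invocations (flag names come from the parser below; model names from the registry):
#   python simple_evals.py --list-models
#   python simple_evals.py --model gpt-4.1 --examples 10
#   python simple_evals.py --model o3 --debug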
import argparse
import json

import pandas as pd

# Repo-local imports; module paths assume the standard simple-evals layout (common.py, *_eval.py, sampler/).
import common
from browsecomp_eval import BrowseCompEval
from drop_eval import DropEval
from gpqa_eval import GPQAEval
from humaneval_eval import HumanEval
from math_eval import MathEval
from mgsm_eval import MGSMEval
from mmlu_eval import MMLUEval
from simpleqa_eval import SimpleQAEval
from sampler.chat_completion_sampler import (
    OPENAI_SYSTEM_MESSAGE_API,
    OPENAI_SYSTEM_MESSAGE_CHATGPT,
    ChatCompletionSampler,
)
from sampler.claude_sampler import CLAUDE_SYSTEM_MESSAGE_LMSYS, ClaudeCompletionSampler
from sampler.o_chat_completion_sampler import OChatCompletionSampler
from sampler.responses_sampler import ResponsesSampler


def main():
parser = argparse.ArgumentParser(
description="Run sampling and evaluations using different samplers and evaluations."
)
parser.add_argument(
"--list-models", action="store_true", help="List available models"
)
parser.add_argument("--model", type=str, help="Select a model by name")
parser.add_argument("--debug", action="store_true", help="Run in debug mode")
parser.add_argument(
"--examples", type=int, help="Number of examples to use (overrides default)"
)
args = parser.parse_args()
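# Registry of available samplers, keyed by the names accepted by --model and listed by --list-models.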
models = {
# Reasoning Models
"o3": ResponsesSampler(
model="o3-2025-04-16",
reasoning_model=True,
),
"o3_high": ResponsesSampler(
model="o3-2025-04-16",
reasoning_model=True,
reasoning_effort="high",
),
"o3_low": ResponsesSampler(
model="o3-2025-04-16",
reasoning_model=True,
reasoning_effort="low",
),
# Default reasoning effort == medium
"o4-mini": ResponsesSampler(
model="o4-mini-2025-04-16",
reasoning_model=True,
),
"o4-mini_high": ResponsesSampler(
model="o4-mini-2025-04-16",
reasoning_model=True,
reasoning_effort="high",
),
"o4-mini_low": ResponsesSampler(
model="o4-mini-2025-04-16",
reasoning_model=True,
reasoning_effort="low",
),
"o1": OChatCompletionSampler(
model="o1",
),
"o1-preview": OChatCompletionSampler(
model="o1-preview",
),
"o1-mini": OChatCompletionSampler(
model="o1-mini",
),
# Default reasoning effort == medium
"o3-mini": OChatCompletionSampler(
model="o3-mini",
),
"o3-mini_high": OChatCompletionSampler(
model="o3-mini",
reasoning_effort="high",
),
"o3-mini_low": OChatCompletionSampler(
model="o3-mini",
reasoning_effort="low",
),
# GPT-4.1 models
"gpt-4.1": ChatCompletionSampler(
model="gpt-4.1-2025-04-14",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
"gpt-4.1-mini": ChatCompletionSampler(
model="gpt-4.1-mini-2025-04-14",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
"gpt-4.1-nano": ChatCompletionSampler(
model="gpt-4.1-nano-2025-04-14",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
# GPT-4o models
"gpt-4o": ChatCompletionSampler(
model="gpt-4o",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
"gpt-4o-mini": ChatCompletionSampler(
model="gpt-4o-mini-2024-07-18",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
# GPT-4.5 model
"gpt-4.5-preview": ChatCompletionSampler(
model="gpt-4.5-preview-2025-02-27",
system_message=OPENAI_SYSTEM_MESSAGE_API,
max_tokens=2048,
),
# GPT-4-turbo model
"gpt-4-turbo-2024-04-09": ChatCompletionSampler(
model="gpt-4-turbo-2024-04-09",
system_message=OPENAI_SYSTEM_MESSAGE_API,
),
# ChatGPT models:
"chatgpt-4o-latest": ChatCompletionSampler(
model="chatgpt-4o-latest",
system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
max_tokens=2048,
),
"gpt-4-turbo-2024-04-09_chatgpt": ChatCompletionSampler(
model="gpt-4-turbo-2024-04-09",
system_message=OPENAI_SYSTEM_MESSAGE_CHATGPT,
),
# Claude models:
"claude-3-opus-20240229_empty": ClaudeCompletionSampler(
model="claude-3-opus-20240229",
system_message=CLAUDE_SYSTEM_MESSAGE_LMSYS,
),
}
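# --list-models prints the registry and exits; --model restricts the run to a single sampler.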
if args.list_models:
print("Available models:")
for model_name in models.keys():
print(f" - {model_name}")
return
if args.model:
if args.model not in models:
print(f"Error: Model '{args.model}' not found.")
return
models = {args.model: models[args.model]}
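# Model-graded scoring: gpt-4o grades free-form answers for SimpleQA and BrowseComp.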
grading_sampler = ChatCompletionSampler(model="gpt-4o")
equality_checker = ChatCompletionSampler(model="gpt-4-turbo-preview")
# ^^^ used for fuzzy matching, just for math
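# Map an eval name to a configured Eval instance; debug mode shrinks sample counts and repeats.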
def get_evals(eval_name, debug_mode):
num_examples = (
args.examples if args.examples is not None else (5 if debug_mode else None)
)
# Set num_examples = None to reproduce full evals
match eval_name:
case "mmlu":
return MMLUEval(num_examples=1 if debug_mode else num_examples)
case "math":
return MathEval(
equality_checker=equality_checker,
num_examples=num_examples,
n_repeats=1 if debug_mode else 10,
)
case "gpqa":
return GPQAEval(
n_repeats=1 if debug_mode else 10, num_examples=num_examples
)
case "mgsm":
return MGSMEval(num_examples_per_lang=10 if debug_mode else 250)
case "drop":
return DropEval(
num_examples=10 if debug_mode else num_examples,
train_samples_per_prompt=3,
)
case "humaneval":
return HumanEval(num_examples=10 if debug_mode else num_examples)
case "simpleqa":
return SimpleQAEval(
grader_model=grading_sampler,
num_examples=10 if debug_mode else num_examples,
)
case "browsecomp":
return BrowseCompEval(
grader_model=grading_sampler,
num_examples=10 if debug_mode else num_examples,
)
case _:
raise ValueError(f"Unrecognized eval type: {eval_name}")
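# Instantiate every eval up front so a single run covers the full suite.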
evals = {
eval_name: get_evals(eval_name, args.debug)
for eval_name in ["simpleqa", "mmlu", "math", "gpqa", "mgsm", "drop", "humaneval", "browsecomp"]
}
debug_suffix = "_DEBUG" if args.debug else ""
print(f"Running evals: {list(evals.keys())} (debug suffix: {debug_suffix!r})")
mergekey2resultpath = {}
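# Run every (model, eval) pair, writing an HTML report and a JSON metrics file to /tmp.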
for model_name, sampler in models.items():
for eval_name, eval_obj in evals.items():
result = eval_obj(sampler)
# ^^^ calling an eval object with a sampler runs it and returns the result
file_stem = f"{eval_name}_{model_name}"
report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
print(f"Writing report to {report_filename}")
with open(report_filename, "w") as fh:
fh.write(common.make_report(result))
metrics = result.metrics | {"score": result.score}
print(metrics)
result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
with open(result_filename, "w") as f:
f.write(json.dumps(metrics, indent=2))
print(f"Writing results to {result_filename}")
mergekey2resultpath[file_stem] = result_filename
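# Collapse the per-run JSON files into one row per model and one column per eval.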
merge_metrics = []
for eval_model_name, result_filename in mergekey2resultpath.items():
try:
with open(result_filename) as f:
    result = json.load(f)
except Exception as e:
print(e, result_filename)
continue
result = result.get("f1_score", result.get("score", None))
eval_name = eval_model_name[: eval_model_name.find("_")]
model_name = eval_model_name[eval_model_name.find("_") + 1 :]
merge_metrics.append(
{"eval_name": eval_name, "model_name": model_name, "metric": result}
)
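# Pivot to a model x eval score table for display.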
merge_metrics_df = pd.DataFrame(merge_metrics).pivot(
index=["model_name"], columns="eval_name"
)
print("\nAll results: ")
print(merge_metrics_df.to_markdown())
return merge_metrics
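

# Standard entry point (assumed; this excerpt shows only main()).
if __name__ == "__main__":
    main()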