in 5-4o_fine_tuning/eval.py [0:0]
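# Module-level imports inferred from the calls made inside main(); they are assumed to
# sit at the top of eval.py, alongside the helpers process_file, clean_filename and
# create_log_file, which are defined elsewhere in this file.
import argparse
import datetime
import multiprocessing
from functools import partial

from datasets import load_dataset
from tqdm import tqdm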
def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Run static analysis eval")
    parser.add_argument("--model", type=str, default="gpt-4o-mini",
                        help="OpenAI model name (base or fine-tuned)")
    parser.add_argument("--n_shot", type=int, default=0,
                        help="Number of examples to include for few-shot prompting")
    parser.add_argument("--use_similarity", action="store_true",
                        help="Enable similarity-based retrieval of dataset examples")
    args = parser.parse_args()

    model_name = args.model
    n_shot = args.n_shot
    use_similarity = args.use_similarity
    # Load the eval dataset from the Hugging Face Hub
    eval_dataset = load_dataset("patched-codes/static-analysis-eval",
                                split="train", download_mode="force_redownload")
    data = [{"file_name": item["file_name"], "source": item["source"],
             "cwe": item["cwe"]} for item in eval_dataset]

    # Shared list so worker processes can record which files were fixed
    manager = multiprocessing.Manager()
    fixed_files = manager.list()

    process_func = partial(process_file,
                           fixed_files=fixed_files,
                           model_name=model_name,
                           n_shot=n_shot,
                           use_similarity=use_similarity)

    total_tests = len(data)

    # Run the evals in parallel, leaving two CPU cores free
    with multiprocessing.Pool(processes=max(1, multiprocessing.cpu_count() - 2)) as pool:
        results = list(tqdm(pool.imap(process_func, data), total=total_tests))
    # Aggregate results and log
    passing_tests = sum(results)
    score = passing_tests / total_tests * 100

    sanitized_model_name = f"{clean_filename(model_name)}-{n_shot}-shot" + (
        "-sim" if use_similarity else "")
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    log_file_name = f"{sanitized_model_name}_{timestamp}.log"

    create_log_file(log_file_name=log_file_name, model_name=model_name, score=score,
                    passing_tests=passing_tests, total_tests=total_tests,
                    fixed_files=fixed_files, n_shot=n_shot,
                    use_similarity=use_similarity)

    print(f"Results for static analysis eval: {score:.2f}%\n"
          f"Log file with results: {log_file_name}")