project/nanoeval/nanoeval/solvers/computer

from __future__ import annotations

import traceback
from abc import ABC
from datetime import timedelta
from typing import Any, ClassVar, Literal

from pydantic import ConfigDict
from pydantic.v1.json import timedelta_isoformat
from typing_extensions import deprecated

from nanoeval.solvers.computer_tasks._versioning import Migration, VersionedModel
from nanoeval.solvers.computer_tasks.task import Grade


def _pop_legacy_grade(values: dict[str, Any], log_key: str) -> Grade | None:
    """Remove the legacy grading keys from *values* and build the new grade.

    Pops ``values[log_key]`` and ``values["correct"]`` (in that order) and
    returns a ``Grade`` when the popped log is truthy, otherwise ``None``.

    Raises:
        KeyError: if either legacy key is absent from *values*.
        AssertionError: if there is no grader log but ``correct`` is truthy.
    """
    grader_log = values.pop(log_key)
    # Pop unconditionally. The pop must not live inside an ``assert``
    # statement: under ``python -O`` asserts are stripped, which would leave
    # the legacy "correct" key behind in the migrated payload.
    correct = values.pop("correct")
    if grader_log:
        return Grade(score=1 if correct else 0, grader_log=grader_log)
    if correct:
        # Raised explicitly (rather than via ``assert``) so the check also
        # runs under ``-O`` while keeping the original exception type.
        raise AssertionError(
            f"legacy record is marked correct but has an empty {log_key!r}"
        )
    return None


def _step_0_to_1(values: dict[str, Any]) -> dict[str, Any]:
    """Migrate a schema-0 ``Step`` payload to schema 1.

    Replaces the legacy ``correct`` / ``autograder_result`` keys with the
    single ``grade`` key. *values* is mutated in place and returned.
    """
    values["grade"] = _pop_legacy_grade(values, "autograder_result")
    return values


class Step(VersionedModel):
    """
    A single step in the conversation. Contains results from the autograder.

    Extend this class to add more information about the step, e.g. the model's
    output or other metadata you'd like to include in your solver.
    """

    schema_version: int = 1
    _migrations: ClassVar[dict[int, Migration]] = {
        0: _step_0_to_1,
    }

    convo: Any
    # If none, the step wasn't graded.
    grade: Grade | None
    elapsed: timedelta
    type: Literal["step"] = "step"

    @property
    def correct(self) -> bool:
        """Return True only when the step was graded and its score is exactly 1."""
        return self.grade is not None and self.grade.score == 1

    # Serialize ``elapsed`` with pydantic's ISO-format timedelta encoder.
    model_config = ConfigDict(json_encoders={timedelta: timedelta_isoformat})


def _final_result_successful_0_to_1(values: dict[str, Any]) -> dict[str, Any]:
    """Migrate a schema-0 ``FinalResultSuccessful`` payload to schema 1.

    Replaces the legacy ``correct`` / ``grader_log`` keys with the single
    ``grade`` key. *values* is mutated in place and returned.
    """
    # NOTE(review): when the legacy grader log is empty this stores
    # ``grade = None``, but ``FinalResultBase.grade`` is declared as a
    # non-optional ``Grade`` - confirm whether such records can exist and
    # how they should be migrated.
    values["grade"] = _pop_legacy_grade(values, "grader_log")
    return values


class FinalResultBase(ABC, VersionedModel):
    """Fields shared by every final result: a grade and an optional convo."""

    grade: Grade
    convo: Any = None

    @property
    def correct(self) -> bool:  # type: ignore
        """Return True when the grade's score is exactly 1."""
        return self.grade.score == 1


class FinalResultSuccessful(FinalResultBase):
    """Final result whose ``finish_status`` is ``"finished-successfully"``."""

    finish_status: Literal["finished-successfully"] = "finished-successfully"
    max_steps_reached: bool = False
    max_tokens_reached: bool = False
    max_time_reached: bool = False
    type: Literal["final_result_successful"] = "final_result_successful"

    schema_version: int = 1
    _migrations: ClassVar[dict[int, Migration]] = {
        0: _final_result_successful_0_to_1,
    }


def _final_result_with_exception_0_to_1(values: dict[str, Any]) -> dict[str, Any]:
    """Migrate a schema-0 ``FinalResultWithException`` payload to schema 1.

    Replaces the legacy ``correct`` key with a ``grade`` whose grader log is
    empty. *values* is mutated in place and returned.
    """
    values["grade"] = Grade(score=1 if values.pop("correct") else 0, grader_log="")
    return values


class FinalResultWithException(FinalResultBase):
    """Final result recorded together with an exception name and traceback."""

    # How did this episode end? Useful for error tracking.
    finish_status: str
    # The exception that was raised.
    exception: str
    traceback: str
    type: Literal["final_result_exception"] = "final_result_exception"

    @staticmethod
    @deprecated(
        "please use native nanoeval retries using nanoeval.eval.RetryableSystemError instead"
    )
    def from_exception(
        e: BaseException, convo: Any | None, traceback_notes: str = ""
    ) -> "FinalResultWithException":
        """Build a zero-score result describing the exception *e*.

        Args:
            e: The exception to record. Its class name is stored in
                ``exception`` and drives the ``finish_status`` classification.
            convo: Conversation object stored on the result as-is.
            traceback_notes: Extra text appended after the formatted
                traceback, separated by a newline.

        Returns:
            A ``FinalResultWithException`` with ``grade.score == 0``.
        """
        finish_status: Literal[
            "error-timeout",
            "error-model",
            "error-other",
            "error-user-machine-kernel-unresponsive",
        ]

        exception_name = e.__class__.__name__
        # Format the traceback of ``e`` itself. ``traceback.format_exc()``
        # formats whatever exception is *currently being handled*, which is
        # "NoneType: None" when this is called outside an ``except`` block.
        # The output is identical when ``e`` is the exception being handled.
        tb = "".join(traceback.format_exception(type(e), e, e.__traceback__))

        if "TimeoutError" in exception_name:
            finish_status = "error-timeout"
        elif "SamplingError" in exception_name:
            # Agent sampling error
            finish_status = "error-model"
        elif "Kernel didn't respond" in tb:
            finish_status = "error-user-machine-kernel-unresponsive"
        else:
            finish_status = "error-other"

        return FinalResultWithException(
            grade=Grade(score=0, grader_log=""),
            finish_status=finish_status,
            exception=exception_name,
            traceback=tb + "\n" + traceback_notes,
            convo=convo,
        )

    # Migration data
    schema_version: int = 1
    _migrations: ClassVar[dict[int, Migration]] = {
        0: _final_result_with_exception_0_to_1,
    }


FinalResult = FinalResultSuccessful | FinalResultWithException

project/nanoeval/nanoeval/solvers/computer_tasks/steps.py (95 lines of code) (raw):