func evaluateTask()

in k8s-bench/eval.go [137:205]


// evaluateTask runs one benchmark task against one LLM configuration and
// returns the recorded outcome.
//
// The steps are, in order: resolve the task directory to an absolute path, run
// the task setup, run the agent, and, when task.Verifier is non-empty, execute
// the verifier found in the task directory with KUBECONFIG appended to the
// inherited environment.
//
// Outcome encoding in the returned model.TaskResult:
//   - Result "success": the verifier command returned no error.
//   - Result "fail": the verifier returned an *exec.ExitError, or the task
//     directory could not be made absolute (Error is set as well in that case).
//   - Error set and Result not assigned here: setup, the agent, or running the
//     verifier failed with any other error.
//
// When no verifier is configured this function does not assign Result itself.
// Cleanup is deferred once the task directory has been resolved, so it runs on
// every later return path; a cleanup failure is only printed as a warning to
// standard output and is not stored in the result.
func evaluateTask(ctx context.Context, config EvalConfig, taskID string, task Task, llmConfig model.LLMConfig, log io.Writer) model.TaskResult {
	res := model.TaskResult{
		Task:      taskID,
		LLMConfig: llmConfig,
	}

	// The execution shares res through a pointer rather than holding a copy.
	run := &TaskExecution{
		AgentBin:      config.AgentBin,
		kubeConfig:    config.KubeConfig,
		result:        &res,
		llmConfig:     llmConfig,
		log:           log,
		task:          &task,
		taskID:        taskID,
		taskOutputDir: filepath.Join(config.OutputDir, taskID, llmConfig.ID),
	}

	absTaskDir, err := filepath.Abs(filepath.Join(config.TasksDir, taskID))
	if err != nil {
		// Returning here happens before cleanup is registered.
		res.Result = "fail"
		res.Error = err.Error()
		return res
	}
	run.taskDir = absTaskDir

	defer func() {
		cleanupErr := run.runCleanup(ctx)
		if cleanupErr == nil {
			return
		}
		fmt.Printf("Warning: cleanup failed for task %s: %v\n", taskID, cleanupErr)
	}()

	// Setup and agent errors are treated as unexpected: only Error is filled in.
	if setupErr := run.runSetup(ctx); setupErr != nil {
		res.Error = setupErr.Error()
		return res
	}
	if agentErr := run.runAgent(ctx); agentErr != nil {
		res.Error = agentErr.Error()
		return res
	}

	// Nothing left to do when the task declares no verifier.
	if task.Verifier == "" {
		return res
	}

	verifier := exec.CommandContext(ctx, filepath.Join(absTaskDir, task.Verifier))
	verifier.Env = append(os.Environ(), fmt.Sprintf("KUBECONFIG=%s", run.kubeConfig))
	fmt.Printf("\nRunning verifier for task %s\n", taskID)

	switch verifyErr := run.runCommand(verifier).(type) {
	case nil:
		res.Result = "success"
	case *exec.ExitError:
		// The verifier ran and reported failure: a "normal" script failure.
		res.Result = "fail"
	default:
		// Any other error type is unexpected and is recorded without a Result.
		res.Error = verifyErr.Error()
	}

	return res
}