k8s-bench/eval.go

// Copyright 2025 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "context" "errors" "fmt" "io" "os" "os/exec" "path/filepath" "strings" "github.com/GoogleCloudPlatform/kubectl-ai/k8s-bench/pkg/model" "github.com/GoogleCloudPlatform/kubectl-ai/pkg/journal" "k8s.io/klog/v2" "sigs.k8s.io/yaml" ) func runEvaluation(ctx context.Context, config EvalConfig) error { if config.OutputDir == "" { return fmt.Errorf("must set OutputDir") } tasks, err := loadTasks(config) if err != nil { return fmt.Errorf("failed to load tasks: %w", err) } var allResults []model.TaskResult for taskID, task := range tasks { fmt.Printf("Evaluating task: %s\n", taskID) for _, llmConfig := range config.LLMConfigs { taskOutputDir := "" if config.OutputDir != "" { taskOutputDir = filepath.Join(config.OutputDir, taskID, llmConfig.ID) if err := os.MkdirAll(taskOutputDir, 0755); err != nil { return fmt.Errorf("creating directory %q: %w", taskOutputDir, err) } } var log io.Writer if taskOutputDir != "" { logPath := filepath.Join(taskOutputDir, "log.txt") logFile, err := os.Create(logPath) if err != nil { return fmt.Errorf("creating log file %q: %w", logPath, err) } defer logFile.Close() log = logFile } result := evaluateTask(ctx, config, taskID, task, llmConfig, log) if taskOutputDir != "" { if err := writeToYAMLFile(filepath.Join(taskOutputDir, "results.yaml"), result); err != nil { return fmt.Errorf("writing results to file: %w", err) } } allResults = append(allResults, result) } } printResults(allResults) return nil } // writeToYAMLFile will encode the specified object as yaml, and write it to the file. func writeToYAMLFile(p string, obj any) error { data, err := yaml.Marshal(obj) if err != nil { return fmt.Errorf("marshaling to yaml: %w", err) } if err := os.WriteFile(p, data, 0644); err != nil { return fmt.Errorf("writing to file %q: %w", p, err) } return nil } func loadTasks(config EvalConfig) (map[string]Task, error) { tasks := make(map[string]Task) entries, err := os.ReadDir(config.TasksDir) if err != nil { return nil, err } for _, entry := range entries { if !entry.IsDir() { continue } taskID := entry.Name() if config.TaskPattern != "" && !strings.Contains(taskID, config.TaskPattern) { continue } taskFile := filepath.Join(config.TasksDir, taskID, "task.yaml") data, err := os.ReadFile(taskFile) if err != nil { return nil, fmt.Errorf("failed to read task file %s: %w", taskFile, err) } var task Task if err := yaml.Unmarshal(data, &task); err != nil { return nil, fmt.Errorf("failed to parse task file %s: %w", taskFile, err) } // Skip disabled tasks if task.Disabled { fmt.Printf("Skipping disabled task: %s\n", taskID) continue } tasks[taskID] = task } return tasks, nil } func evaluateTask(ctx context.Context, config EvalConfig, taskID string, task Task, llmConfig model.LLMConfig, log io.Writer) model.TaskResult { result := model.TaskResult{ Task: taskID, LLMConfig: llmConfig, } taskOutputDir := filepath.Join(config.OutputDir, taskID, llmConfig.ID) x := &TaskExecution{ AgentBin: config.AgentBin, kubeConfig: config.KubeConfig, result: &result, llmConfig: llmConfig, log: log, task: &task, taskID: taskID, taskOutputDir: taskOutputDir, } taskDir := filepath.Join(config.TasksDir, taskID) taskDirAbs, err := filepath.Abs(taskDir) if err != nil { result.Result = "fail" result.Error = err.Error() return result } taskDir = taskDirAbs x.taskDir = taskDir defer func() { if err := x.runCleanup(ctx); err != nil { fmt.Printf("Warning: cleanup failed for task %s: %v\n", taskID, err) } }() if err := x.runSetup(ctx); err != nil { // Unexpected error result.Error = err.Error() return result } // Run the agent if err := x.runAgent(ctx); err != nil { // Unexpected error result.Error = err.Error() return result } // Run verifier if specified if task.Verifier != "" { verifierPath := filepath.Join(taskDir, task.Verifier) cmd := exec.CommandContext(ctx, verifierPath) cmd.Env = append(os.Environ(), fmt.Sprintf("KUBECONFIG=%s", x.kubeConfig)) fmt.Printf("\nRunning verifier for task %s\n", taskID) err := x.runCommand(cmd) if err == nil { result.Result = "success" } else if _, ok := err.(*exec.ExitError); ok { // "Normal" script failure result.Result = "fail" } else { // Unexpected error result.Error = err.Error() } } return result } type TaskExecution struct { // kubeConfig is the path to the kubeconfig file we should use. // It will be created in IsolationModeCluster kubeConfig string // AgentBin holds the path to the agent to execute AgentBin string llmConfig model.LLMConfig result *model.TaskResult log io.Writer task *Task taskID string taskDir string // taskOutputDir is where we can create artifacts or write logs while executing the task taskOutputDir string // cleanupFunctions are a set of cleanupFunctions we run to undo anything we ran cleanupFunctions []func() error } func (x *TaskExecution) runSetup(ctx context.Context) error { log := klog.FromContext(ctx) // Create cluster if requested if x.task.Isolation == IsolationModeCluster { kubeconfigPath := filepath.Join(x.taskDir, "kubeconfig.yaml") x.kubeConfig = kubeconfigPath clusterName := fmt.Sprintf("k8s-bench-%s", x.taskID) log.Info("creating kind cluster", "name", clusterName) args := []string{ "kind", "create", "cluster", "--name", clusterName, "--wait", "5m", "--kubeconfig", kubeconfigPath, } cmd := exec.CommandContext(ctx, args[0], args[1:]...) cmd.Dir = x.taskDir x.cleanupFunctions = append(x.cleanupFunctions, func() error { args := []string{ "kind", "delete", "cluster", "--name", clusterName, "--kubeconfig", kubeconfigPath, } cmd := exec.CommandContext(ctx, args[0], args[1:]...) cmd.Dir = x.taskDir return x.runCommand(cmd) }) if err := x.runCommand(cmd); err != nil { return err } } // Run setup if specified if x.task.Setup != "" { setupPath := filepath.Join(x.taskDir, x.task.Setup) cmd := exec.CommandContext(ctx, setupPath) cmd.Dir = x.taskDir cmd.Env = append(os.Environ(), fmt.Sprintf("KUBECONFIG=%s", x.kubeConfig)) if err := x.runCommand(cmd); err != nil { return err } } return nil } func (x *TaskExecution) runCleanup(ctx context.Context) error { var errs []error // Run cleanup if specified if x.task.Cleanup != "" { cleanupPath := filepath.Join(x.taskDir, x.task.Cleanup) cmd := exec.CommandContext(ctx, cleanupPath) cmd.Dir = x.taskDir cmd.Env = append(os.Environ(), fmt.Sprintf("KUBECONFIG=%s", x.kubeConfig)) if err := x.runCommand(cmd); err != nil { fmt.Printf("Warning: cleanup failed for task %s: %v\n", x.taskID, err) } } for _, cleanup := range x.cleanupFunctions { if err := cleanup(); err != nil { errs = append(errs, err) } } return errors.Join(errs...) } func (x *TaskExecution) runAgent(ctx context.Context) error { tracePath := filepath.Join(x.taskOutputDir, "trace.yaml") args := []string{ "--kubeconfig", x.kubeConfig, "--llm-provider", x.llmConfig.ProviderID, fmt.Sprintf("--enable-tool-use-shim=%t", x.llmConfig.EnableToolUseShim), fmt.Sprintf("--quiet=%t", x.llmConfig.Quiet), "--model", x.llmConfig.ModelID, "--trace-path", tracePath, "--skip-permissions", } stdinReader, stdinWriter := io.Pipe() cmd := exec.CommandContext(ctx, x.AgentBin, args..., ) cmd.Stdin = stdinReader cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if x.log != nil { cmd.Stdout = io.MultiWriter(cmd.Stdout, x.log) cmd.Stderr = io.MultiWriter(cmd.Stderr, x.log) } cmd.Env = append(os.Environ(), fmt.Sprintf("KUBECONFIG=%s", x.kubeConfig)) go func() { // TODO: Wait for idle between sending steps? for _, step := range x.task.Script { fmt.Fprintf(stdinWriter, "%s\n", step.Prompt) } stdinWriter.Close() }() if err := cmd.Run(); err != nil { return err } // Run expectations if specified if len(x.task.Expect) != 0 { events, err := journal.ParseEventsFromFile(tracePath) if err != nil { return err } else { var lastEvent *journal.Event for _, event := range events { if event.Action == journal.ActionUIRender { lastEvent = event } } if lastEvent == nil { x.result.AddFailure("did not found ui.render event in trace") } else { lastOutput, ok := lastEvent.GetString("text") if !ok { x.result.AddFailure("did not found 'text' key in event %+v", lastEvent) } for _, expect := range x.task.Expect { if expect.Contains != "" { if !strings.Contains(lastOutput, expect.Contains) { x.result.AddFailure("expected value %q not found in output %q", expect.Contains, lastOutput) } } } } } } return nil } func (x *TaskExecution) runCommand(cmd *exec.Cmd) error { fmt.Printf("\nRunning command: %s\n", strings.Join(cmd.Args, " ")) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if x.log != nil { cmd.Stdout = io.MultiWriter(cmd.Stdout, x.log) cmd.Stderr = io.MultiWriter(cmd.Stderr, x.log) } if err := cmd.Run(); err != nil { return fmt.Errorf("running command %v: %w", strings.Join(cmd.Args, " "), err) } return nil } func printResults(allResults []model.TaskResult) { fmt.Println("\nEvaluation Results:") fmt.Println("==================") for _, result := range allResults { fmt.Printf("\nTask: %s\n", result.Task) fmt.Printf(" LLM Config: %+v\n", result.LLMConfig) fmt.Printf(" %v\n", result.Result) if result.Error != "" { fmt.Printf(" Error: %s\n", result.Error) } } }

k8s-bench/eval.go (311 lines of code) (raw):