func runEvals()

in k8s-bench/main.go [146:225]


// runEvals implements the 'run' subcommand: it parses command-line flags into
// an EvalConfig, builds one model.LLMConfig per model to evaluate, and passes
// the resulting configuration to runEvaluation.
//
// When --models is not given, the built-in default provider/model table is
// used and the value of --llm-provider is not consulted. When --models is
// given, every listed model is paired with the single provider named by
// --llm-provider.
//
// It returns an error if the kubeconfig path cannot be expanded, if --models
// is set without a provider or without any non-empty model name, or if
// runEvaluation fails.
func runEvals(ctx context.Context) error {
	config := EvalConfig{
		TasksDir: "./tasks",
	}

	// Set custom usage for 'run' subcommand
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %s run [options]\n\n", os.Args[0])
		fmt.Fprintf(os.Stderr, "Run K8s-bench evaluation benchmarks.\n\n")
		fmt.Fprintf(os.Stderr, "Options:\n")
		flag.PrintDefaults()
	}

	llmProvider := "gemini"
	modelList := ""
	defaultKubeConfig := "~/.kube/config"
	enableToolUseShim := true
	quiet := true

	flag.StringVar(&config.TasksDir, "tasks-dir", config.TasksDir, "Directory containing evaluation tasks")
	flag.StringVar(&config.KubeConfig, "kubeconfig", config.KubeConfig, "Path to kubeconfig file")
	flag.StringVar(&config.TaskPattern, "task-pattern", config.TaskPattern, "Pattern to filter tasks (e.g. 'pod' or 'redis')")
	flag.StringVar(&config.AgentBin, "agent-bin", config.AgentBin, "Path to kubernetes agent binary")
	flag.StringVar(&llmProvider, "llm-provider", llmProvider, "Specific LLM provider to evaluate (e.g. 'gemini' or 'ollama')")
	flag.StringVar(&modelList, "models", modelList, "Comma-separated list of models to evaluate (e.g. 'gemini-1.0,gemini-2.0')")
	flag.BoolVar(&enableToolUseShim, "enable-tool-use-shim", enableToolUseShim, "Enable tool use shim")
	flag.BoolVar(&quiet, "quiet", quiet, "Quiet mode (non-interactive mode)")
	flag.StringVar(&config.OutputDir, "output-dir", config.OutputDir, "Directory to write results to")
	flag.Parse()

	// The kubeconfig default is applied after parsing (rather than as the
	// flag default) so the unexpanded "~" path does not become the flag's
	// registered default value.
	if config.KubeConfig == "" {
		config.KubeConfig = defaultKubeConfig
	}

	expandedKubeconfig, err := expandPath(config.KubeConfig)
	if err != nil {
		return fmt.Errorf("failed to expand kubeconfig path %q: %w", config.KubeConfig, err)
	}
	config.KubeConfig = expandedKubeconfig

	// Provider ID -> model IDs to evaluate. Starts as the built-in defaults
	// and is replaced wholesale when --models is supplied.
	providerModels := map[string][]string{
		"gemini": {"gemini-2.5-pro-preview-03-25"},
	}
	if modelList != "" {
		if llmProvider == "" {
			return fmt.Errorf("--llm-provider is required when --models is specified")
		}

		// Trim each entry and drop empty ones so inputs such as "a, b" or
		// "a,,b," do not produce model IDs with stray whitespace or empty
		// model IDs.
		var modelIDs []string
		for _, entry := range strings.Split(modelList, ",") {
			if entry = strings.TrimSpace(entry); entry != "" {
				modelIDs = append(modelIDs, entry)
			}
		}
		if len(modelIDs) == 0 {
			return fmt.Errorf("--models %q does not contain any model names", modelList)
		}

		providerModels = map[string][]string{
			llmProvider: modelIDs,
		}
	}

	// The shim label is the same for every configuration, so compute it once.
	toolUseShimStr := "shim_disabled"
	if enableToolUseShim {
		toolUseShimStr = "shim_enabled"
	}

	for llmProviderID, modelIDs := range providerModels {
		for _, modelID := range modelIDs {
			id := fmt.Sprintf("%s-%s-%s", toolUseShimStr, llmProviderID, modelID)
			config.LLMConfigs = append(config.LLMConfigs, model.LLMConfig{
				ID:                id,
				ProviderID:        llmProviderID,
				ModelID:           modelID,
				EnableToolUseShim: enableToolUseShim,
				Quiet:             quiet,
			})
		}
	}

	if err := runEvaluation(ctx, config); err != nil {
		return fmt.Errorf("running evaluation: %w", err)
	}

	return nil
}