in k8s-bench/main.go [318:576]
// printMarkdownResults renders results as a Markdown report and either writes
// it to resultsFilePath or, when that path is empty, prints it to stdout.
//
// The report contains, in order: a per-model success/fail table, an overall
// summary, detailed per-run tables, and a timestamped footer. When
// config.IgnoreToolUseShim is set the tables are grouped by model only;
// otherwise they are grouped by tool-use-shim status ("shim_enabled" /
// "shim_disabled").
//
// A run counts as a success when its Result contains "success",
// case-insensitively; every other run counts as a failure.
//
// It returns an error only if writing resultsFilePath fails.
func printMarkdownResults(config AnalyzeConfig, results []model.TaskResult, resultsFilePath string) error {
	const (
		shimEnabled  = "shim_enabled"
		shimDisabled = "shim_disabled"
	)

	// tally counts successful and failed runs for one slice of the results.
	type tally struct{ success, fail int }
	// shimModel keys the per-model counts inside one shim group.
	type shimModel struct{ shim, modelID string }

	// Helpers are kept as closures so this function adds no package-level names.
	succeeded := func(r model.TaskResult) bool {
		return strings.Contains(strings.ToLower(r.Result), "success")
	}
	shimLabel := func(r model.TaskResult) string {
		if r.LLMConfig.EnableToolUseShim {
			return shimEnabled
		}
		return shimDisabled
	}
	record := func(t tally, ok bool) tally {
		if ok {
			t.success++
		} else {
			t.fail++
		}
		return t
	}
	emoji := func(ok bool) string {
		if ok {
			return "✅"
		}
		return "❌"
	}
	sortedKeys := func(m map[string]tally) []string {
		keys := make([]string, 0, len(m))
		for k := range m {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		return keys
	}

	// Single pass: classify every run once and feed all the counters the
	// report needs, instead of rescanning results for each table cell.
	var overall tally
	perModel := make(map[string]tally)
	perShim := make(map[string]tally)
	perShimModel := make(map[shimModel]tally)
	// groups holds the rows of the detailed section, keyed by model ID when
	// the shim is ignored and by shim label otherwise.
	groups := make(map[string][]model.TaskResult)
	for _, r := range results {
		ok := succeeded(r)
		modelID, shim := r.LLMConfig.ModelID, shimLabel(r)

		overall = record(overall, ok)
		perModel[modelID] = record(perModel[modelID], ok)
		perShim[shim] = record(perShim[shim], ok)
		key := shimModel{shim: shim, modelID: modelID}
		perShimModel[key] = record(perShimModel[key], ok)

		groupKey := shim
		if config.IgnoreToolUseShim {
			groupKey = modelID
		}
		groups[groupKey] = append(groups[groupKey], r)
	}
	models := sortedKeys(perModel)
	shims := sortedKeys(perShim)

	// Writes to a strings.Builder never fail, so Fprintf errors are not checked.
	var buffer strings.Builder
	writeGroupSummary := func(title string, t tally) {
		total := t.success + t.fail
		fmt.Fprintf(&buffer, "\n**%s Summary**\n\n", title)
		fmt.Fprintf(&buffer, "- Total: %d\n", total)
		fmt.Fprintf(&buffer, "- Success: %d (%d%%)\n", t.success, calculatePercentage(t.success, total))
		fmt.Fprintf(&buffer, "- Fail: %d (%d%%)\n\n", t.fail, calculatePercentage(t.fail, total))
	}

	buffer.WriteString("# K8s-bench Evaluation Results\n\n")

	// --- Model Performance Summary ---
	buffer.WriteString("## Model Performance Summary\n\n")
	if config.IgnoreToolUseShim {
		buffer.WriteString("| Model | Success | Fail |\n")
		buffer.WriteString("|-------|---------|------|\n")
		for _, modelID := range models {
			t := perModel[modelID]
			fmt.Fprintf(&buffer, "| %s | %d | %d |\n", modelID, t.success, t.fail)
		}
		fmt.Fprintf(&buffer, "| **Total** | %d | %d |\n\n", overall.success, overall.fail)
	} else {
		// One Success/Fail column pair per shim status present in the data.
		buffer.WriteString("| Model |")
		for _, shim := range shims {
			fmt.Fprintf(&buffer, " %s Success | %s Fail |", shim, shim)
		}
		buffer.WriteString("\n|-------|")
		for range shims {
			buffer.WriteString("------------|-----------|")
		}
		buffer.WriteString("\n")
		for _, modelID := range models {
			fmt.Fprintf(&buffer, "| %s |", modelID)
			for _, shim := range shims {
				t := perShimModel[shimModel{shim: shim, modelID: modelID}]
				fmt.Fprintf(&buffer, " %d | %d |", t.success, t.fail)
			}
			buffer.WriteString("\n")
		}
		buffer.WriteString("| **Total** |")
		for _, shim := range shims {
			t := perShim[shim]
			fmt.Fprintf(&buffer, " %d | %d |", t.success, t.fail)
		}
		buffer.WriteString("\n\n")
	}

	// --- Overall Summary ---
	totalCount := len(results)
	buffer.WriteString("## Overall Summary\n\n")
	fmt.Fprintf(&buffer, "- Total Runs: %d\n", totalCount)
	fmt.Fprintf(&buffer, "- Overall Success: %d (%d%%)\n", overall.success, calculatePercentage(overall.success, totalCount))
	fmt.Fprintf(&buffer, "- Overall Fail: %d (%d%%)\n\n", overall.fail, calculatePercentage(overall.fail, totalCount))

	// --- Detailed Results ---
	if config.IgnoreToolUseShim {
		for _, modelID := range models {
			rows := groups[modelID]
			// Stable sort: runs of the same task keep their input order, so
			// the report is reproducible for a given input.
			sort.SliceStable(rows, func(i, j int) bool {
				return rows[i].Task < rows[j].Task
			})

			fmt.Fprintf(&buffer, "## Model: %s\n\n", modelID)
			buffer.WriteString("| Task | Provider | Result |\n")
			buffer.WriteString("|------|----------|--------|\n")
			for _, r := range rows {
				fmt.Fprintf(&buffer, "| %s | %s | %s %s |\n",
					r.Task,
					r.LLMConfig.ProviderID,
					emoji(succeeded(r)), r.Result)
			}
			writeGroupSummary(modelID, perModel[modelID])
		}
	} else {
		for _, shim := range shims {
			rows := groups[shim]
			// Order by model, then task; stable so ties keep input order.
			sort.SliceStable(rows, func(i, j int) bool {
				if rows[i].LLMConfig.ModelID != rows[j].LLMConfig.ModelID {
					return rows[i].LLMConfig.ModelID < rows[j].LLMConfig.ModelID
				}
				return rows[i].Task < rows[j].Task
			})

			fmt.Fprintf(&buffer, "## Tool Use: %s\n\n", shim)
			buffer.WriteString("| Task | Provider | Model | Result |\n")
			buffer.WriteString("|------|----------|-------|--------|\n")
			for _, r := range rows {
				fmt.Fprintf(&buffer, "| %s | %s | %s | %s %s |\n",
					r.Task,
					r.LLMConfig.ProviderID,
					r.LLMConfig.ModelID,
					emoji(succeeded(r)), r.Result)
			}
			writeGroupSummary(shim, perShim[shim])
		}
	}

	// --- Footer ---
	buffer.WriteString("---\n\n")
	fmt.Fprintf(&buffer, "_Report generated on %s_\n", time.Now().Format("January 2, 2006 at 3:04 PM"))

	output := buffer.String()

	// With no file path the report itself goes to stdout; with one, only a
	// confirmation line is printed.
	if resultsFilePath == "" {
		fmt.Print(output)
		return nil
	}
	if err := os.WriteFile(resultsFilePath, []byte(output), 0644); err != nil {
		return fmt.Errorf("writing to file %q: %w", resultsFilePath, err)
	}
	fmt.Printf("Results written to %s\n", resultsFilePath)
	return nil
}