skills/classification/evaluation/promptfooconfig.yaml (49 lines of code) (raw):

# promptfoo evaluation config for the classification skill.
# Runs every prompt below against every provider below, over the test cases
# loaded from dataset.csv.
description: 'Classification'

# Prompts defined in the prompts.py file
prompts:
  - prompts.py:simple_classify
  - prompts.py:rag_classify
  - prompts.py:rag_chain_of_thought_classify

# The same model is listed five times; only `temperature` (and the matching
# label) differs between entries, stepping from 0 to 0.8 in increments of 0.2.
providers:
  - id: anthropic:messages:claude-3-haiku-20240307
    label: 'Haiku: T-0.0'
    config:
      max_tokens: 4096
      temperature: 0
  - id: anthropic:messages:claude-3-haiku-20240307
    label: 'Haiku: T-0.2'
    config:
      max_tokens: 4096
      temperature: 0.2
  - id: anthropic:messages:claude-3-haiku-20240307
    label: 'Haiku: T-0.4'
    config:
      max_tokens: 4096
      temperature: 0.4
  - id: anthropic:messages:claude-3-haiku-20240307
    label: 'Haiku: T-0.6'
    config:
      max_tokens: 4096
      temperature: 0.6
  - id: anthropic:messages:claude-3-haiku-20240307
    label: 'Haiku: T-0.8'
    config:
      max_tokens: 4096
      temperature: 0.8

# Read more about why we use defaultTest here:
# https://www.promptfoo.dev/docs/configuration/guide/#tools-and-functions:~:text=Use-,defaultTest,-apply%20a%20transform
defaultTest:
  options:
    # Output transform loaded from transform.py, next to this config.
    transform: file://transform.py
  assert:
    # Passes when the output contains at least one of the category names
    # below (case-insensitive match).
    - type: icontains-any
      value:
        - 'Billing Inquiries'
        - 'Policy Administration'
        - 'Claims Assistance'
        - 'Coverage Explanations'
        - 'Quotes and Proposals'
        - 'Account Management'
        - 'Billing Disputes'
        - 'Claims Disputes'
        - 'Policy Comparisons'
        - 'General Inquiries'

# Test cases are read from this CSV file.
tests: dataset.csv

# Evaluation results are written to this CSV file.
outputPath: ../data/results.csv