ideas/product-info/evals.yaml:
matrix:
  # Assistant ID to use for question-answer prediction.
  QUESTION_ANSWER_ASSISTANT_ID: asst_i27tBJfHmVb9sveuN9Lw9kR1
  # Input file containing the questions to predict and then evaluate.
  QUESTIONS_INPUT_FILE: eval-questions.yaml
  # Output files for the question-answer prediction.
  QUESTION_ANSWER_FILE: output-answer-${{ matrix.__matrix_id__ }}.json
  QUESTION_ANSWER_CHAT_HISTORY_FILE: output-question-answer-chat-history-${{ matrix.__matrix_id__ }}.jsonl
  # Output files for the question-answer evaluation.
  EVAL_SCORE_FILE: output-eval-score-${{ matrix.__matrix_id__ }}.json
  EVAL_SCORE_CHAT_HISTORY_FILE: output-question-answer-eval-score-chat-history-${{ matrix.__matrix_id__ }}.jsonl
  # Average evaluation scores output file and minimum scoring requirements.
  EVAL_ALL_SCORES_FILE: output-eval-all-scores.jsonl
  EVAL_AVG_SCORES_FILE: output-eval-avg-score.json
  EVAL_AVG_MIN_SCORE: 7.5
  EVAL_AVG_MIN_SCORE_PROMPT: |
    The average evaluation score must be greater than or equal to ${{ matrix.EVAL_AVG_MIN_SCORE }}.
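# Note (illustrative, not part of the original config): the ${{ matrix.* }} values above
# are interpolated wherever they are referenced below. ${{ matrix.__matrix_id__ }} is
# presumably a unique per-row identifier, so for the first question the prediction output
# would land in something like output-answer-0.json (the exact numbering is an assumption).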
tests:

# pre-setup (clears the assistant ID, vector store ID, and search index name from the
# config, and removes the accumulated scores file left over from any previous run)
- area: pre-setup
  tags: [before]
  steps:
  - name: start-clean
    bash: |
      ai config --clear assistant.id
      ai config --clear vector.store.id
      ai config --clear search.index.name
      rm ${{ matrix.EVAL_ALL_SCORES_FILE }}
      exit 0
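# Note (illustrative): the `before`/`after` tags presumably run these areas once, before
# and after the matrix-driven tests below. The trailing `exit 0` keeps the cleanup step
# from failing on a fresh run, where `rm` would otherwise return a non-zero exit code
# because the scores file does not exist yet.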
# question-answer prediction and evaluation (repeated for each "row" in the matrix)
- area: question-matrix-processing
  matrix-file: ${{ matrix.QUESTIONS_INPUT_FILE }}
  steps:
  - name: question-answer-prediction
    command: ai chat
    arguments:
      assistant-id: ${{ matrix.QUESTION_ANSWER_ASSISTANT_ID }}
      question: ${{ matrix.question }}
      output-answer: ${{ matrix.QUESTION_ANSWER_FILE }}
      output-chat-history: ${{ matrix.QUESTION_ANSWER_CHAT_HISTORY_FILE }}
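    # Illustrative only -- assuming each `arguments` key maps one-to-one onto an `ai chat`
    # command-line flag, this step is roughly equivalent to running:
    #   ai chat --assistant-id <id> --question "<question>" \
    #     --output-answer <answer-file> --output-chat-history <history-file>
    # (the exact flag names are an assumption based on the argument keys above).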
  - name: question-answer-evaluation
    command: ai chat
    arguments:
      system-prompt: '@eval-answer-system-prompt.md'
      user-prompt: '@eval-answer-user-prompt.md'
      var-q: ${{ matrix.question }} # The question to evaluate; shows up in the user prompt as {q}
      var-truth: ${{ matrix.truth }} # The correct answer to the question; shows up in the system prompt as {truth}
      var-ai: '{@${{ matrix.QUESTION_ANSWER_FILE }}}' # {@file} reads the file content as a variable
      output-answer: ${{ matrix.EVAL_SCORE_FILE }}
      output-add-answer: ${{ matrix.EVAL_ALL_SCORES_FILE }} # Append the evaluation score to the file.
      output-chat-history: ${{ matrix.EVAL_SCORE_CHAT_HISTORY_FILE }}
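    # Illustrative only: each row of the matrix-file is expected to supply the `question`
    # and `truth` fields referenced above, so eval-questions.yaml presumably looks
    # something like this (values are made up):
    #   - question: What is the warranty period for the product?
    #     truth: The product includes a two-year limited warranty.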
# average evaluation score calculation and expectation validation
- area: post-setup
  tags: [after]
  steps:
  - name: eval-average-calculation
    command: ai chat
    arguments:
      built-in-functions: true # Use built-in functions to calculate the average.
system-prompt: "You are a helpful assistant can average numbers between 1 and 10."
      user-prompt: 'Calculate the average of the following numbers: {numbers}'
      var-numbers: '{@${{ matrix.EVAL_ALL_SCORES_FILE }}}' # {@file} reads the file contents.
      output-answer: ${{ matrix.EVAL_AVG_SCORES_FILE }}
    expect: ${{ matrix.EVAL_AVG_MIN_SCORE_PROMPT }}
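    # Illustrative only: each evaluation step appends one score line to
    # output-eval-all-scores.jsonl via output-add-answer, so after three questions the
    # file might contain (made-up values):
    #   8
    #   7
    #   9
    # The assistant then averages them ((8 + 7 + 9) / 3 = 8.0), and `expect` presumably
    # hands the step output plus the EVAL_AVG_MIN_SCORE_PROMPT text to an LLM check,
    # which would pass this run because 8.0 >= 7.5.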