browsecomp_eval.py [108:121]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                score = is_correct

                # Create HTML for each sample result
                html = common.jinja_env.from_string(common.HTML_JINJA).render(
                    prompt_messages=prompt_messages,
                    next_message=dict(content=response_text, role="assistant"),
                    score=score,
                    correct_answer=row["answer"],
                    extracted_answer=response_text,
                )
                convo = prompt_messages + [dict(content=response_text, role="assistant")]
                return SingleEvalResult(html=html, score=score, convo=convo, metrics={
                    "is_correct": is_correct,
                    "is_incorrect": is_incorrect,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



simpleqa_eval.py [140:153]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                score = is_correct

                # Create HTML for each sample result
                html = common.jinja_env.from_string(common.HTML_JINJA).render(
                    prompt_messages=prompt_messages,
                    next_message=dict(content=response_text, role="assistant"),
                    score=score,
                    correct_answer=row["answer"],
                    extracted_answer=response_text,
                )
                convo = prompt_messages + [dict(content=response_text, role="assistant")]
                return SingleEvalResult(html=html, score=score, convo=convo, metrics={
                    "is_correct": is_correct,
                    "is_incorrect": is_incorrect,
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
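

Both excerpts are line-for-line identical, so the result-building logic is a natural candidate for a shared helper that both evals call. Below is a minimal sketch of such a helper; the function name `build_single_eval_result`, its signature, and the import paths are assumptions for illustration, not code from either file, and the metrics dict includes only the keys visible in the excerpts above (both excerpts cut off mid-dict, so the originals may define additional metrics).

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Hypothetical shared builder for the duplicated block above.
# Assumes the same `common` module (jinja_env, HTML_JINJA) and the
# SingleEvalResult type that both eval files already use; the import
# paths below are guesses and may need adjusting to the real layout.
from . import common
from .types import SingleEvalResult


def build_single_eval_result(
    prompt_messages: list[dict],
    response_text: str,
    row: dict,
    is_correct: bool,
    is_incorrect: bool,
) -> SingleEvalResult:
    score = is_correct

    # Render the per-sample HTML exactly as both excerpts do today.
    html = common.jinja_env.from_string(common.HTML_JINJA).render(
        prompt_messages=prompt_messages,
        next_message=dict(content=response_text, role="assistant"),
        score=score,
        correct_answer=row["answer"],
        extracted_answer=response_text,
    )
    convo = prompt_messages + [dict(content=response_text, role="assistant")]
    return SingleEvalResult(html=html, score=score, convo=convo, metrics={
        # Only the metrics visible in the excerpts; the originals may add more.
        "is_correct": is_correct,
        "is_incorrect": is_incorrect,
    })
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Each call site would then shrink to a single line, e.g. `return build_single_eval_result(prompt_messages, response_text, row, is_correct, is_incorrect)`, keeping the HTML rendering and the metrics schema in one place.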