2_eval-design-ptn/02_azure-evaluation-sdk/CustomRetrievalEvaluator/_custom_retrieval.py
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import logging
import math
import os
import re
from typing import Dict, List, TypeVar, Union

from typing_extensions import overload, override

from promptflow.core import AsyncPrompty

from azure.ai.evaluation._common.constants import PROMPT_BASED_REASON_EVALUATORS
from azure.ai.evaluation._common.utils import (
    construct_prompty_model_config,
    parse_quality_evaluator_reason_score,
    validate_model_config,
)
from azure.ai.evaluation._evaluators._common._base_prompty_eval import PromptyEvaluatorBase
from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
from azure.ai.evaluation._model_configurations import Conversation

logger = logging.getLogger(__name__)


class CustomRetrievalEvaluator(PromptyEvaluatorBase[Union[str, float]]):
"""
Evaluates retrieval score for a given query and context or a multi-turn conversation, including reasoning.
The retrieval measure assesses the AI system's performance in retrieving information
for additional context (e.g. a RAG scenario).
Retrieval scores range from 1 to 5, with 1 being the worst and 5 being the best.
High retrieval scores indicate that the AI system has successfully extracted and ranked
the most relevant information at the top, without introducing bias from external knowledge
and ignoring factual correctness. Conversely, low retrieval scores suggest that the AI system
has failed to surface the most relevant context chunks at the top of the list
and/or introduced bias and ignored factual correctness.
:param model_config: Configuration for the Azure OpenAI model.
:type model_config: Union[~azure.ai.evaluation.AzureOpenAIModelConfiguration,
~azure.ai.evaluation.OpenAIModelConfiguration]
:return: A function that evaluates and generates metrics for "chat" scenario.
:rtype: Callable
.. admonition:: Example:
.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START retrieval_evaluator]
:end-before: [END retrieval_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a RetrievalEvaluator.
.. note::
To align with our support of a diverse set of models, an output key without the `gpt_` prefix has been added.
To maintain backwards compatibility, the old key with the `gpt_` prefix is still be present in the output;
however, it is recommended to use the new key moving forward as the old key will be deprecated in the future.
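
    .. admonition:: Usage sketch

        A minimal, illustrative sketch of calling this evaluator directly (not taken from the
        shipped samples); the endpoint, deployment, and key values below are placeholders to
        replace with your own Azure OpenAI configuration:

        .. code-block:: python

            from azure.ai.evaluation import AzureOpenAIModelConfiguration

            # Placeholder model configuration; fill in your own endpoint/deployment/key.
            model_config = AzureOpenAIModelConfiguration(
                azure_endpoint="https://<your-resource>.openai.azure.com",
                azure_deployment="<your-deployment>",
                api_key="<your-api-key>",
            )

            evaluator = CustomRetrievalEvaluator(model_config)
            result = evaluator(
                query="What is the capital of France?",
                context="Paris is the capital and largest city of France.",
            )
            # Expected keys: "custom-retrieval", "gpt_custom-retrieval",
            # and "custom-retrieval_reason".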
"""
_PROMPTY_FILE = "custom-retrieval.prompty"
_RESULT_KEY = "custom-retrieval"
id = "test"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

    @override
    def __init__(self, model_config):  # pylint: disable=super-init-not-called
        # Resolve the prompty file that lives next to this module and hand everything
        # off to the prompty-based base evaluator.
        current_dir = os.path.dirname(__file__)
        prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
        super().__init__(model_config=model_config, prompty_file=prompty_path, result_key=self._RESULT_KEY)

    @overload
    def __call__(
        self,
        *,
        query: str,
        context: str,
    ) -> Dict[str, Union[str, float]]:
        """Evaluates retrieval for a given query and context.

        :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
        :paramtype query: str
        :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
        :paramtype context: str
        :return: The retrieval score.
        :rtype: Dict[str, Union[str, float]]
        """

    @override
    async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # type: ignore[override]
        """Do a retrieval evaluation.

        :param eval_input: The input to the evaluator. Expected to contain
            whatever inputs are needed for the _flow method, including context
            and other fields depending on the child class.
        :type eval_input: Dict
        :return: The evaluation result.
        :rtype: Dict
        """
if "query" not in eval_input and "response" not in eval_input:
raise EvaluationException(
message="Only text conversation inputs are supported.",
internal_message="Only text conversation inputs are supported.",
blame=ErrorBlame.USER_ERROR,
category=ErrorCategory.INVALID_VALUE,
target=ErrorTarget.CONVERSATION,
)
        llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)

        score = math.nan
        if llm_output:
            # Parse out score and reason from evaluators known to possess them.
            if self._result_key in ["custom-retrieval", "coherence", "relevance", "retrieval", "groundedness", "fluency"]:
                score, reason = parse_quality_evaluator_reason_score(llm_output)
                return {
                    self._result_key: float(score),
                    f"gpt_{self._result_key}": float(score),
                    f"{self._result_key}_reason": reason,
                }
            # Fall back to the first digit found in the raw LLM output.
            match = re.search(r"\d", llm_output)
            if match:
                score = float(match.group())
                return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}
        # No parseable score: return NaN for the numeric keys.
        return {self._result_key: float(score), f"gpt_{self._result_key}": float(score)}

    @overload
    def __call__(
        self,
        *,
        conversation: Conversation,
    ) -> Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]:
        """Evaluates retrieval for a multi-turn conversation. If the conversation has more than one turn,
        the evaluator will aggregate the results of each turn.

        :keyword conversation: The conversation to be evaluated.
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The retrieval scores for the conversation, including per-turn results.
        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
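
        .. admonition:: Example (illustrative)

            A minimal sketch of a conversation payload, assuming the conversation follows the
            ``~azure.ai.evaluation.Conversation`` shape with the retrieved context attached to
            the assistant turn:

            .. code-block:: python

                conversation = {
                    "messages": [
                        {"role": "user", "content": "What is the capital of France?"},
                        {
                            "role": "assistant",
                            "content": "Paris is the capital of France.",
                            "context": "Paris is the capital and largest city of France.",
                        },
                    ]
                }
                result = evaluator(conversation=conversation)
                # Returns the aggregated score plus per-turn results.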
"""

    @override
    def __call__(self, *args, **kwargs):  # pylint: disable=docstring-missing-param
        """Evaluates the retrieval score for a chat scenario. Accepts either a query and context for a single
        evaluation, or a conversation for a multi-turn evaluation. If the conversation has more than one turn,
        the evaluator will aggregate the results of each turn.

        :keyword query: The query to be evaluated. Mutually exclusive with the `conversation` parameter.
        :paramtype query: Optional[str]
        :keyword context: The context to be evaluated. Mutually exclusive with the `conversation` parameter.
        :paramtype context: Optional[str]
        :keyword conversation: The conversation to be evaluated.
        :paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
        :return: The scores for the chat scenario.
        :rtype: Dict[str, Union[float, Dict[str, List[Union[str, float]]]]]
        """
return super().__call__(*args, **kwargs)