def _evaluate_relevance_and_accuracy()

in deploy_and_monitor/sm-model_monitor_byoc_llm_monitor/src/components/evaluator.py
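
The method depends on module-level imports and constants that are not shown in this excerpt. The sketch below is a plausible reconstruction, assuming LangChain's ScoreStringEvalChain and the GPT4All wrapper from langchain_community; the constant values are placeholders, and fileDownloader / write_eval_result_file are project helpers whose definitions are not reproduced here.

    # Assumed module-level context (illustrative values, not the repository's definitions)
    import json
    import logging
    import os
    import random
    from json import JSONDecodeError

    from langchain.evaluation import ScoreStringEvalChain      # LLM-as-judge scoring chain
    from langchain_community.llms import GPT4All                # local GPT4All model wrapper

    logger = logging.getLogger(__name__)

    ANSWER_RELEVANCY_MODEL = "mistral-7b-openorca.gguf2.Q4_0.gguf"         # placeholder model file
    RELEVANCE_AND_ACCURACY_EVALUATIONS = ["relevance_and_accuracy_score"]  # fields to average
    REPORT_PATH = "/opt/ml/processing/output"                              # placeholder report directory
    RELEVANCE_AND_ACCURACY_REPORT_FILENAME = "relevance_and_accuracy_report.jsonl"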


    def _evaluate_relevance_and_accuracy(self, dataset_uri: str):
        """
        Evaluates the data for relevance and accuracy using the FMEval library.

        :param dataset_uri: The path to the dataset file.
        :raises: ValueError if the dataset_uri is not a valid string.
        :return: A dictionary containing the evaluation results. If data is empty/malformed, returns an empty dictionary.
        """

        if not isinstance(dataset_uri, str):
            raise ValueError("dataset_uri must be a valid string")
        

        fileDownloader.retrieve_model(ANSWER_RELEVANCY_MODEL) # downloads / loads a 4.66GB LLM
        model = GPT4All(model=ANSWER_RELEVANCY_MODEL, verbose=False, n_batch=128, n_threads=36 if 'DOCKER_CONTAINER' in os.environ else None)
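        # ScoreStringEvalChain prompts the judge LLM to grade each response on a 1-10
        # scale; evaluate_strings() returns a dict whose "reasoning" and "score" keys
        # are consumed below.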
        evaluator_model = ScoreStringEvalChain.from_llm(
            llm=model, verbose=False
        )

        line_count = 0
        try:
            with open(dataset_uri, 'r') as file:
                lines = file.readlines()
        except OSError as e:
            logger.error(f"Could not read file: {e}")
            return {}
        
        if not lines:
            logger.info("No data to evaluate")
            return {}
        
        # Initialize the list of individual response scores and the summed totals (for later averaging)
        results = []
        totals = {field: 0 for field in RELEVANCE_AND_ACCURACY_EVALUATIONS}
        # Randomly sample 10 prompt and responses for evaluation
        if len(lines) <= 10:
            sample_lines = lines
        else:
            sample_lines = random.sample(lines, 10)

        logger.info("Starting evaluation")
        for line in sample_lines:
            try:
                data = json.loads(line)
                line_count += 1
                logger.info(f"Evaluating line: {line_count}")
                
                accuracy_relevance_eval_result = evaluator_model.evaluate_strings(
                    prediction=data["answer"],
                    input=data["content"],
                )

                result_dict = {
                    "prompt": data["content"],
                    "response": data["answer"],
                    "relevance_and_accuracy_analysis": accuracy_relevance_eval_result["reasoning"],
                    "relevance_and_accuracy_score": accuracy_relevance_eval_result["score"],
                }
                # Add all scores for this response to result list and sum total scores
                results.append(result_dict)
                for key, value in result_dict.items():
                    if key in totals:
                        totals[key] += value
            except (KeyError, JSONDecodeError) as e:
                # JSONDecodeError subclasses ValueError, so malformed data must be caught first
                logger.error(f"Data malformed: {e}")
                return {}
            except ValueError as e:
                logger.warning(f"Error evaluating line, continuing: {e}")
                continue

        report_filepath = os.path.join(REPORT_PATH, RELEVANCE_AND_ACCURACY_REPORT_FILENAME)
        write_eval_result_file(report_filepath, results)

        # Return the average of each summed score over successfully evaluated lines
        evaluated_count = len(results)
        return {key: value / evaluated_count if evaluated_count > 0 else 0 for key, value in totals.items()}
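
For context, here is a minimal sketch of how this method might be exercised, assuming the enclosing class is named Evaluator and that each line of the captured dataset is a JSON object with "content" (the prompt) and "answer" (the model response) fields, as the code above reads them; the class name and file path are illustrative.

    # Illustrative driver: builds a small JSONL capture file and averages the scores.
    import json

    sample_records = [
        {"content": "What is Amazon SageMaker Model Monitor?",
         "answer": "It continuously monitors the quality of models deployed to SageMaker endpoints."},
        {"content": "What format is captured inference data stored in?",
         "answer": "Captured requests and responses are written as JSON Lines."},
    ]

    with open("/tmp/captured_data.jsonl", "w") as f:
        for record in sample_records:
            f.write(json.dumps(record) + "\n")

    evaluator = Evaluator()  # assumed name of the class defined in evaluator.py
    averages = evaluator._evaluate_relevance_and_accuracy("/tmp/captured_data.jsonl")
    print(averages)  # e.g. {"relevance_and_accuracy_score": 8.5}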