def evaluation_prompt()

in src/llm_judge.py


    def evaluation_prompt(self, query, results, custom_prompt=None):
        """Ask the judge model whether a retrieved result is relevant to the search query."""
        tokenizer = self.tokenizer
        model = self.model
        prompt = f'''Given the retrieved result's URL metadata below, is the website primarily about the query, or strongly relevant to it?

        # Search Query: {query} 
        
        # Retrieved result: {results}
        
        # Evaluation 
        Based on your reasoning, determine a relevance score between 0 and 1 (where 0 is not relevant, and 1 is highly relevant). Then, based on this score, assign a binary rating of 0 = not relevant or 1 = relevant. 

        # Output format (as a dictionary)
         search_query: <search query>,
         retrieved_result: <retrieved result>,
         relevance_score: <score>,
         binary_decision: 0 or 1,
         decision_reason: <reason for decision>
         
         '''
        # A caller-supplied prompt overrides the default template above.
        if custom_prompt:
            print("Using custom prompt")
            prompt = custom_prompt
   
        messages = [
            {"role": "system", "content": "You a judge who determines whether a returned web page is relevant to a user search query. The system cannot answer questions directly, only return relevant web pages."},
            {"role": "user", "content": prompt},
        ]

        # Build a text-generation pipeline around the judge's model and tokenizer
        # (requires `from transformers import pipeline` at module level).
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )

        generation_args = {
            "max_new_tokens": 600,      # cap on the length of the judgement output
            "return_full_text": False,  # return only the completion, not the prompt
            "temperature": 0.0,         # ignored under greedy decoding, kept for clarity
            "do_sample": False,         # greedy decoding for deterministic judgements
        }

        # Run the chat-formatted messages through the pipeline and log the judgement.
        output = pipe(messages, **generation_args)
        print(output[0]['generated_text'])
        return output
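
A minimal usage sketch follows. The wrapper class name (`LLMJudge`), its constructor signature, and the model checkpoint are assumptions for illustration; the real class only needs to expose `self.model` and `self.tokenizer` and define `evaluation_prompt` as above.

# Usage sketch: `LLMJudge`, its constructor, and the checkpoint are assumptions;
# only the call to evaluation_prompt() mirrors the method documented above.
from src.llm_judge import LLMJudge  # hypothetical class exposing self.model / self.tokenizer

judge = LLMJudge(model_name="microsoft/Phi-3-mini-4k-instruct")  # illustrative checkpoint
result = judge.evaluation_prompt(
    query="best hiking trails near Seattle",
    results="https://example.com/trails | Seattle Hiking Guide | Top 10 trails near the city",
)
# result[0]['generated_text'] holds the judge's dictionary-formatted relevance decision.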