in src/lighteval/metrics/llm_as_judge.py [0:0]
def __lazy_load_client(self): # noqa: C901
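    """Lazily build the judge client for the configured backend.

    Heavy dependencies (openai, vllm, transformers, ...) are only imported on
    first use, and the method returns the backend-specific callable that
    actually runs the prepared judge prompts.
    """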
    match self.backend:
        # Both the "openai" and "tgi" backends speak the OpenAI-compatible API,
        # so they share a single case; only the client configuration differs
        # (API key for OpenAI, base URL for a TGI endpoint).
        case "openai" | "tgi":
            if not is_openai_available():
                raise RuntimeError("The openai package is required for the 'openai' and 'tgi' backends but is not available.")
            if self.client is None:
                from openai import OpenAI

                # When a custom URL is set (e.g. a TGI endpoint), requests go to that
                # base URL and no API key is sent.
                self.client = OpenAI(
                    api_key=self.api_key if self.url is None else None, base_url=self.url if self.url else None
                )
            return self.__call_api_parallel
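        # litellm routes the same chat-completion call through many hosted providers.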
        case "litellm":
            if not is_litellm_available():
                raise RuntimeError("litellm is not available.")
            return self.__call_litellm
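        # vLLM runs the judge model locally; the engine is built once and cached in self.pipe.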
        case "vllm":
            if not is_vllm_available():
                raise RuntimeError("vllm is not available.")
            if self.pipe is None:
                from vllm import LLM, SamplingParams
                from vllm.transformers_utils.tokenizer import get_tokenizer

                # Decoding and engine settings for the local judge model are fixed here.
                self.sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=self.max_tokens)
                self.tokenizer = get_tokenizer(self.model, tokenizer_mode="auto")
                self.pipe = LLM(model=self.model, max_model_len=2048, gpu_memory_utilization=0.5, dtype="float16")
            return self.__call_vllm
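        # Local generation through a plain transformers text-generation pipeline, also cached in self.pipe.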
        case "transformers":
            if self.pipe is None:
                import torch
                from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

                # Load the judge model on GPU in float16 and wrap it in a text-generation pipeline.
                transformers_model = AutoModelForCausalLM.from_pretrained(
                    self.model, torch_dtype=torch.float16, trust_remote_code=False, device_map="cuda"
                )
                tokenizer = AutoTokenizer.from_pretrained(self.model)
                self.pipe = pipeline(
                    "text-generation",
                    model=transformers_model,
                    tokenizer=tokenizer,
                    max_new_tokens=self.max_tokens,
                )
            return self.__call_transformers
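        # Hugging Face Inference Providers, reached through the async hub client.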
        case "inference-providers":
            from huggingface_hub import AsyncInferenceClient

            # Async client so that many judge requests can be awaited concurrently.
            self.client = AsyncInferenceClient(token=self.api_key, base_url=self.url, provider=self.hf_provider)
            return self.__call_hf_inference_async
        case _:
            raise ValueError(f"Unsupported backend: {self.backend}")
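A minimal caller sketch for orientation (the method name _judge_batch and the prompts argument are hypothetical, not part of this file; only the lazily returned callable is real): inside the same class, the backend is resolved once and the returned callable is applied to the prepared judge prompts.

def _judge_batch(self, prompts):
    # Hypothetical helper, illustrative only: resolve the backend on first use,
    # then run every prepared judge prompt through the returned callable.
    judge_fn = self.__lazy_load_client()  # e.g. __call_api_parallel, __call_vllm, ...
    return judge_fn(prompts)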