lmms_eval/models/gpt4v.py (99 lines of code) (raw):
from io import BytesIO
from copy import deepcopy
import os
import base64
from typing import List, Tuple
from tqdm import tqdm
import requests as url_requests
import time
import logging
from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model
from lmms_eval import utils
from PIL import Image
# Backend selector read from the environment; defaults to the OpenAI API.
API_TYPE = os.environ.get("API_TYPE", "openai")
# Pause, in seconds, between retries of a failed API request.
NUM_SECONDS_TO_SLEEP = 5
eval_logger = logging.getLogger("lmms-eval")

# Endpoint, key and HTTP headers depend on the selected backend.
# NOTE: any API_TYPE other than "openai" / "azure" leaves API_URL, API_KEY
# and headers undefined.
if API_TYPE == "azure":
    API_URL = os.environ.get("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.environ.get("AZURE_API_KEY", "YOUR_API_KEY")
    # Azure authenticates with a plain "api-key" header.
    headers = {"api-key": API_KEY, "Content-Type": "application/json"}
elif API_TYPE == "openai":
    API_URL = os.environ.get("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")
    # OpenAI authenticates with a bearer token.
    headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}
@register_model("gpt4V")
class GPT4V(lmms):
    """lmms-eval model that answers requests through a GPT-4V chat-completions HTTP API.

    Each request's prompt text and images are packed into a JSON payload and
    POSTed to the module-level ``API_URL`` using the module-level ``headers``.
    Only ``generate_until`` is supported.
    """

    # Image modes the Pillow JPEG writer accepts directly; anything else
    # (e.g. RGBA, LA, P) must be converted before saving as JPEG.
    _JPEG_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

    def __init__(self, **kwargs) -> None:
        """Initialise the model; ``kwargs`` are accepted but not used."""
        super().__init__()
        # Manually set an image token for GPT4V so that we can search for it
        # and split the text and image.
        # Here we just use the same token as llava for convenience.
        self.image_token = "<image>"
        if API_TYPE not in ("openai", "azure"):
            # The module only configures API_URL / headers for these two
            # backends; with anything else every request would fail.
            eval_logger.warning(
                f"Unsupported API_TYPE {API_TYPE!r}: expected 'openai' or 'azure'; "
                "API_URL and headers are not configured and all requests will fail."
            )

    # Function to encode the image
    def encode_image(self, image: Image.Image) -> str:
        """Return ``image`` encoded as JPEG and then base64, as a UTF-8 string.

        Images in a mode that JPEG cannot store (for example RGBA or
        palette images) are converted to RGB first instead of raising
        ``OSError`` on save.
        """
        if image.mode not in self._JPEG_MODES:
            image = image.convert("RGB")
        output_buffer = BytesIO()
        image.save(output_buffer, format="JPEG")
        return base64.b64encode(output_buffer.getvalue()).decode("utf-8")

    def flatten(self, input):
        """Flatten one level of nesting: ``[[a, b], [c]]`` -> ``[a, b, c]``."""
        return [item for sublist in input for item in sublist]

    @staticmethod
    def _image_part(img: str) -> dict:
        """Wrap a base64 JPEG string as an ``image_url`` content part."""
        return {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}

    def _build_messages(self, contexts: str, imgs: List[str]) -> List[dict]:
        """Build the chat ``messages`` list from prompt text and base64 images.

        Without an image token in ``contexts`` a single user message holds the
        text followed by every image.  With image tokens, the text is split on
        the token and each image gets its own user message preceded by the
        text chunk in front of its token; the remaining text goes into a final
        text-only message.  A mismatch between the number of tokens and images
        is tolerated: surplus images are sent without a text chunk, and text
        belonging to surplus tokens is kept in the final message.
        """
        if self.image_token not in contexts:
            # When there is no image token in the context, append the images
            # to the text.
            content = [{"type": "text", "text": contexts}]
            content.extend(self._image_part(img) for img in imgs)
            return [{"role": "user", "content": content}]

        # n image tokens split the context into n + 1 chunks.
        chunks = contexts.split(self.image_token)
        # Number of images that have a text chunk in front of their token.
        paired = min(len(imgs), len(chunks) - 1)

        messages = []
        for idx, img in enumerate(imgs):
            content = []
            if idx < paired:
                content.append({"type": "text", "text": chunks[idx]})
            content.append(self._image_part(img))
            messages.append({"role": "user", "content": content})

        # Everything not consumed above (normally just the last chunk) is
        # sent as trailing text so no prompt text is lost or duplicated.
        trailing_text = "".join(chunks[paired:])
        messages.append({"role": "user", "content": [{"type": "text", "text": trailing_text}]})
        return messages

    def generate_until(self, requests) -> List[str]:
        """Query the API once per request and return the generated texts.

        Each element of ``requests`` must expose ``args`` as the tuple
        ``(contexts, gen_kwargs, doc_to_visual, doc_id, task, split)``.
        A request is retried up to five times, sleeping
        ``NUM_SECONDS_TO_SLEEP`` seconds between attempts; if every attempt
        fails the result for that request is the empty string.

        Returns:
            One string per request, in request order.
        """
        res = []
        max_attempts = 5
        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")

        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
            visuals = self.flatten([doc_to_visual(self.task_dict[task][split][doc_id])])
            imgs = [self.encode_image(visual) for visual in visuals]

            payload = {
                "model": "gpt-4-vision-preview",
                "messages": self._build_messages(contexts, imgs),
            }

            # Fill in default generation settings (in place, on the request's
            # gen_kwargs dict).
            gen_kwargs.setdefault("max_new_tokens", 1024)
            gen_kwargs.setdefault("temperature", 0)
            gen_kwargs.setdefault("top_p", None)
            gen_kwargs.setdefault("num_beams", 1)

            # NOTE(review): the settings above are not forwarded to the API;
            # the payload assignments below are left disabled - confirm
            # before enabling, since it changes what the model generates.
            # payload["max_tokens"] = gen_kwargs["max_new_tokens"]
            # payload["temperature"] = gen_kwargs["temperature"]

            content = ""
            for attempt in range(max_attempts):
                try:
                    response = url_requests.post(API_URL, headers=headers, json=payload, timeout=20)
                    # Surface HTTP errors with their status code rather than
                    # as a KeyError on the missing "choices" field.
                    response.raise_for_status()
                    response_data = response.json()
                    content = response_data["choices"][0]["message"]["content"].strip()
                    break  # If successful, break out of the loop
                except Exception as e:
                    # Deliberately broad: one failing request must not abort
                    # the whole evaluation run.
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < max_attempts - 1:
                        # Retries left: sleep, then try again.
                        time.sleep(NUM_SECONDS_TO_SLEEP)
                    else:
                        # Last attempt: log and fall back to an empty answer.
                        eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
                        content = ""

            res.append(content)
            pbar.update(1)

        pbar.close()
        return res

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        """Not supported by this model; always raises ``NotImplementedError``."""
        # TODO
        # Raised explicitly (not via ``assert``) so it still fires under -O.
        raise NotImplementedError("GPT4V not support")