eval/benchmark-inference.py:
import torch
from PIL import Image
from torch.utils import benchmark

from data.processors import get_tokenizer, get_image_processor
from models.vision_language_model import VisionLanguageModel

# Seed CPU and CUDA RNGs for reproducible generation
torch.manual_seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
def generate_tokens(tokens, image):
    # One benchmarked unit of work: a single generation call
    model.generate(tokens, image, max_new_tokens=1000)
if __name__ == "__main__":
model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-450M").to(device)
model.eval()
tokenizer = get_tokenizer(model.cfg.lm_tokenizer, model.cfg.vlm_extra_tokens)
image_processor = get_image_processor(model.cfg.vit_img_size)
text = "What is this?"
template = f"{tokenizer.image_token * model.cfg.mp_image_token_length}Question: {text} Answer:"
encoded_batch = tokenizer.batch_encode_plus([template], return_tensors="pt")
tokens = encoded_batch['input_ids'].to(device)
image_path = 'assets/image.png'
image = Image.open(image_path)
image = image_processor(image)
image = image.unsqueeze(0).to(device)
    # Time 10 generation runs with torch.utils.benchmark
    timer = benchmark.Timer(
        stmt="generate_tokens(tokens, image)",
        setup="from __main__ import generate_tokens",
        globals={"tokens": tokens, "image": image},
        num_threads=torch.get_num_threads(),
    )
    print(timer.timeit(10))
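
    # Assumed usage (not stated in the file): run from the repository root, e.g.
    #   python eval/benchmark-inference.py
    # so that the models/ and data/ packages and the relative path
    # assets/image.png resolve against the working directory.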