in inference.py
import torch
from transformers import AutoModelForCausalLM

# Imports assume the deepseek_vl2 package from the DeepSeek-VL2 repo.
from deepseek_vl2.models import DeepseekVLV2ForCausalLM, DeepseekVLV2Processor
from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox
from deepseek_vl2.utils.io import load_pil_images


def main(args):
    dtype = torch.bfloat16

    # Load the processor (tokenizer + image preprocessing) and the model weights.
    model_path = args.model_path
    vl_chat_processor: DeepseekVLV2Processor = DeepseekVLV2Processor.from_pretrained(model_path)
    tokenizer = vl_chat_processor.tokenizer

    vl_gpt: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=dtype
    )
    vl_gpt = vl_gpt.cuda().eval()
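    # Optional sketch (not in the original script): report the model footprint
    # once the weights are on the GPU.
    # n_params = sum(p.numel() for p in vl_gpt.parameters())
    # print(f"params: {n_params / 1e9:.1f}B, "
    #       f"allocated: {torch.cuda.memory_allocated() / 2**30:.1f} GiB")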

    # Multi-image conversation example.
    # Note: the <|grounding|> token is specific to the grounded-captioning /
    # visual-grounding feature; it is not needed for normal conversations.
    conversation = [
        {
            "role": "<|User|>",
            "content": "<image>\n<image>\n<|grounding|>In the first image, an object within the red rectangle is marked. Locate the object of the same category in the second image.",
            "images": [
                "images/incontext_visual_grounding_1.jpeg",
                "images/icl_vg_2.jpeg"
            ],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]
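    # For contrast, a plain (non-grounding) single-image turn would look like
    # this; the image path here is hypothetical:
    #
    # conversation = [
    #     {
    #         "role": "<|User|>",
    #         "content": "<image>\nDescribe this image.",
    #         "images": ["images/example.jpeg"],
    #     },
    #     {"role": "<|Assistant|>", "content": ""},
    # ]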
    # Load the images and pack conversation + images into batched model inputs.
    pil_images = load_pil_images(conversation)
    print(f"len(pil_images) = {len(pil_images)}")

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=pil_images,
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device, dtype=dtype)
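    # Optional sanity check (sketch; assumes these fields are batched torch
    # tensors, which the calls below rely on):
    # print(f"input_ids: {tuple(prepare_inputs.input_ids.shape)}, "
    #       f"images: {tuple(prepare_inputs.images.shape)}")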
    with torch.no_grad():
        if args.chunk_size == -1:
            # Prefill the whole prompt in one pass (needs enough GPU memory).
            inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
            past_key_values = None
        else:
            # Incremental prefilling bounds peak memory, e.g. when running
            # vl2-small on a 40G GPU.
            inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
                input_ids=prepare_inputs.input_ids,
                images=prepare_inputs.images,
                images_seq_mask=prepare_inputs.images_seq_mask,
                images_spatial_crop=prepare_inputs.images_spatial_crop,
                attention_mask=prepare_inputs.attention_mask,
                chunk_size=args.chunk_size
            )
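        # Background note (not from the original script): chunked prefilling
        # pushes the prompt through the model chunk_size tokens at a time, so
        # only one chunk's activations are alive at once while the KV cache
        # (past_key_values) accumulates; generate() then resumes from that cache.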
        # Run the model to generate the response.
        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,

            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,

            # Sampling settings; set do_sample=False for greedy decoding.
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.1,

            use_cache=True,
        )
        # Decode only the newly generated tokens. Special tokens are kept
        # because the grounding output encodes box coordinates in them, which
        # parse_ref_bbox reads back out.
        answer = tokenizer.decode(
            outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(),
            skip_special_tokens=False
        )
        print(prepare_inputs['sft_format'][0], answer)

        # Draw the predicted boxes onto the second image and save the result.
        vg_image = parse_ref_bbox(answer, image=pil_images[-1])
        if vg_image is not None:
            vg_image.save("./vg.jpg", format="JPEG", quality=85)
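
# Minimal CLI entry point so the script runs standalone. The flag names mirror
# the attributes used above (args.model_path, args.chunk_size); the default
# model id is an assumption.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str,
                        default="deepseek-ai/deepseek-vl2-small",
                        help="HF hub id or local path of a DeepSeek-VL2 checkpoint")
    parser.add_argument("--chunk_size", type=int, default=-1,
                        help="prefill chunk size in tokens; -1 disables incremental prefilling")
    main(parser.parse_args())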