in src/models/janus/processing_janus.js [44:122]
async _call(conversation, {
images = null,
chat_template = "default",
}={}) {
if (!images) {
images = await Promise.all(
conversation
.filter((msg) => msg.images)
.flatMap((msg) => msg.images)
.map((img) => RawImage.read(img))
);
} else if (!Array.isArray(images)) {
images = [images];
}
const tokenizer = this.tokenizer;
const result = tokenizer.apply_chat_template(conversation, {
tokenize: false,
add_generation_prompt: true,
chat_template,
});
const encode = (text) => tokenizer.encode(text, { add_special_tokens: false });
const parts = (/** @type {string} */(result))
.split(this.image_tag);
const num_images = parts.length - 1;
if (images.length !== num_images) {
throw new Error(`Number of images provided (${images.length}) does not match number of "${this.image_tag}" image tags (${num_images})`);
}
const [
image_placeholder_tag_id,
image_start_tag_id,
image_end_tag_id,
] = tokenizer.model.convert_tokens_to_ids([
this.image_tag,
this.image_start_tag,
this.image_end_tag,
]);
let input_ids = encode(parts[0]);
let images_seq_mask = new Array(input_ids.length).fill(false);
for (let i = 1; i < parts.length; ++i) {
const placeholder_image_tokens = new Array(this.num_image_tokens).fill(image_placeholder_tag_id);
const tokens = encode(parts[i]);
input_ids = mergeArrays(
input_ids,
[image_start_tag_id], placeholder_image_tokens, [image_end_tag_id],
tokens,
);
const image_mask = new Array(this.num_image_tokens).fill(true);
images_seq_mask = mergeArrays(
images_seq_mask,
[false], image_mask, [false],
new Array(tokens.length).fill(false),
);
}
const dims = [1, input_ids.length];
const final = {
input_ids: new Tensor('int64', input_ids, dims),
attention_mask: new Tensor('int64', new Array(input_ids.length).fill(1), dims),
images_seq_mask: new Tensor('bool', images_seq_mask, dims),
images_emb_mask: new Tensor(
'bool',
new Array(num_images * this.num_image_tokens).fill(true),
[1, num_images, this.num_image_tokens],
),
}
if (images && images.length > 0) {
const image_inputs = await this.image_processor(images);
// Set the batch_size dimension to 1
image_inputs.pixel_values.unsqueeze_(0);
return { ...final, ...image_inputs };
}
return final;
}