in src/models/phi3_v/image_processing_phi3_v.js [69:162]
async _call(images, {
    num_crops = null,
} = {}) {
    // @ts-expect-error
    this._num_crops = num_crops ??= this.config.num_crops;
    if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
        throw new Error("num_crops must be a square number >= 4");
    }
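
    // Accept a single image or an array (batch) of images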
    if (!Array.isArray(images)) {
        images = [images];
    }
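
    // Preprocess every image, keeping track of its original and reshaped (post-resize) size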
    const num_images = images.length;
    const imageData = await Promise.all(images.map(x => this.preprocess(x)));
    const original_sizes = imageData.map(x => x.original_size);
    const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);

    // Process each image in the batch
    const all_pixel_values = [];
    for (const { pixel_values } of imageData) {
        pixel_values.unsqueeze_(0); // Easier processing as a 4D tensor
        const [height, width] = pixel_values.dims.slice(-2);

        // Global image, resized to a tensor of shape [1, num_channels, IMAGE_SIZE, IMAGE_SIZE]
        const batch_pixel_values = await interpolate_4d(pixel_values, {
            size: [IMAGE_SIZE, IMAGE_SIZE],
            mode: 'bicubic',
        });
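
        // Split the full-resolution image into a sqrt(num_crops) x sqrt(num_crops) grid of
        // local crops; the last row/column is anchored to the bottom/right edge so the
        // image boundary is always covered, even if the dimensions are not evenly divisible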
        if (num_crops > 0) {
            const patches = [];
            const sqrt_patches = sqrt(num_crops);
            const patch_width = floor(width / sqrt_patches);
            const patch_height = floor(height / sqrt_patches);
            for (let y = 0; y < sqrt_patches; ++y) {
                for (let x = 0; x < sqrt_patches; ++x) {
                    let start_x, start_y, end_x, end_y;
                    if (y === sqrt_patches - 1) { // At bottom
                        start_y = height - patch_height;
                        end_y = height;
                    } else {
                        start_y = y * patch_height;
                        end_y = (y + 1) * patch_height;
                    }
                    if (x === sqrt_patches - 1) { // At right
                        start_x = width - patch_width;
                        end_x = width;
                    } else {
                        start_x = x * patch_width;
                        end_x = (x + 1) * patch_width;
                    }
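
                    // Extract the crop from the preprocessed image along the spatial axes given by SLICE_AXES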
                    const starts = [start_y, start_x];
                    const ends = [end_y, end_x];
                    const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
                    patches.push(patch);
                }
            }
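
            // Resize all local crops to IMAGE_SIZE x IMAGE_SIZE in a single batched call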
            const resized_tensors = await interpolate_4d(cat(patches, 0), {
                size: [IMAGE_SIZE, IMAGE_SIZE],
                mode: 'bicubic',
            }); // [num_crops, 3, 336, 336]

            // Concatenate the global image with the patches
            all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
        } else {
            // Only use the global image
            // NOTE: Not currently supported in modelling code
            all_pixel_values.push(batch_pixel_values);
        }
    }

    // [num_images, 1 + num_crops, num_channels=3, height, width]
    const pixel_values = stack(all_pixel_values, 0);

    // Calculate padded image sizes: each reshaped dimension is rounded up to
    // the nearest multiple of IMAGE_SIZE
    const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));

    const image_sizes = new Tensor(
        'int64',
        sizes.flat(),
        [num_images, 2],
    );
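
    // Number of image tokens for each padded image size.
    // Note that width is passed first, then height.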
    const num_img_tokens = sizes.map(
        ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
    );

    return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
}