in src/base/image_processors_utils.js [936:1054]
async preprocess(image, {
do_normalize = null,
do_pad = null,
do_convert_rgb = null,
do_convert_grayscale = null,
do_flip_channel_order = null,
} = {}) {
if (this.do_crop_margin) {
// NOTE: Specific to nougat processors. This is done before resizing,
// and can be interpreted as a pre-preprocessing step.
image = await this.crop_margin(image);
}
const [srcWidth, srcHeight] = image.size; // original image size
// Convert image to RGB if specified in config.
if (do_convert_rgb ?? this.do_convert_rgb) {
image = image.rgb();
} else if (do_convert_grayscale) {
image = image.grayscale();
}
// TODO:
// For efficiency reasons, it might be best to merge the resize and center crop operations into one.
// Resize all images
if (this.do_resize) {
image = await this.resize(image);
}
// Resize the image using thumbnail method.
if (this.do_thumbnail) {
// @ts-expect-error TS2345
image = await this.thumbnail(image, this.size, this.resample);
}
if (this.do_center_crop) {
let crop_width;
let crop_height;
if (Number.isInteger(this.crop_size)) {
crop_width = this.crop_size;
crop_height = this.crop_size;
} else {
crop_width = this.crop_size.width;
crop_height = this.crop_size.height;
}
image = await image.center_crop(crop_width, crop_height);
}
/** @type {HeightWidth} */
const reshaped_input_size = [image.height, image.width];
// NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
// occurs with data in the hwc format (height, width, channels),
// to emulate the behavior of the original Python code (w/ numpy).
/** @type {Float32Array} */
let pixelData = Float32Array.from(image.data);
let imgDims = [image.height, image.width, image.channels];
if (this.do_rescale) {
this.rescale(pixelData);
}
if (do_normalize ?? this.do_normalize) {
let image_mean = this.image_mean;
if (!Array.isArray(this.image_mean)) {
image_mean = new Array(image.channels).fill(image_mean);
}
let image_std = this.image_std;
if (!Array.isArray(this.image_std)) {
image_std = new Array(image.channels).fill(image_mean);
}
if (image_mean.length !== image.channels || image_std.length !== image.channels) {
throw new Error(`When set to arrays, the length of \`image_mean\` (${image_mean.length}) and \`image_std\` (${image_std.length}) must match the number of channels in the image (${image.channels}).`);
}
for (let i = 0; i < pixelData.length; i += image.channels) {
for (let j = 0; j < image.channels; ++j) {
pixelData[i + j] = (pixelData[i + j] - image_mean[j]) / image_std[j];
}
}
}
// do padding after rescaling/normalizing
if (do_pad ?? this.do_pad) {
if (this.pad_size) {
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
[pixelData, imgDims] = padded; // Update pixel data and image dimensions
} else if (this.size_divisibility) {
const [paddedWidth, paddedHeight] = enforce_size_divisibility([imgDims[1], imgDims[0]], this.size_divisibility);
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
}
}
if (do_flip_channel_order ?? this.do_flip_channel_order) {
if (imgDims[2] !== 3) {
throw new Error('Flipping channel order is only supported for RGB images.');
}
// Convert RGB to BGR
for (let i = 0; i < pixelData.length; i += 3) {
const temp = pixelData[i];
pixelData[i] = pixelData[i + 2];
pixelData[i + 2] = temp;
}
}
const pixel_values = new Tensor('float32', pixelData, imgDims)
.permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)
return {
original_size: [srcHeight, srcWidth],
reshaped_input_size: reshaped_input_size,
pixel_values,
}
}