in router/src/validation.rs [609:775]
/// Builds the placeholder text that stands in for one image inside the prompt.
///
/// The layout of the returned string is selected by the model `config`; `height`
/// and `width` are the image dimensions and are only consulted by the model
/// families whose number of placeholder tokens depends on the image size.
/// `preprocessor_config` is consulted by `Idefics2` (image splitting) and
/// `Llama4` (maximum number of patches).
///
/// # Panics
///
/// * For `Llama4` when `preprocessor_config` is not a `Llama4Processor`.
/// * For any model configuration that has no arm below (`unimplemented!`).
fn image_tokens(
    config: &Config,
    preprocessor_config: Option<&HubPreprocessorConfig>,
    height: usize,
    width: usize,
) -> String {
    use Config::*;
    use HubPreprocessorConfig::*;
    match config {
        Idefics => "<image>".to_string(),
        Mllama => "<|image|>".to_string(),
        Idefics2(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";
            let slots = config.get_number_of_features(height, width);
            // One image is FAKE + slots * IMAGE + FAKE; size the buffer up front.
            let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
            image_string.push_str(FAKE);
            image_string.extend(iter::repeat_n(IMAGE, slots));
            image_string.push_str(FAKE);
            // With image splitting enabled the single-image block is emitted 5 times.
            if matches!(
                preprocessor_config,
                Some(Idefics2Processor(Idefics2Preprocessor {
                    do_image_splitting: true,
                    ..
                }))
            ) {
                image_string = image_string.repeat(5);
            }
            image_string
        }
        Idefics3(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";
            const GLOBAL_IMG: &str = "<global-img>";
            let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize();
            // Resize the image if it is larger than max_longest_edge_for_image_resize,
            // keeping the aspect ratio: the longer edge is clamped to the maximum and
            // the other edge is scaled accordingly.
            let (height, width) = if height > max_longest_edge_for_image_resize
                || width > max_longest_edge_for_image_resize
            {
                let aspect_ratio = height as f32 / width as f32;
                if height > width {
                    (
                        max_longest_edge_for_image_resize,
                        (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize,
                    )
                } else {
                    (
                        (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize,
                        max_longest_edge_for_image_resize,
                    )
                }
            } else {
                (height, width)
            };
            let image_seq_len = config.get_number_of_features();
            let max_edge = config.get_max_longest_edge();
            // (0, 0) marks "no splitting": the image fits within max_edge on both axes.
            let (image_rows, image_cols) = if height > max_edge || width > max_edge {
                (
                    (height as f32 / max_edge as f32).ceil() as usize,
                    (width as f32 / max_edge as f32).ceil() as usize,
                )
            } else {
                (0, 0)
            };
            let mut image_string = String::new();
            if image_rows == 0 && image_cols == 0 {
                // Single image case
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.extend(iter::repeat_n(IMAGE, image_seq_len));
                image_string.push_str(FAKE);
            } else {
                // Split image case: one tagged block per tile, a newline after each
                // row of tiles, then a blank line and the global image block.
                for n_h in 0..image_rows {
                    for n_w in 0..image_cols {
                        image_string.push_str(FAKE);
                        // Tile tags are 1-based.
                        image_string.push_str(&format!("<row_{}_col_{}>", n_h + 1, n_w + 1));
                        image_string.extend(iter::repeat_n(IMAGE, image_seq_len));
                    }
                    image_string.push('\n');
                }
                image_string.push('\n');
                image_string.push_str(FAKE);
                image_string.push_str(GLOBAL_IMG);
                image_string.extend(iter::repeat_n(IMAGE, image_seq_len));
                image_string.push_str(FAKE);
            }
            image_string
        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Llama4(config) => {
            const IMAGE_START: &str = "<|image_start|>";
            const IMAGE: &str = "<|image|>";
            const IMAGE_END: &str = "<|image_end|>";
            const PATCH: &str = "<|patch|>";
            const TILE_X_SEP: &str = "<|tile_x_separator|>";
            const TILE_Y_SEP: &str = "<|tile_y_separator|>";
            let image_height = config.image_size();
            let patch_size = config.patch_size();
            let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
            let max_patches = match preprocessor_config {
                Some(HubPreprocessorConfig::Llama4Processor(cfg)) => cfg.max_patches,
                _ => panic!("Expected Llama4Processor in preprocessor_config"),
            };
            let downsample_ratio =
                (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;
            let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width, max_patches);
            let image_width = image_height; // Assuming pixel shape: [H][W][C]
            let num_patches_per_chunk =
                (image_height / patch_size) * (image_width / patch_size) / downsample_ratio;
            // Every tile (and the trailing image block) carries the same run of patch
            // tokens, so build it once instead of once per tile.
            let patches = PATCH.repeat(num_patches_per_chunk);
            let mut img_string = String::new();
            img_string.push_str(IMAGE_START);
            // The tile grid is only emitted when the image is split into more than one tile.
            if ratio_h * ratio_w > 1 {
                for _yy in 0..ratio_h {
                    for xx in 0..ratio_w {
                        img_string.push_str(&patches);
                        // Separator between tiles of a row, but not after the last one.
                        if xx + 1 < ratio_w {
                            img_string.push_str(TILE_X_SEP);
                        }
                    }
                    img_string.push_str(TILE_Y_SEP);
                }
            }
            img_string.push_str(IMAGE);
            img_string.push_str(&patches);
            img_string.push_str(IMAGE_END);
            img_string
        }
        // `{}` (Display) is used on purpose for the two Qwen arms: `{:?}` (Debug)
        // would wrap the pad tokens in literal double quotes inside the prompt.
        Qwen2Vl(config) => format!(
            "<|vision_start|>{}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Qwen2_5Vl(config) => format!(
            "<|vision_start|>{}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Gemma3(_config) => {
            // TODO: prefer using the config to determine the number of features
            let num_mm_soft_tokens_per_image = 256;
            format!(
                "\n\n<start_of_image>{}<end_of_image>\n\n",
                "<image_soft_token>".repeat(num_mm_soft_tokens_per_image)
            )
        }
        _ => unimplemented!("Images tokens are not supported for this model configuration"),
    }
}