fn image_tokens()

in router/src/validation.rs [609:775]


/// Builds the placeholder text that stands in for a single image inside a prompt.
///
/// The shape of the placeholder is chosen by the variant of `config`; every supported
/// model family has its own special tokens and its own rule for how many of them to emit.
///
/// # Arguments
///
/// * `config` - Model configuration; its variant selects the placeholder layout.
/// * `preprocessor_config` - Optional preprocessor configuration. Only the `Idefics2` arm
///   (reads `do_image_splitting`) and the `Llama4` arm (reads `max_patches`) look at it.
/// * `height`, `width` - Image dimensions forwarded to the per-model feature helpers
///   (presumably in pixels - confirm against the caller).
///
/// # Returns
///
/// The placeholder string for one image.
///
/// # Panics
///
/// * For `Llama4` when `preprocessor_config` is not a `Llama4Processor`.
/// * For any `Config` variant that is not handled below (`unimplemented!`).
fn image_tokens(
    config: &Config,
    preprocessor_config: Option<&HubPreprocessorConfig>,
    height: usize,
    width: usize,
) -> String {
    use Config::*;
    use HubPreprocessorConfig::*;
    match config {
        Idefics => "<image>".to_string(),
        Mllama => "<|image|>".to_string(),
        Idefics2(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";

            let slots = config.get_number_of_features(height, width);

            // One frame: FAKE, `slots` image tokens, FAKE.
            let mut image_string = String::with_capacity(2 * FAKE.len() + slots * IMAGE.len());
            image_string.push_str(FAKE);
            image_string.extend(iter::repeat_n(IMAGE, slots));
            image_string.push_str(FAKE);

            let splits_image = matches!(
                preprocessor_config,
                Some(Idefics2Processor(Idefics2Preprocessor {
                    do_image_splitting: true,
                    ..
                }))
            );

            // With image splitting enabled the frame is emitted five times.
            if splits_image {
                image_string.repeat(5)
            } else {
                image_string
            }
        }
        Idefics3(config) => {
            const FAKE: &str = "<fake_token_around_image>";
            const IMAGE: &str = "<image>";
            const GLOBAL_IMG: &str = "<global-img>";

            let max_longest_edge_for_image_resize = config.get_max_longest_edge_for_image_resize();

            // Shrink the image, keeping its aspect ratio, when either side exceeds the
            // resize limit: the longer side is clamped to the limit and the other side
            // is scaled by the same factor (truncated towards zero).
            let (height, width) = if height > max_longest_edge_for_image_resize
                || width > max_longest_edge_for_image_resize
            {
                let aspect_ratio = height as f32 / width as f32;
                if height > width {
                    (
                        max_longest_edge_for_image_resize,
                        (max_longest_edge_for_image_resize as f32 / aspect_ratio) as usize,
                    )
                } else {
                    (
                        (max_longest_edge_for_image_resize as f32 * aspect_ratio) as usize,
                        max_longest_edge_for_image_resize,
                    )
                }
            } else {
                (height, width)
            };

            let image_seq_len = config.get_number_of_features();
            let max_edge = config.get_max_longest_edge();

            // `Some((rows, cols))` when the (resized) image is larger than `max_edge` on
            // either side and therefore gets a tile grid; `None` for a single image.
            let tile_grid = (height > max_edge || width > max_edge).then(|| {
                (
                    (height as f32 / max_edge as f32).ceil() as usize,
                    (width as f32 / max_edge as f32).ceil() as usize,
                )
            });

            // The run of image tokens is identical for every tile and for the global
            // image, so build it once.
            let image_run = IMAGE.repeat(image_seq_len);

            let mut image_string = String::new();

            if let Some((image_rows, image_cols)) = tile_grid {
                // Split image case: one tagged entry per tile, one line per tile row,
                // followed by a blank line before the global image.
                for row in 1..=image_rows {
                    for col in 1..=image_cols {
                        image_string.push_str(FAKE);
                        image_string.push_str(&format!("<row_{}_col_{}>", row, col));
                        image_string.push_str(&image_run);
                    }
                    image_string.push('\n');
                }
                image_string.push('\n');
            }

            // Both cases end with the global image framed by FAKE tokens.
            image_string.push_str(FAKE);
            image_string.push_str(GLOBAL_IMG);
            image_string.push_str(&image_run);
            image_string.push_str(FAKE);

            image_string
        }
        Paligemma(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        LlavaNext(config) => "<image>".repeat(config.get_number_of_features(height, width)),
        Llama4(config) => {
            const IMAGE_START: &str = "<|image_start|>";
            const IMAGE: &str = "<|image|>";
            const IMAGE_END: &str = "<|image_end|>";
            const PATCH: &str = "<|patch|>";
            const TILE_X_SEP: &str = "<|tile_x_separator|>";
            const TILE_Y_SEP: &str = "<|tile_y_separator|>";

            let image_height = config.image_size();
            let patch_size = config.patch_size();
            let pixel_shuffle_ratio = config.pixel_shuffle_ratio();
            let max_patches = match preprocessor_config {
                Some(HubPreprocessorConfig::Llama4Processor(cfg)) => cfg.max_patches,
                _ => panic!("Expected Llama4Processor in preprocessor_config"),
            };
            let downsample_ratio =
                (1.0 / (pixel_shuffle_ratio * pixel_shuffle_ratio)).round() as usize;

            let (ratio_h, ratio_w) = config.get_aspect_ratios(height, width, max_patches);
            let image_width = image_height; // Assuming pixel shape: [H][W][C]

            let num_patches_per_chunk =
                (image_height / patch_size) * (image_width / patch_size) / downsample_ratio;

            // Every tile and the trailing chunk use the same run of patch tokens, so
            // build it once.
            let patch_run = PATCH.repeat(num_patches_per_chunk);

            let mut img_string = String::new();
            img_string.push_str(IMAGE_START);

            // The tile grid is only emitted when there is more than one tile.
            if ratio_h * ratio_w > 1 {
                for _yy in 0..ratio_h {
                    for xx in 0..ratio_w {
                        img_string.push_str(&patch_run);
                        // Separator between tiles of a row, not after the last one.
                        if xx < ratio_w - 1 {
                            img_string.push_str(TILE_X_SEP);
                        }
                    }
                    img_string.push_str(TILE_Y_SEP);
                }
            }

            img_string.push_str(IMAGE);
            img_string.push_str(&patch_run);
            img_string.push_str(IMAGE_END);

            img_string
        }
        // `{}` (Display) rather than `{:?}` (Debug): Debug-formatting a `String` wraps it
        // in literal double quotes, which would end up inside the placeholder.
        Qwen2Vl(config) => format!(
            "<|vision_start|>{}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Qwen2_5Vl(config) => format!(
            "<|vision_start|>{}<|vision_end|>",
            "<|image_pad|>".repeat(config.get_number_of_features(height, width))
        ),
        Gemma3(_config) => {
            // TODO: prefer using the config to determine the number of features
            const NUM_MM_SOFT_TOKENS_PER_IMAGE: usize = 256;
            format!(
                "\n\n<start_of_image>{}<end_of_image>\n\n",
                "<image_soft_token>".repeat(NUM_MM_SOFT_TOKENS_PER_IMAGE)
            )
        }
        _ => unimplemented!("Images tokens are not supported for this model configuration"),
    }
}