async _call()

in src/models/paligemma/processing_paligemma.js [27:82]


    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
        if (!text) {
            console.warn(
                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
            )
            text = ""
        }

        if (!Array.isArray(images)) {
            images = [images]
        }

        if (!Array.isArray(text)) {
            text = [text]
        }

        const bos_token = this.tokenizer.bos_token;
        // @ts-expect-error TS2339
        const image_seq_length = this.image_processor.config.image_seq_length;
        let input_strings;
        if (text.some((t) => t.includes(IMAGE_TOKEN))) {
            input_strings = text.map(
                sample => {
                    const expanded_sample = sample.replaceAll(IMAGE_TOKEN, IMAGE_TOKEN.repeat(image_seq_length));
                    const bos_rfind_index = expanded_sample.lastIndexOf(IMAGE_TOKEN);
                    const bos_index = bos_rfind_index === -1 ? 0 : bos_rfind_index + IMAGE_TOKEN.length;
                    return expanded_sample.slice(0, bos_index) + bos_token + expanded_sample.slice(bos_index) + "\n";
                }
            )
        } else {
            console.warn(
                "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special " +
                "image tokens in the text, as many tokens as there are images per each text. It is recommended to " +
                "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images " +
                "each text has and add special tokens."
            )

            input_strings = text.map(
                sample => build_string_from_input(
                    sample,
                    bos_token,
                    image_seq_length,
                    IMAGE_TOKEN,
                    images.length,
                )
            )
        }

        const text_inputs = this.tokenizer(input_strings, kwargs);
        const image_inputs = await this.image_processor(images, kwargs);

        return {
            ...image_inputs,
            ...text_inputs,
        }
    }