async _call()

in src/models/janus/processing_janus.js [44:122]


    async _call(conversation, {
        images = null,
        chat_template = "default",
    }={}) {
        if (!images) {
            images = await Promise.all(
                conversation
                    .filter((msg) => msg.images)
                    .flatMap((msg) => msg.images)
                    .map((img) => RawImage.read(img))
            );
        } else if (!Array.isArray(images)) {
            images = [images];
        }

        const tokenizer = this.tokenizer;
        const result = tokenizer.apply_chat_template(conversation, {
            tokenize: false,
            add_generation_prompt: true,
            chat_template,
        });

        const encode = (text) => tokenizer.encode(text, { add_special_tokens: false });
        const parts = (/** @type {string} */(result))
            .split(this.image_tag);
        const num_images = parts.length - 1;
        if (images.length !== num_images) {
            throw new Error(`Number of images provided (${images.length}) does not match number of "${this.image_tag}" image tags (${num_images})`);
        }

        const [
            image_placeholder_tag_id,
            image_start_tag_id,
            image_end_tag_id,
        ] = tokenizer.model.convert_tokens_to_ids([
            this.image_tag,
            this.image_start_tag,
            this.image_end_tag,
        ]);

        let input_ids = encode(parts[0]);
        let images_seq_mask = new Array(input_ids.length).fill(false);
        for (let i = 1; i < parts.length; ++i) {
            const placeholder_image_tokens = new Array(this.num_image_tokens).fill(image_placeholder_tag_id);
            const tokens = encode(parts[i]);
            input_ids = mergeArrays(
                input_ids,
                [image_start_tag_id], placeholder_image_tokens, [image_end_tag_id],
                tokens,
            );
            const image_mask = new Array(this.num_image_tokens).fill(true);
            images_seq_mask = mergeArrays(
                images_seq_mask,
                [false], image_mask, [false],
                new Array(tokens.length).fill(false),
            );
        }

        const dims = [1, input_ids.length];
        const final = {
            input_ids: new Tensor('int64', input_ids, dims),
            attention_mask: new Tensor('int64', new Array(input_ids.length).fill(1), dims),
            images_seq_mask: new Tensor('bool', images_seq_mask, dims),
            images_emb_mask: new Tensor(
                'bool',
                new Array(num_images * this.num_image_tokens).fill(true),
                [1, num_images, this.num_image_tokens],
            ),
        }

        if (images && images.length > 0) {
            const image_inputs = await this.image_processor(images);
            // Set the batch_size dimension to 1
            image_inputs.pixel_values.unsqueeze_(0);
            return { ...final, ...image_inputs };
        }

        return final;
    }