_retrieve_init_tokens()

in src/models.js [3349:3400]


    _retrieve_init_tokens(generation_config) {
        // prefix tokens are of the form: 
        //  - Multilingual: <|startoftranscript|> <|lang_id|> <|task|> [<|notimestamps|>]
        //  - English-only: <|startoftranscript|> [<|notimestamps|>]

        // 1. Handle <|startoftranscript|> token
        const init_tokens = [generation_config.decoder_start_token_id];

        // 2. Handle <|lang_id|> and <|task> tokens
        let language = generation_config.language;
        const task = generation_config.task;
        if (generation_config.is_multilingual) {
            if (!language) {
                // TODO: Implement language detection
                console.warn('No language specified - defaulting to English (en).');
                language = 'en';
            }

            // Add language token
            const language_code = whisper_language_to_code(language);
            const language_token = `<|${language_code}|>`;
            init_tokens.push(generation_config.lang_to_id[language_token])

            // Add task token
            // NOTE: Defaults to 'transcribe' if no task is specified
            init_tokens.push(generation_config.task_to_id[task ?? 'transcribe']);

        } else if (language || task) {
            throw new Error(
                "Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config."
            )
        }

        // 3. Handle <|notimestamps|> token
        if (
            !generation_config.return_timestamps
            && generation_config.no_timestamps_token_id
            && init_tokens.at(-1) !== generation_config.no_timestamps_token_id
        ) {
            init_tokens.push(generation_config.no_timestamps_token_id);
        } else if (
            generation_config.return_timestamps
            &&
            init_tokens.at(-1) === generation_config.no_timestamps_token_id
        ) {
            console.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`.");
            init_tokens.pop();
        }

        // let's make sure we don't pass `null` tokens as prompt tokens
        return init_tokens.filter(token => token != null);
    }