in src/tokenizers.js [3472:3512]
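/**
 * Helper function to build translation inputs for a translation tokenizer
 * (e.g., an NLLB- or M2M100-style tokenizer).
 * @param {Object} self The tokenizer object.
 * @param {string|string[]} raw_inputs The text to tokenize.
 * @param {Object} tokenizer_options Options forwarded to the tokenizer call.
 * @param {Object} generate_kwargs Generation options; must contain `tgt_lang`
 *  and may contain `src_lang`. Mutated in place to set `forced_bos_token_id`.
 * @returns {Object} The tokenized model inputs.
 */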
function _build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs) {
    if (!('language_codes' in self) || !Array.isArray(self.language_codes)) {
        throw new Error('Tokenizer must have `language_codes` attribute set and it should be an array of language ids.');
    }
    if (!('languageRegex' in self) || !(self.languageRegex instanceof RegExp)) {
        throw new Error('Tokenizer must have `languageRegex` attribute set and it should be a regular expression.');
    }
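    // (For an NLLB-style tokenizer, `languageRegex` would typically be
    // something like /^[a-z]{3}_[A-Z][a-z]{3}$/, matching codes such as
    // "eng_Latn"; the exact pattern is defined by the concrete tokenizer
    // class.)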
    if (!('lang_to_token' in self) || typeof self.lang_to_token !== 'function') {
        throw new Error('Tokenizer must have `lang_to_token` attribute set and it should be a function.');
    }
    const src_lang_token = generate_kwargs.src_lang;
    const tgt_lang_token = generate_kwargs.tgt_lang;

    // Check that the target language is valid:
    if (!self.language_codes.includes(tgt_lang_token)) {
        throw new Error(`Target language code "${tgt_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`);
    }
    // Allow `src_lang` to be optional. If not set, we'll use the tokenizer's default.
    if (src_lang_token !== undefined) {
        // Check that the source language is valid:
        if (!self.language_codes.includes(src_lang_token)) {
            throw new Error(`Source language code "${src_lang_token}" is not valid. Must be one of: {${self.language_codes.join(', ')}}`);
        }

        // In the same way as the Python library, we override the post-processor
        // to force the source language to be first:
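        // (The post-processor's `single` template is a list of items; for an
        // NLLB-style tokenizer it might look like the following, where the
        // values are illustrative:
        //   [
        //     { "SpecialToken": { "id": "eng_Latn", "type_id": 0 } },
        //     { "Sequence": { "id": "A", "type_id": 0 } },
        //     { "SpecialToken": { "id": "</s>", "type_id": 0 } },
        //   ]
        // The loop below replaces the first language-like SpecialToken id
        // with the requested source language token.)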
        for (const item of self.post_processor.config.single) {
            if ('SpecialToken' in item && self.languageRegex.test(item.SpecialToken.id)) {
                item.SpecialToken.id = self.lang_to_token(src_lang_token);
                break;
            }
        }
        // TODO: Do the same for pair?
    }
    // Override the `forced_bos_token_id` to force the correct language
    generate_kwargs.forced_bos_token_id = self.model.convert_tokens_to_ids(
        [self.lang_to_token(tgt_lang_token)]
    )[0];

    return self._call(raw_inputs, tokenizer_options);
}
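A minimal usage sketch, assuming an NLLB-style tokenizer; the `tokenizer`
variable, language codes, input text, and tokenizer options below are
illustrative, not taken from this file:

const generate_kwargs = { src_lang: 'eng_Latn', tgt_lang: 'fra_Latn' };

// Validates both language codes, rewrites the post-processor so the source
// language token comes first, and sets `forced_bos_token_id` on
// `generate_kwargs` to the id of the target language token.
const model_inputs = _build_translation_inputs(
    tokenizer,
    'Hello, world!',
    { padding: true, truncation: true },
    generate_kwargs,
);

// `model_inputs` and the mutated `generate_kwargs` can then be forwarded to
// the model's generate call.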