in core/src/tokenization.rs [286:362]
/// Tokenizes `inputs` with `tokenizer`, optionally prepending a prompt.
///
/// The prompt (if any) is resolved by `prepare_pre_prompt` from `default_prompt`,
/// `prompt_name` and `prompts`. Before tokenizing, the input is checked against a
/// character budget of `max_input_length * MAX_CHAR_MULTIPLIER`: oversized input is
/// rejected when `truncate_params` is `None`, and otherwise cut down with
/// `EncodingInput::apply_limit` before being handed to the tokenizer.
///
/// # Returns
///
/// A pair of:
/// * the text that was actually encoded (`Some` for `Single` and `Ids` inputs, with
///   the prompt prepended when one applies; `None` for `Dual` inputs), and
/// * the resulting encoding.
///
/// # Errors
///
/// * `TextEmbeddingsError::Validation` if the input exceeds the character budget and
///   no truncation parameters were supplied.
/// * `TextEmbeddingsError::Validation` if a prompt was resolved for a `Dual` input.
/// * Any error propagated from `prepare_pre_prompt` or from the tokenizer
///   (`with_truncation`, `encode`, `decode`).
fn tokenize_input(
    mut inputs: EncodingInput,
    add_special_tokens: bool,
    max_input_length: usize,
    truncate_params: Option<TruncationParams>,
    default_prompt: Option<String>,
    prompt_name: Option<String>,
    prompts: Option<&HashMap<String, String>>,
    tokenizer: &mut Tokenizer,
) -> Result<(Option<String>, RawEncoding), TextEmbeddingsError> {
    let pre_prompt = prepare_pre_prompt(default_prompt, prompt_name, prompts)?;

    // Character-count guard applied before tokenization. Saturate instead of
    // overflowing: a wrapped product would yield a tiny limit and wrongly reject
    // (or truncate) valid input when `max_input_length` is very large.
    let input_chars = inputs.count_chars();
    let limit = max_input_length.saturating_mul(MAX_CHAR_MULTIPLIER);
    if input_chars > limit {
        if truncate_params.is_none() {
            return Err(TextEmbeddingsError::Validation(format!(
                "`inputs` must have less than {limit} characters. Given: {input_chars}"
            )));
        }
        // Truncation was requested: trim the raw input to the budget and let the
        // tokenizer's own truncation handle the rest.
        inputs.apply_limit(limit);
    }

    let encoding = match inputs {
        // Plain text: prepend the prompt (if any) and encode.
        EncodingInput::Single(s) => {
            let text = match pre_prompt {
                Some(mut prompt) => {
                    prompt.push_str(&s);
                    prompt
                }
                None => s,
            };

            let encoding = tokenizer
                .with_truncation(truncate_params)?
                .encode::<&str>(text.as_str(), add_special_tokens)?;

            (Some(text), encoding)
        }
        // Sentence pair: prompts are not supported, and no text is returned.
        EncodingInput::Dual(s1, s2) => {
            if pre_prompt.is_some() {
                return Err(TextEmbeddingsError::Validation(
                    "`prompt_name` cannot be set with dual inputs".to_string(),
                ));
            }

            let encoding = tokenizer
                .with_truncation(truncate_params)?
                .encode::<(String, String)>((s1, s2), add_special_tokens)?;

            (None, encoding)
        }
        // Input is already token ids: decode back to text, then re-encode to obtain
        // a tokenizers `Encoding`.
        EncodingInput::Ids(ids) => {
            // With a prompt, the same flag is passed as `true` to both `decode` and
            // `encode`; without one, it is passed as `false` to both. In the
            // `tokenizers` API these arguments are presumably `skip_special_tokens`
            // and `add_special_tokens` respectively (confirm against the crate docs).
            // Note that the caller's `add_special_tokens` is not consulted here.
            let has_prompt = pre_prompt.is_some();
            let decoded = tokenizer.decode(&ids, has_prompt)?;

            let text = match pre_prompt {
                Some(mut prompt) => {
                    prompt.push_str(&decoded);
                    prompt
                }
                None => decoded,
            };

            let encoding = tokenizer
                .with_truncation(truncate_params)?
                .encode::<&str>(text.as_str(), has_prompt)?;

            (Some(text), encoding)
        }
    };

    Ok(encoding)
}