in core/src/tokenization.rs [366:410]
/// Tokenize `inputs` and validate the resulting sequence length, producing a
/// `ValidEncoding` ready for the model.
///
/// * `truncate` — when `true`, the tokenizer truncates to `max_input_length`
///   (direction given by `truncation_direction`); when `false`, over-long
///   inputs are rejected by the length check below instead.
/// * `position_offset` — starting value for the generated `position_ids`
///   (e.g. non-zero for models whose position embeddings are shifted).
/// * `default_prompt` / `prompt_name` / `prompts` — forwarded to
///   `tokenize_input`, which resolves any prompt to prepend.
///   NOTE(review): exact prompt-resolution semantics live in `tokenize_input`;
///   not visible from here.
///
/// # Errors
/// Returns `TextEmbeddingsError::Validation` when the encoded sequence exceeds
/// `max_input_length` tokens, or propagates any error from `tokenize_input`.
fn encode_input(
    inputs: EncodingInput,
    truncate: bool,
    truncation_direction: TruncationDirection,
    max_input_length: usize,
    position_offset: usize,
    default_prompt: Option<String>,
    prompt_name: Option<String>,
    prompts: Option<&HashMap<String, String>>,
    tokenizer: &mut Tokenizer,
) -> Result<ValidEncoding, TextEmbeddingsError> {
    // Only build truncation params when truncation was requested; `then_some`
    // yields `None` otherwise, so the tokenizer keeps the full sequence.
    let truncate_params = truncate.then_some(TruncationParams {
        direction: truncation_direction,
        max_length: max_input_length,
        strategy: TruncationStrategy::LongestFirst,
        stride: 0,
    });

    let (_, encoding) = tokenize_input(
        inputs,
        true,
        max_input_length,
        truncate_params,
        default_prompt,
        prompt_name,
        prompts,
        tokenizer,
    )?;

    let seq_len = encoding.len();
    if seq_len > max_input_length {
        // The guard admits exactly `max_input_length` tokens, so the message
        // says "at most" (the previous "less than" wording was off by one).
        return Err(TextEmbeddingsError::Validation(format!(
            "`inputs` must have at most {max_input_length} tokens. Given: {seq_len}"
        )));
    }

    // Record the accepted input length for observability.
    let histogram = metrics::histogram!("te_request_input_length");
    histogram.record(seq_len as f64);

    Ok(ValidEncoding {
        input_ids: encoding.get_ids().to_vec(),
        token_type_ids: encoding.get_type_ids().to_vec(),
        // Sequential position ids starting at `position_offset`.
        // NOTE(review): `as u32` would silently truncate if offset + seq_len
        // exceeded u32::MAX — unreachable for realistic input lengths.
        position_ids: (position_offset as u32..(seq_len + position_offset) as u32)
            .collect::<Vec<_>>(),
    })
}