fn tokenize_prompt()

in src/requests.rs [675:716]


fn tokenize_prompt(
    prompt: String,
    tokenizer: Arc<Tokenizer>,
    options: &TokenizeOptions,
) -> anyhow::Result<(String, u64)> {
    let prompt_tokens = tokenizer
        .encode(prompt.clone(), false)
        .map_err(|_| anyhow::anyhow!("Error tokenizing prompt"))?;
    match options.num_tokens {
        None => {
            // check if we have a min/max number of tokens, skip prompts that are too short or too long
            if prompt_tokens.len() > options.max_tokens as usize
                || prompt_tokens.len() < options.min_tokens as usize
            {
                return Err(anyhow::anyhow!(format!(
                    "Prompt is too short or too long, skipping: {}<{}<{}",
                    options.min_tokens,
                    prompt_tokens.len(),
                    options.max_tokens
                )));
            }
            Ok((prompt, prompt_tokens.len() as u64))
        }
        Some(num_tokens) => {
            if prompt_tokens.len() < num_tokens as usize {
                return Err(anyhow::anyhow!(format!(
                    "Prompt is too short to tokenize: {}<{}",
                    prompt_tokens.len(),
                    num_tokens
                )));
            }
            let tokens = prompt_tokens
                .get_ids()
                .iter()
                .take(num_tokens as usize)
                .copied()
                .collect::<Vec<u32>>();
            let prompt = tokenizer.decode(&tokens, true).unwrap();
            Ok((prompt, num_tokens))
        }
    }
}