in tokenizers/src/tokenizer/serialization.rs [108:171]
fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error>
where
V: MapAccess<'de>,
{
let mut builder = TokenizerBuilder::new();
let mut tokens: Vec<AddedTokenWithId> = vec![];
while let Some(key) = map.next_key::<String>()? {
match key.as_ref() {
"version" => {
let v: String = map.next_value()?;
if &v != "1.0" {
return Err(Error::custom(format!("Unknown tokenizer version '{v}'")));
}
}
"truncation" => {
builder = builder.with_truncation(map.next_value()?);
}
"padding" => {
builder = builder.with_padding(map.next_value()?);
}
"added_tokens" => {
tokens = map.next_value()?;
}
"normalizer" => {
builder = builder.with_normalizer(map.next_value()?);
}
"pre_tokenizer" => {
builder = builder.with_pre_tokenizer(map.next_value()?);
}
"model" => {
builder = builder.with_model(map.next_value()?);
}
"decoder" => {
builder = builder.with_decoder(map.next_value()?);
}
"post_processor" => {
builder = builder.with_post_processor(map.next_value()?);
}
_ => {}
};
}
let mut tokenizer = builder
.build()
.map_err(|e| V::Error::custom(e.to_string()))?;
// We take care of deserializing the added_tokens (instead of `AddedVocabulary` directly
// because it let us check that associated IDs are still good, and warn the user otherwise
for token in &tokens {
// Warn the user if the id is different than expected
let received_id = tokenizer.token_to_id(&token.token.content);
if let Some(rid) = received_id {
if rid != token.id {
warn!(
"Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
token.token.content, token.id, rid
);
}
}
}
let added_tokens: Vec<_> = tokens.into_iter().map(|token| token.token).collect();
tokenizer.add_tokens(&added_tokens[..]);
Ok(tokenizer)
}