in tokenizers/src/models/bpe/serialization.rs [78:155]
fn visit_map<V>(self, mut map: V) -> std::result::Result<Self::Value, V::Error>
where
V: MapAccess<'de>,
{
let mut builder = BpeBuilder::new();
let mut vocab: Option<AHashMap<String, u32>> = None;
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum MergeType {
Tuple(Vec<(String, String)>),
Legacy(Vec<String>),
}
let mut merges: Option<MergeType> = None;
while let Some(key) = map.next_key::<String>()? {
match key.as_ref() {
"dropout" => {
if let Some(dropout) = map.next_value()? {
builder = builder.dropout(dropout);
}
}
"unk_token" => {
if let Some(unk) = map.next_value()? {
builder = builder.unk_token(unk);
}
}
"continuing_subword_prefix" => {
if let Some(prefix) = map.next_value()? {
builder = builder.continuing_subword_prefix(prefix);
}
}
"end_of_word_suffix" => {
if let Some(suffix) = map.next_value()? {
builder = builder.end_of_word_suffix(suffix);
}
}
"fuse_unk" => {
if let Some(suffix) = map.next_value()? {
builder = builder.fuse_unk(suffix);
}
}
"byte_fallback" => {
if let Some(suffix) = map.next_value()? {
builder = builder.byte_fallback(suffix);
}
}
"ignore_merges" => {
if let Some(suffix) = map.next_value()? {
builder = builder.ignore_merges(suffix);
}
}
"vocab" => vocab = Some(map.next_value()?),
"merges" => merges = Some(map.next_value()?),
"type" => match map.next_value()? {
"BPE" => {}
u => {
return Err(serde::de::Error::invalid_value(
serde::de::Unexpected::Str(u),
&"BPE",
))
}
},
_ => {}
}
}
if let (Some(vocab), Some(merges)) = (vocab, merges) {
let merges = match merges {
MergeType::Tuple(merges) => merges,
MergeType::Legacy(merges) => {
convert_merges_to_hashmap(merges.into_iter(), &vocab).map_err(Error::custom)?
}
};
builder = builder.vocab_and_merges(vocab, merges);
Ok(builder.build().map_err(Error::custom)?)
} else {
Err(Error::custom("Missing vocab/merges"))
}
}