in tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs [41:76]
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
    pretokenized.split(|_, normalized| {
        let mut last_script = None;
        let mut offset = 0;
        // Collect the byte offsets at which the script changes. `fixed_script`
        // returns `Script::Any` for characters (such as whitespace and Common
        // punctuation) that should never trigger a split on their own.
        let mut ranges: Vec<_> = normalized
            .get()
            .chars()
            .filter_map(|c| {
                let script = Some(fixed_script(c));
                // Emit a split point when the current character has a concrete
                // script that differs from `last_script`. This also fires for
                // the first concrete character, while `last_script` is still
                // `None`, so the list always starts with a boundary.
                let result = if script != Some(Script::Any)
                    && last_script != Some(Script::Any)
                    && last_script != script
                {
                    Some(offset)
                } else {
                    None
                };
                // Offsets are in bytes, so advance by the UTF-8 width of `c`.
                offset += c.len_utf8();
                // `Script::Any` characters inherit the surrounding script:
                // they never update `last_script`.
                if script != Some(Script::Any) {
                    last_script = script;
                }
                result
            })
            .collect();
        // Close the final segment at the end of the string.
        ranges.push(normalized.get().len());
        // Turn each pair of consecutive boundaries into a sub-slice of the
        // normalized string.
        Ok(ranges
            .windows(2)
            .map(|item| {
                normalized
                    .slice(Range::Normalized(item[0]..item[1]))
                    .expect("NormalizedString bad split")
            })
            .collect::<Vec<_>>())
    })
}
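
For context, a minimal usage sketch (not part of the file above) showing how this splitter groups a mixed-script string. It assumes the public `tokenizers` crate API, with `UnicodeScripts`, `PreTokenizedString`, the `PreTokenizer` trait, and the offset enums importable as below, and uses `get_splits` to inspect the result.

use tokenizers::pre_tokenizers::unicode_scripts::UnicodeScripts;
use tokenizers::tokenizer::{PreTokenizedString, PreTokenizer};
use tokenizers::{OffsetReferential, OffsetType};

fn main() -> tokenizers::Result<()> {
    let pretok = UnicodeScripts::new();
    // Mixed Japanese/Latin input. "。" has Unicode script Common, which
    // `fixed_script` maps to `Script::Any`, so it merges into the
    // preceding segment instead of forming its own split.
    let mut pretokenized = PreTokenizedString::from("どこで生れ。Yes");
    pretok.pre_tokenize(&mut pretokenized)?;
    for (slice, offsets, _) in
        pretokenized.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
    {
        println!("{slice:?} {offsets:?}");
    }
    // Expected (byte offsets): "どこで生れ。" (0, 18) and "Yes" (18, 21).
    Ok(())
}

Note that `fixed_script` also folds Hiragana and Katakana into Han, which is why the Japanese run above stays in one segment rather than splitting at the Hiragana/Han transitions.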