in tokenizers/src/tokenizer/mod.rs [718:761]
/// Encodes one input sequence into a single `Encoding`.
///
/// Every piece of text goes through the same three steps: extraction of added
/// tokens plus normalization (`added_vocabulary.extract_and_normalize`),
/// pre-tokenization (`do_pre_tokenize`) and tokenization (`do_tokenize`).
///
/// * Pre-tokenized input (`PreTokenized`, `PreTokenizedOwned`,
///   `PreTokenizedCow`): each element is encoded on its own, its position in
///   the input is handed to `do_tokenize` as `Some(index)`, and the
///   per-element encodings are gathered into one `Encoding` by `collect()`.
/// * Raw input: the whole text is encoded in one go and `None` is handed to
///   `do_tokenize` in place of an index.
///
/// `type_id` and `offsets_type` are forwarded unchanged to `do_tokenize`.
///
/// # Errors
///
/// Propagates the first error returned by `do_pre_tokenize` or `do_tokenize`.
fn encode_single_sequence(
    &self,
    sequence: InputSequence,
    type_id: u32,
    offsets_type: OffsetType,
) -> Result<Encoding> {
    // Shared pipeline for one piece of text. `word_idx` is `Some(position)` for
    // an element of a pre-tokenized input and `None` for raw text.
    let encode_piece = |word_idx: Option<u32>, text: &str| -> Result<Encoding> {
        let normalized = self
            .added_vocabulary
            .extract_and_normalize(self.normalizer.as_ref(), text);
        let pre_tokenized = self.do_pre_tokenize(normalized)?;
        Ok(self.do_tokenize(pre_tokenized, type_id, word_idx, offsets_type)?)
    };

    // The three pre-tokenized variants only differ in how each element is
    // stored; every element is borrowed as `&str` when handed to the pipeline.
    // NOTE(review): `idx as u32` wraps silently past `u32::MAX` elements.
    match sequence {
        InputSequence::Raw(seq) => encode_piece(None, seq.as_ref()),
        InputSequence::PreTokenized(seq) => seq
            .iter()
            .enumerate()
            .map(|(idx, word)| encode_piece(Some(idx as u32), word))
            .collect(),
        InputSequence::PreTokenizedOwned(seq) => seq
            .iter()
            .enumerate()
            .map(|(idx, word)| encode_piece(Some(idx as u32), word))
            .collect(),
        InputSequence::PreTokenizedCow(seq) => seq
            .iter()
            .enumerate()
            .map(|(idx, word)| encode_piece(Some(idx as u32), word))
            .collect(),
    }
}