in tokenizers/src/processors/template.rs [544:643]
/// Builds the output encodings by walking `template` one piece at a time.
///
/// * `Piece::Sequence` — selects the input encoding at index 0 for `Sequence::A` and at
///   index 1 for any other sequence id, overwrites all of its type ids with the piece's
///   `type_id`, sets its sequence id to that index, and emits a clone of it.
/// * `Piece::SpecialToken` — when `add_special_tokens` is `true`, emits a fresh encoding
///   built from the registered special token (every position gets the piece's `type_id`,
///   no word index, `(0, 0)` offsets, and `1` in both the special-tokens mask and the
///   attention mask). When `add_special_tokens` is `false`, the piece is skipped.
///
/// The resulting encodings are returned in template order and are not merged here.
///
/// # Panics
///
/// Panics if the template references a sequence whose index is out of bounds for
/// `encodings` (e.g. a non-`A` sequence when only one encoding was supplied).
/// The special-token lookup also uses indexing; the template is expected to reference
/// only registered special tokens. NOTE(review): confirm that this validation covers
/// every construction path.
fn apply_template(
    &self,
    template: &[Piece],
    mut encodings: Vec<Encoding>,
    add_special_tokens: bool,
) -> Result<Vec<Encoding>> {
    let final_encodings: Vec<Encoding> = template
        .iter()
        .filter_map(|piece| match piece {
            Piece::Sequence { id, type_id } => {
                // Sequence `A` maps to the first encoding, anything else to the second.
                let i = usize::from(*id != Sequence::A);
                let encoding = &mut encodings[i];
                encoding.set_type_ids(vec![*type_id; encoding.len()]);
                encoding.set_sequence_id(i);
                // Cloned rather than moved out: the input vector keeps its entry, so a
                // later piece resolving to the same index still finds it.
                Some(encoding.clone())
            }
            Piece::SpecialToken { id, type_id } => add_special_tokens.then(|| {
                // Existence of `id` is expected to have been checked before this call,
                // hence the direct indexing.
                let tok = &self.special_tokens.0[id];
                let len = tok.ids.len();
                Encoding::new(
                    tok.ids.clone(),
                    std::iter::repeat_n(*type_id, len).collect(),
                    tok.tokens.clone(),
                    // words
                    std::iter::repeat_n(None, len).collect(),
                    // offsets
                    std::iter::repeat_n((0, 0), len).collect(),
                    // special_tokens_mask
                    std::iter::repeat_n(1, len).collect(),
                    // attention_mask
                    std::iter::repeat_n(1, len).collect(),
                    // overflowing
                    vec![],
                    // sequence_range
                    AHashMap::new(),
                )
            }),
        })
        .collect();

    Ok(final_encodings)
}