in tokenizers/src/processors/template.rs [954:1128]
/// Checks that the BERT template post-processor decorates overflowing
/// encodings as well as the main encoding, for both a single sequence and a
/// pair of sequences.
///
/// Each input encoding carries exactly one overflowing encoding. The expected
/// results below show every main/overflowing combination wrapped with the
/// template's `[CLS]` / `[SEP]` tokens.
fn template_processing_overflowing() {
    use crate::Token;

    let processor = tests::get_bert_template();
    // 2 added tokens for `false`, 3 for `true` (presumably single vs. pair input).
    assert_eq!(processor.added_tokens(false), 2);
    assert_eq!(processor.added_tokens(true), 3);

    // First sequence: "Hello there", with "you" as its overflowing part.
    let mut first = Encoding::from_tokens(
        vec![
            Token::new(12, "Hello".into(), (0, 5)),
            Token::new(14, "there".into(), (6, 11)),
        ],
        0,
    );
    first.set_overflowing(vec![Encoding::from_tokens(
        vec![Token::new(13, "you".into(), (12, 15))],
        0,
    )]);

    // Second sequence: "pair with", with "info" as its overflowing part.
    let mut second = Encoding::from_tokens(
        vec![
            Token::new(15, "pair".into(), (0, 4)),
            Token::new(16, "with".into(), (5, 9)),
        ],
        0,
    );
    second.set_overflowing(vec![Encoding::from_tokens(
        vec![Token::new(17, "info".into(), (10, 14))],
        0,
    )]);

    // ---- Single sequence -------------------------------------------------
    let single_encoding = processor.process(first.clone(), None, true).unwrap();

    let expected_single = Encoding::new(
        vec![1, 12, 14, 0],
        vec![0; 4],
        vec![
            "[CLS]".into(),
            "Hello".into(),
            "there".into(),
            "[SEP]".into(),
        ],
        vec![None; 4],
        vec![(0, 0), (0, 5), (6, 11), (0, 0)],
        vec![1, 0, 0, 1],
        vec![1; 4],
        // The overflowing "you" gets its own [CLS] ... [SEP] wrapping.
        vec![Encoding::new(
            vec![1, 13, 0],
            vec![0; 3],
            vec!["[CLS]".into(), "you".into(), "[SEP]".into()],
            vec![None; 3],
            vec![(0, 0), (12, 15), (0, 0)],
            vec![1, 0, 1],
            vec![1; 3],
            vec![],
            AHashMap::from_iter([(0, 1..2)]),
        )],
        AHashMap::from_iter([(0, 1..3)]),
    );
    assert_eq!(single_encoding, expected_single);
    assert_eq!(single_encoding.token_to_sequence(2), Some(0));
    assert_eq!(single_encoding.token_to_sequence(3), None);

    // ---- Pair of sequences -----------------------------------------------
    let pair_encoding = processor.process(first, Some(second), true).unwrap();
    println!("{pair_encoding:#?}");

    // "[CLS] you [SEP] info [SEP]": both overflowing parts combined. This
    // encoding appears three times in the expected result, so it is built
    // once and cloned.
    // NOTE(review): "info" comes from the second sequence yet is expected with
    // type id 0 here, whereas pair tokens get type id 1 in the other expected
    // encodings — values are kept exactly as the test had them; confirm intent.
    let you_info = Encoding::new(
        vec![1, 13, 0, 17, 0],
        vec![0, 0, 0, 0, 1],
        vec![
            "[CLS]".into(),
            "you".into(),
            "[SEP]".into(),
            "info".into(),
            "[SEP]".into(),
        ],
        vec![None; 5],
        vec![(0, 0), (12, 15), (0, 0), (10, 14), (0, 0)],
        vec![1, 0, 1, 0, 1],
        vec![1; 5],
        vec![],
        AHashMap::from_iter([(0, 1..2), (1, 3..4)]),
    );

    // "[CLS] you [SEP] pair with [SEP]": overflowing first + main second.
    let you_pair = Encoding::new(
        vec![1, 13, 0, 15, 16, 0],
        vec![0, 0, 0, 1, 1, 1],
        vec![
            "[CLS]".into(),
            "you".into(),
            "[SEP]".into(),
            "pair".into(),
            "with".into(),
            "[SEP]".into(),
        ],
        vec![None; 6],
        vec![(0, 0), (12, 15), (0, 0), (0, 4), (5, 9), (0, 0)],
        vec![1, 0, 1, 0, 0, 1],
        vec![1; 6],
        vec![you_info.clone()],
        AHashMap::from_iter([(1, 3..5), (0, 1..2)]),
    );

    // "[CLS] Hello there [SEP] info [SEP]": main first + overflowing second.
    let hello_info = Encoding::new(
        vec![1, 12, 14, 0, 17, 0],
        vec![0, 0, 0, 0, 0, 1],
        vec![
            "[CLS]".into(),
            "Hello".into(),
            "there".into(),
            "[SEP]".into(),
            "info".into(),
            "[SEP]".into(),
        ],
        vec![None; 6],
        vec![(0, 0), (0, 5), (6, 11), (0, 0), (10, 14), (0, 0)],
        vec![1, 0, 0, 1, 0, 1],
        vec![1; 6],
        vec![you_info.clone()],
        AHashMap::from_iter([(0, 1..3), (1, 4..5)]),
    );

    // "[CLS] Hello there [SEP] pair with [SEP]" plus the three overflowing
    // combinations, in the order the processor is expected to emit them.
    let expected_pair = Encoding::new(
        vec![1, 12, 14, 0, 15, 16, 0],
        vec![0, 0, 0, 0, 1, 1, 1],
        vec![
            "[CLS]".into(),
            "Hello".into(),
            "there".into(),
            "[SEP]".into(),
            "pair".into(),
            "with".into(),
            "[SEP]".into(),
        ],
        vec![None; 7],
        vec![(0, 0), (0, 5), (6, 11), (0, 0), (0, 4), (5, 9), (0, 0)],
        vec![1, 0, 0, 1, 0, 0, 1],
        vec![1; 7],
        vec![you_pair, you_info, hello_info],
        AHashMap::from_iter([(0, 1..3), (1, 4..6)]),
    );
    assert_eq!(pair_encoding, expected_pair);

    // Special tokens (indices 3 and 6) map to no sequence.
    assert_eq!(pair_encoding.token_to_sequence(2), Some(0));
    assert_eq!(pair_encoding.token_to_sequence(3), None);
    assert_eq!(pair_encoding.token_to_sequence(4), Some(1));
    assert_eq!(pair_encoding.token_to_sequence(5), Some(1));
    assert_eq!(pair_encoding.token_to_sequence(6), None);
}