in tokenizers/src/tokenizer/encoding.rs [792:881]
fn mappings() {
let encoding = Encoding {
ids: vec![0; 11], // Needed for Encoding::len
tokens: vec![
// First sequence:
"He".into(),
"llo".into(),
"won".into(),
"der".into(),
"ful".into(),
"friend".into(),
"!".into(),
// Second sequence:
"How".into(),
"are".into(),
"you".into(),
"?".into(),
],
offsets: vec![
// First sequence:
(0, 2),
(2, 5),
(7, 10),
(10, 13),
(13, 16),
(17, 23),
(23, 24),
// Second sequence:
(0, 3),
(4, 7),
(8, 11),
(11, 12),
],
words: vec![
// First sequence:
Some(0),
Some(0),
Some(1),
Some(1),
Some(1),
Some(2),
Some(3),
// Second sequence:
Some(0),
Some(1),
Some(2),
Some(3),
],
sequence_ranges: AHashMap::from_iter(vec![(0, 0..7), (1, 7..11)]),
..Default::default()
};
assert_eq!(encoding.word_to_tokens(0, 0), Some((0, 2)));
assert_eq!(encoding.word_to_tokens(1, 0), Some((2, 5)));
assert_eq!(encoding.word_to_tokens(2, 0), Some((5, 6)));
assert_eq!(encoding.word_to_tokens(3, 0), Some((6, 7)));
assert_eq!(encoding.word_to_tokens(0, 1), Some((7, 8)));
assert_eq!(encoding.word_to_tokens(1, 1), Some((8, 9)));
assert_eq!(encoding.word_to_tokens(2, 1), Some((9, 10)));
assert_eq!(encoding.word_to_tokens(3, 1), Some((10, 11)));
assert_eq!(encoding.word_to_chars(0, 0), Some((0, 5)));
assert_eq!(encoding.word_to_chars(1, 0), Some((7, 16)));
assert_eq!(encoding.word_to_chars(0, 1), Some((0, 3)));
assert_eq!(encoding.word_to_chars(1, 1), Some((4, 7)));
assert_eq!(encoding.token_to_chars(0), Some((0, (0, 2))));
assert_eq!(encoding.token_to_chars(1), Some((0, (2, 5))));
assert_eq!(encoding.token_to_chars(7), Some((1, (0, 3))));
assert_eq!(encoding.token_to_chars(9), Some((1, (8, 11))));
assert_eq!(encoding.token_to_word(1), Some((0, 0)));
assert_eq!(encoding.token_to_word(2), Some((0, 1)));
assert_eq!(encoding.token_to_word(7), Some((1, 0)));
assert_eq!(encoding.token_to_word(9), Some((1, 2)));
assert_eq!(encoding.token_to_word(11), None);
assert_eq!(encoding.char_to_token(3, 0), Some(1));
assert_eq!(encoding.char_to_token(8, 0), Some(2));
assert_eq!(encoding.char_to_token(16, 0), None);
assert_eq!(encoding.char_to_token(23, 0), Some(6));
assert_eq!(encoding.char_to_token(2, 1), Some(7));
assert_eq!(encoding.char_to_token(9, 1), Some(9));
assert_eq!(encoding.char_to_word(3, 0), Some(0));
assert_eq!(encoding.char_to_word(8, 0), Some(1));
assert_eq!(encoding.char_to_word(16, 0), None);
assert_eq!(encoding.char_to_word(23, 0), Some(3));
assert_eq!(encoding.char_to_word(2, 1), Some(0));
assert_eq!(encoding.char_to_word(9, 1), Some(2));
}