fn mappings()

in tokenizers/src/tokenizer/encoding.rs [792:881]


    fn mappings() {
        let encoding = Encoding {
            ids: vec![0; 11], // Needed for Encoding::len
            tokens: vec![
                // First sequence:
                "He".into(),
                "llo".into(),
                "won".into(),
                "der".into(),
                "ful".into(),
                "friend".into(),
                "!".into(),
                // Second sequence:
                "How".into(),
                "are".into(),
                "you".into(),
                "?".into(),
            ],
            offsets: vec![
                // First sequence:
                (0, 2),
                (2, 5),
                (7, 10),
                (10, 13),
                (13, 16),
                (17, 23),
                (23, 24),
                // Second sequence:
                (0, 3),
                (4, 7),
                (8, 11),
                (11, 12),
            ],
            words: vec![
                // First sequence:
                Some(0),
                Some(0),
                Some(1),
                Some(1),
                Some(1),
                Some(2),
                Some(3),
                // Second sequence:
                Some(0),
                Some(1),
                Some(2),
                Some(3),
            ],
            sequence_ranges: AHashMap::from_iter(vec![(0, 0..7), (1, 7..11)]),
            ..Default::default()
        };
        assert_eq!(encoding.word_to_tokens(0, 0), Some((0, 2)));
        assert_eq!(encoding.word_to_tokens(1, 0), Some((2, 5)));
        assert_eq!(encoding.word_to_tokens(2, 0), Some((5, 6)));
        assert_eq!(encoding.word_to_tokens(3, 0), Some((6, 7)));
        assert_eq!(encoding.word_to_tokens(0, 1), Some((7, 8)));
        assert_eq!(encoding.word_to_tokens(1, 1), Some((8, 9)));
        assert_eq!(encoding.word_to_tokens(2, 1), Some((9, 10)));
        assert_eq!(encoding.word_to_tokens(3, 1), Some((10, 11)));

        assert_eq!(encoding.word_to_chars(0, 0), Some((0, 5)));
        assert_eq!(encoding.word_to_chars(1, 0), Some((7, 16)));
        assert_eq!(encoding.word_to_chars(0, 1), Some((0, 3)));
        assert_eq!(encoding.word_to_chars(1, 1), Some((4, 7)));

        assert_eq!(encoding.token_to_chars(0), Some((0, (0, 2))));
        assert_eq!(encoding.token_to_chars(1), Some((0, (2, 5))));
        assert_eq!(encoding.token_to_chars(7), Some((1, (0, 3))));
        assert_eq!(encoding.token_to_chars(9), Some((1, (8, 11))));

        assert_eq!(encoding.token_to_word(1), Some((0, 0)));
        assert_eq!(encoding.token_to_word(2), Some((0, 1)));
        assert_eq!(encoding.token_to_word(7), Some((1, 0)));
        assert_eq!(encoding.token_to_word(9), Some((1, 2)));
        assert_eq!(encoding.token_to_word(11), None);

        assert_eq!(encoding.char_to_token(3, 0), Some(1));
        assert_eq!(encoding.char_to_token(8, 0), Some(2));
        assert_eq!(encoding.char_to_token(16, 0), None);
        assert_eq!(encoding.char_to_token(23, 0), Some(6));
        assert_eq!(encoding.char_to_token(2, 1), Some(7));
        assert_eq!(encoding.char_to_token(9, 1), Some(9));

        assert_eq!(encoding.char_to_word(3, 0), Some(0));
        assert_eq!(encoding.char_to_word(8, 0), Some(1));
        assert_eq!(encoding.char_to_word(16, 0), None);
        assert_eq!(encoding.char_to_word(23, 0), Some(3));
        assert_eq!(encoding.char_to_word(2, 1), Some(0));
        assert_eq!(encoding.char_to_word(9, 1), Some(2));
    }