fn test_encode2()

in tokenizers/src/models/unigram/model.rs [556:608]


    fn test_encode2() {
        // Vocabulary as (piece, score) pairs; index 0 is passed to
        // `Unigram::from` below as the unknown-token id.
        let vocab: Vec<_> = [
            ("<unk>", 0.0),
            ("ab", 0.0),
            ("cd", -0.1),
            ("abc", -0.2),
            ("a", -0.3),
            ("b", -0.4),
            ("c", -0.5),
            ("ABC", -0.5),
            // User-defined piece: its score is simply maxed out.
            ("abcdabcd", 20.0),
            ("q", 20.5),
            ("r", 20.5),
            ("qr", -0.5),
        ]
        .iter()
        .map(|&(piece, score)| (piece.to_string(), score))
        .collect();

        let mut model = Unigram::from(vocab, Some(0), false).unwrap();

        // Every expectation below must hold in both encoding modes.
        for is_optimized in [true, false] {
            model.set_optimized(is_optimized);
            println!("IsOptimized {is_optimized:?}");

            // `set_fuse_unk` has not been called yet on the first pass, and
            // every pass ends with it set to `true`; "AB" is expected to come
            // back as a single piece here.
            for (input, expected) in [("abc", vec!["abc"]), ("AB", vec!["AB"])] {
                assert_eq!(model.encode(input).unwrap(), expected);
            }

            // With fusing switched off, "AB" is expected as one piece per character.
            model.set_fuse_unk(false);
            assert_eq!(model.encode("AB").unwrap(), vec!["A", "B"]);

            // Fusing switched back on.
            model.set_fuse_unk(true);
            for (input, expected) in [
                ("AB", vec!["AB"]),
                ("abcd", vec!["ab", "cd"]),
                ("abcc", vec!["abc", "c"]),
                (
                    "xabcabaabcdd",
                    vec!["x", "abc", "ab", "a", "ab", "cd", "d"],
                ),
            ] {
                assert_eq!(model.encode(input).unwrap(), expected);
            }

            // Same contrast on a mixed ASCII / multi-byte input.
            model.set_fuse_unk(false);
            assert_eq!(
                model.encode("xyz東京").unwrap(),
                vec!["x", "y", "z", "東", "京"]
            );

            model.set_fuse_unk(true);
            for (input, expected) in [
                ("xyz東京", vec!["xyz東京"]),
                // User encoded in original version
                ("ABC", vec!["ABC"]),
                ("abABCcd", vec!["ab", "ABC", "cd"]),
                ("ababcdabcdcd", vec!["ab", "abcdabcd", "cd"]),
                ("abqrcd", vec!["ab", "q", "r", "cd"]),
            ] {
                assert_eq!(model.encode(input).unwrap(), expected);
            }
        }
    }