fn test_unk_token()

in tokenizers/src/models/unigram/trainer.rs [735:792]


    fn test_unk_token() {
        // Checks where `[UNK]` ends up in the trained vocabulary for three
        // trainer configurations. Every scenario trains on the same tiny corpus.
        let corpus = || vec![("The".into(), 12), ("are".into(), 11)];

        // Scenario 1: `unk_token` is set but absent from the special tokens, so
        // it is expected first, ahead of the declared special tokens.
        {
            let specials = vec![
                AddedToken::from("[SEP]", true),
                AddedToken::from("[CLS]", true),
            ];
            let trainer = UnigramTrainerBuilder::default()
                .show_progress(false)
                .special_tokens(specials)
                .unk_token(Some("[UNK]".into()))
                .build()
                .unwrap();

            let mut model = Unigram::default();
            trainer.do_train(corpus(), &mut model).unwrap();

            let mut vocab = model.iter();
            for expected in ["[UNK]", "[SEP]", "[CLS]"].iter().copied() {
                assert_eq!(vocab.next(), Some(&(expected.into(), 0.0)));
            }
        }

        // Scenario 2: `unk_token` is already listed among the special tokens,
        // so it is expected to stay at the position where it was declared.
        {
            let specials = vec![
                AddedToken::from("[SEP]", true),
                AddedToken::from("[CLS]", true),
                AddedToken::from("[UNK]", true),
            ];
            let trainer = UnigramTrainerBuilder::default()
                .show_progress(false)
                .special_tokens(specials)
                .unk_token(Some("[UNK]".into()))
                .build()
                .unwrap();

            let mut model = Unigram::default();
            trainer.do_train(corpus(), &mut model).unwrap();

            let mut vocab = model.iter();
            for expected in ["[SEP]", "[CLS]", "[UNK]"].iter().copied() {
                assert_eq!(vocab.next(), Some(&(expected.into(), 0.0)));
            }
        }

        // Scenario 3: no `unk_token` and no special tokens are configured, so
        // the first vocabulary entry is expected to be a regular learned piece.
        {
            let trainer = UnigramTrainerBuilder::default()
                .show_progress(false)
                .build()
                .unwrap();

            let mut model = Unigram::default();
            trainer.do_train(corpus(), &mut model).unwrap();

            let mut vocab = model.iter();
            let first = vocab.next().unwrap();
            assert_eq!(first.0, "e");
        }
    }