in core/src/tokenization.rs [538:648]
fn tokenizer() {
    // Integration test: fetches the BAAI/bge-m3 tokenizer from the Hugging
    // Face Hub (network access and Hub credentials from the environment are
    // required) and checks that `into_tokens` turns an encoding of a Chinese
    // sentence into the expected `SimpleToken`s — ids, decoded text, byte
    // spans, and the special <s>/</s> markers.
    let api = ApiBuilder::from_env().build().unwrap();
    let filename = api
        .model("BAAI/bge-m3".to_string())
        .get("tokenizer.json")
        .unwrap();
    // Every character in this sentence encodes to 3 UTF-8 bytes, so the
    // expected byte offsets below advance in multiples of 3.
    let string = "这是一个文本向量化的测试句子";
    let tokenizer = Tokenizer::from_file(filename).unwrap();
    let encoded = tokenizer.encode(string, true).unwrap();
    // Offsets are byte ranges into `string`. The (0, 0) entries at both ends
    // belong to the special tokens, which have no source span.
    assert_eq!(
        encoded.get_offsets(),
        vec![
            (0, 0),
            (0, 3),
            (0, 12),
            (12, 18),
            (18, 21),
            (21, 24),
            (24, 30),
            (30, 36),
            (36, 39),
            (39, 42),
            (0, 0)
        ]
    );
    // `string` is already a `&str`; the previous `&string` argument was a
    // `&&str` that only compiled via deref coercion (clippy: needless_borrow).
    let tokens = into_tokens(encoded, string);
    // Special tokens carry `start`/`stop` of `None`; real tokens carry the
    // byte span their text occupies in the input.
    assert_eq!(
        tokens,
        vec![
            SimpleToken {
                id: 0,
                text: "<s>".to_string(),
                special: true,
                start: None,
                stop: None
            },
            SimpleToken {
                id: 6,
                text: "这".to_string(),
                special: false,
                start: Some(0),
                stop: Some(3)
            },
            SimpleToken {
                id: 100013,
                text: "这是一个".to_string(),
                special: false,
                start: Some(0),
                stop: Some(12)
            },
            SimpleToken {
                id: 189061,
                text: "文本".to_string(),
                special: false,
                start: Some(12),
                stop: Some(18)
            },
            SimpleToken {
                id: 2110,
                text: "向".to_string(),
                special: false,
                start: Some(18),
                stop: Some(21)
            },
            SimpleToken {
                id: 3272,
                text: "量".to_string(),
                special: false,
                start: Some(21),
                stop: Some(24)
            },
            SimpleToken {
                id: 41904,
                text: "化的".to_string(),
                special: false,
                start: Some(24),
                stop: Some(30)
            },
            SimpleToken {
                id: 49125,
                text: "测试".to_string(),
                special: false,
                start: Some(30),
                stop: Some(36)
            },
            SimpleToken {
                id: 27683,
                text: "句".to_string(),
                special: false,
                start: Some(36),
                stop: Some(39)
            },
            SimpleToken {
                id: 1344,
                text: "子".to_string(),
                special: false,
                start: Some(39),
                stop: Some(42)
            },
            SimpleToken {
                id: 2,
                text: "</s>".to_string(),
                special: true,
                start: None,
                stop: None
            }
        ]
    );
}