in tokenizers/src/normalizers/byte_level.rs [56:173]
fn test_byte_level_normalize() {
    // "Hello " is ASCII plus a space; the 9 CJK characters are 3 UTF-8 bytes each.
    let original = "Hello 我今天能为你做什么";
    // Byte-level normalization maps every original byte to one char of the GPT-2
    // byte alphabet: the space becomes "Ġ" and each CJK byte becomes a printable
    // Latin-1 char (e.g. 天 = E5 A4 A9 -> "å¤©").
    let normalized = "HelloĠæĪĳä»Ĭå¤©èĥ½ä¸ºä½łåģļä»Ģä¹Ī";
    assert_ne!(original, normalized);
    let mut n = NormalizedString::from(original);
    let byte_level = ByteLevel::new();
    byte_level.normalize(&mut n).unwrap();
    assert_eq!(&n.get(), &normalized);
    // Expected state: the alignments map each byte of the normalized string back
    // to a byte range of the original string.
    assert_eq!(
        n,
        NormalizedString::new(
            original.to_string(),
            normalized.to_string(),
            vec![
                (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), // "Hello"
                (5, 6), (5, 6),                         // ' '  -> "Ġ" (2 bytes)
                (6, 9), (6, 9), (6, 9), (6, 9), (6, 9), (6, 9),             // 我 -> "æĪĳ"
                (9, 12), (9, 12), (9, 12), (9, 12), (9, 12), (9, 12),       // 今 -> "ä»Ĭ"
                (12, 15), (12, 15), (12, 15), (12, 15), (12, 15), (12, 15), // 天 -> "å¤©"
                (15, 18), (15, 18), (15, 18), (15, 18), (15, 18), (15, 18), // 能 -> "èĥ½"
                (18, 21), (18, 21), (18, 21), (18, 21), (18, 21), (18, 21), // 为 -> "ä¸º"
                (21, 24), (21, 24), (21, 24), (21, 24), (21, 24), (21, 24), // 你 -> "ä½ł"
                (24, 27), (24, 27), (24, 27), (24, 27), (24, 27), (24, 27), // 做 -> "åģļ"
                (27, 30), (27, 30), (27, 30), (27, 30), (27, 30), (27, 30), // 什 -> "ä»Ģ"
                (30, 33), (30, 33), (30, 33), (30, 33), (30, 33), (30, 33), // 么 -> "ä¹Ī"
            ],
            0, // original_shift
        )
    );
    // The reverse mapping: each byte of the original maps to a byte range of the
    // normalized string.
    assert_eq!(
        n.alignments_original(),
        vec![
            (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), // "Hello"
            (5, 7),                                 // ' '  -> "Ġ"
            (7, 13), (7, 13), (7, 13),    // 我 -> "æĪĳ" (6 normalized bytes)
            (13, 19), (13, 19), (13, 19), // 今 -> "ä»Ĭ"
            (19, 25), (19, 25), (19, 25), // 天 -> "å¤©"
            (25, 31), (25, 31), (25, 31), // 能 -> "èĥ½"
            (31, 37), (31, 37), (31, 37), // 为 -> "ä¸º"
            (37, 43), (37, 43), (37, 43), // 你 -> "ä½ł"
            (43, 49), (43, 49), (43, 49), // 做 -> "åģļ"
            (49, 55), (49, 55), (49, 55), // 什 -> "ä»Ģ"
            (55, 61), (55, 61), (55, 61), // 么 -> "ä¹Ī"
        ]
    );
}
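
// --- Illustrative sketch (not part of the original file) ---
// The expected strings above assume the GPT-2 style byte-to-unicode table:
// printable Latin-1 bytes map to themselves and every other byte is shifted into
// the U+0100.. range, so each original byte always becomes exactly one char.
// `bytes_to_unicode_sketch` is a hypothetical helper written here for
// illustration only; it is not an API of this crate.
#[cfg(test)]
fn bytes_to_unicode_sketch() -> std::collections::HashMap<u8, char> {
    let mut map = std::collections::HashMap::new();
    let mut shift = 0u32;
    for b in 0u32..=255 {
        let printable =
            (33..=126).contains(&b) || (161..=172).contains(&b) || (174..=255).contains(&b);
        let cp = if printable {
            b
        } else {
            shift += 1;
            255 + shift // remapped bytes are assigned U+0100, U+0101, ... in order
        };
        map.insert(b as u8, char::from_u32(cp).unwrap());
    }
    map
}

#[test]
fn test_bytes_to_unicode_sketch() {
    let map = bytes_to_unicode_sketch();
    // The space byte (0x20) is not printable, so it is remapped to 'Ġ' (U+0120),
    // which is why "Hello " normalizes to "HelloĠ" above.
    assert_eq!(map[&b' '], '\u{0120}');
    // 天 is the byte sequence E5 A4 A9; each of those bytes keeps its Latin-1
    // identity, giving the three chars "å¤©" (6 bytes of UTF-8) seen in the alignments.
    assert_eq!(map[&0xE5], 'å');
    assert_eq!(map[&0xA4], '¤');
    assert_eq!(map[&0xA9], '©');
}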