in tokenizers/src/tokenizer/normalizer.rs [1236:1286]
fn added_characters_alignment() {
    let mut n = NormalizedString::from("野口 No");
    // Insert a space before and after every character above U+4E00 (here, the
    // two CJK ideographs); leave all other characters untouched.
    n.transform(
        n.get().to_owned().chars().flat_map(|c| {
            if (c as usize) > 0x4E00 {
                vec![(' ', 0), (c, 1), (' ', 1)]
            } else {
                vec![(c, 0)]
            }
        }),
        0,
    );
    // `alignments` maps each byte of the normalized string to a byte span in
    // the original string; the added spaces share the span of their ideograph.
    assert_eq!(
        n,
        NormalizedString {
            original: "野口 No".into(),
            normalized: " 野  口  No".into(),
            alignments: vec![
                (0, 3),
                (0, 3),
                (0, 3),
                (0, 3),
                (0, 3),
                (3, 6),
                (3, 6),
                (3, 6),
                (3, 6),
                (3, 6),
                (6, 7),
                (7, 8),
                (8, 9)
            ],
            original_shift: 0,
        }
    );
    // The inverse view: each byte of the original string maps to the span of
    // normalized bytes it produced.
    assert_eq!(
        n.alignments_original(),
        vec![
            (0, 5),
            (0, 5),
            (0, 5),
            (5, 10),
            (5, 10),
            (5, 10),
            (10, 11),
            (11, 12),
            (12, 13)
        ]
    );
}
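
For context on what the asserted vectors encode: `alignments` maps each byte of the normalized string to a byte span of the original string, and `alignments_original` is the inverse view. The following is a minimal standalone sketch, not the library's implementation, showing how such a per-byte table projects a normalized byte range back onto the original text; the helper name `project_to_original` is hypothetical.

/// Given per-byte alignments (normalized byte -> original span), compute the
/// original span covered by a normalized byte range.
/// Standalone illustration only; not part of the tokenizers API.
fn project_to_original(
    alignments: &[(usize, usize)],
    range: std::ops::Range<usize>,
) -> Option<(usize, usize)> {
    let slice = alignments.get(range)?;
    Some((slice.first()?.0, slice.last()?.1))
}

fn main() {
    // The alignments asserted in the test above: 13 normalized bytes
    // (" 野  口  No") mapping back to the 9 bytes of "野口 No".
    let alignments = vec![
        (0, 3), (0, 3), (0, 3), (0, 3), (0, 3),
        (3, 6), (3, 6), (3, 6), (3, 6), (3, 6),
        (6, 7), (7, 8), (8, 9),
    ];
    // Normalized bytes 0..5 (" 野 ") cover original bytes 0..3 ("野").
    assert_eq!(project_to_original(&alignments, 0..5), Some((0, 3)));
    // Normalized bytes 10..13 (" No") cover original bytes 6..9 (" No").
    assert_eq!(project_to_original(&alignments, 10..13), Some((6, 9)));
}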