in tokenizers/src/tokenizer/normalizer.rs [1857:2281]
fn transform_range_multiple_bytes() {
let s = NormalizedString::from("πΎπ π π");
// Removing at the beginning
let mut current = s.clone();
current.transform_range(Range::Original(0..8), vec![('G', -1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "Gπ π".into(),
alignments: vec![
(0, 4),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 1),
(0, 1),
(0, 1),
(0, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 1),
(1, 5),
(1, 5),
(1, 5),
(1, 5),
(5, 9),
(5, 9),
(5, 9),
(5, 9)
]
);
assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "G");
assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "G");
assert_eq!(
current.get_range_original(Range::Original(0..4)).unwrap(),
"πΎ"
);
assert_eq!(
current.get_range_original(Range::Original(0..8)).unwrap(),
"πΎπ "
);
// Removing in the middle
let mut current = s.clone();
current.transform_range(Range::Original(4..12), vec![('o', -1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "πΎoπ".into(),
alignments: vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 5),
(4, 5),
(4, 5),
(4, 5),
(5, 5),
(5, 5),
(5, 5),
(5, 5),
(5, 9),
(5, 9),
(5, 9),
(5, 9)
]
);
// Removing at the end
let mut current = s.clone();
current.transform_range(Range::Original(12..), vec![('d', 0), ('!', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "πΎπ π d!".into(),
alignments: vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
// Adding at the beginning
let mut current = s.clone();
current.transform_range(Range::Original(0..4), vec![('_', 1), ('πΎ', 0)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "_πΎπ π π".into(),
alignments: vec![
(0, 0),
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(1, 5),
(1, 5),
(1, 5),
(1, 5),
(5, 9),
(5, 9),
(5, 9),
(5, 9),
(9, 13),
(9, 13),
(9, 13),
(9, 13),
(13, 17),
(13, 17),
(13, 17),
(13, 17)
]
);
assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "πΎπ ");
assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "πΎ");
assert_eq!(
current.get_range_original(Range::Original(0..4)).unwrap(),
"πΎ"
);
assert_eq!(
current.get_range_original(Range::Original(0..8)).unwrap(),
"πΎπ "
);
// Equivalent to the previous one
let mut current = s.clone();
current.transform_range(Range::Original(0..0), vec![('_', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "_πΎπ π π".into(),
alignments: vec![
(0, 0),
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(1, 5),
(1, 5),
(1, 5),
(1, 5),
(5, 9),
(5, 9),
(5, 9),
(5, 9),
(9, 13),
(9, 13),
(9, 13),
(9, 13),
(13, 17),
(13, 17),
(13, 17),
(13, 17)
]
);
assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "πΎπ ");
assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "πΎ");
assert_eq!(
current.get_range_original(Range::Original(0..4)).unwrap(),
"πΎ"
);
assert_eq!(
current.get_range_original(Range::Original(0..8)).unwrap(),
"πΎπ "
);
// Adding as part of the first character
let mut current = s.clone();
current.transform_range(Range::Original(0..4), vec![('πΎ', 0), ('o', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "πΎoπ π π".into(),
alignments: vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 5),
(0, 5),
(0, 5),
(0, 5),
(5, 9),
(5, 9),
(5, 9),
(5, 9),
(9, 13),
(9, 13),
(9, 13),
(9, 13),
(13, 17),
(13, 17),
(13, 17),
(13, 17)
]
);
assert_eq!(current.get_range(Range::Original(0..8)).unwrap(), "πΎoπ ");
assert_eq!(current.get_range(Range::Original(0..4)).unwrap(), "πΎo");
assert_eq!(
current.get_range_original(Range::Original(0..4)).unwrap(),
"πΎ"
);
assert_eq!(
current.get_range_original(Range::Original(0..8)).unwrap(),
"πΎπ "
);
// Adding in the middle
let mut current = s.clone();
current.transform_range(
Range::Original(4..8),
vec![('π ', 0), ('o', 1), ('o', 1), ('o', 1)],
0,
);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "πΎπ oooπ π".into(),
alignments: vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 11),
(4, 11),
(4, 11),
(4, 11),
(11, 15),
(11, 15),
(11, 15),
(11, 15),
(15, 19),
(15, 19),
(15, 19),
(15, 19)
]
);
// Adding at the end
let mut current = s;
current.transform_range(Range::Original(16..), vec![('!', 1)], 0);
assert_eq!(
current,
NormalizedString {
original: "πΎπ π π".into(),
normalized: "πΎπ π π!".into(),
alignments: vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 16),
(12, 16),
(12, 16),
(12, 16),
(12, 16)
],
original_shift: 0,
}
);
assert_eq!(
current.alignments_original(),
vec![
(0, 4),
(0, 4),
(0, 4),
(0, 4),
(4, 8),
(4, 8),
(4, 8),
(4, 8),
(8, 12),
(8, 12),
(8, 12),
(8, 12),
(12, 17),
(12, 17),
(12, 17),
(12, 17)
]
);
}