in tokenizers/src/tokenizer/normalizer.rs [1511:1854]
/// Exercises `NormalizedString::transform_range` on the single-byte (ASCII)
/// input "Hello friend", covering removals and insertions at the start, in the
/// middle and at the end of the string.
///
/// Every scenario starts from a fresh copy of the same base string and checks
/// both the resulting `NormalizedString` (normalized text plus `alignments`)
/// and the value returned by `alignments_original()`.
fn transform_range_single_bytes() {
    /// Builds `[(from, from + 1), .., (to - 1, to)]`: one unit-width span per index.
    fn unit_spans(from: usize, to: usize) -> Vec<(usize, usize)> {
        (from..to).map(|i| (i, i + 1)).collect()
    }

    // All scenarios share the same `original` text and a zero `original_shift`;
    // only the normalized text and its alignments vary.
    let expected = |normalized: &str, alignments: Vec<(usize, usize)>| NormalizedString {
        original: "Hello friend".into(),
        normalized: normalized.into(),
        alignments,
        original_shift: 0,
    };

    let base = NormalizedString::from("Hello friend");

    // Removing at the beginning: original range 0..4 ("Hell") becomes 'Y'.
    {
        let mut trimmed = base.clone();
        trimmed.transform_range(Range::Original(0..4), vec![('Y', 0)], 3);
        // Normalized bytes map to original spans (3, 4) through (11, 12).
        assert_eq!(trimmed, expected("Yo friend", unit_spans(3, 12)));
        // The first three original bytes map to the empty span (0, 0).
        assert_eq!(
            trimmed.alignments_original(),
            [vec![(0, 0); 3], unit_spans(0, 9)].concat()
        );
    }

    // Removing in the middle: original range 3..10 becomes "_FR".
    {
        let mut hollowed = base.clone();
        hollowed.transform_range(
            Range::Original(3..10),
            vec![('_', 0), ('F', 0), ('R', -2)],
            2,
        );
        assert_eq!(
            hollowed,
            expected(
                "Hel_FRnd",
                [unit_spans(0, 3), unit_spans(5, 8), unit_spans(10, 12)].concat()
            )
        );
        // Original bytes 3, 4 and 8, 9 end up with empty spans.
        assert_eq!(
            hollowed.alignments_original(),
            [
                unit_spans(0, 3),
                vec![(3, 3); 2],
                unit_spans(3, 6),
                vec![(6, 6); 2],
                unit_spans(6, 8),
            ]
            .concat()
        );
    }

    // Removing at the end: everything from original offset 5 becomes "_F".
    {
        let mut truncated = base.clone();
        truncated.transform_range(Range::Original(5..), vec![('_', 0), ('F', -5)], 0);
        assert_eq!(truncated, expected("Hello_F", unit_spans(0, 7)));
        // The last five original bytes map to the empty span (7, 7).
        assert_eq!(
            truncated.alignments_original(),
            [unit_spans(0, 7), vec![(7, 7); 5]].concat()
        );
    }

    // Adding at the beginning: the new leading 'H' gets the empty span (0, 0).
    {
        let mut prefixed = base.clone();
        prefixed.transform_range(Range::Original(0..1), vec![('H', 1), ('H', 0)], 0);
        assert_eq!(
            prefixed,
            expected("HHello friend", [vec![(0, 0)], unit_spans(0, 12)].concat())
        );
        assert_eq!(prefixed.alignments_original(), unit_spans(1, 13));
    }

    // Equivalent to the previous one, expressed as an insertion into the empty range 0..0.
    {
        let mut prefixed = base.clone();
        prefixed.transform_range(Range::Original(0..0), vec![('H', 1)], 0);
        assert_eq!(
            prefixed,
            expected("HHello friend", [vec![(0, 0)], unit_spans(0, 12)].concat())
        );
        assert_eq!(prefixed.alignments_original(), unit_spans(1, 13));
    }

    // Adding as part of the first character: both leading 'H's share span (0, 1).
    {
        let mut doubled = base.clone();
        doubled.transform_range(Range::Original(0..1), vec![('H', 0), ('H', 1)], 0);
        assert_eq!(
            doubled,
            expected("HHello friend", [vec![(0, 1)], unit_spans(0, 12)].concat())
        );
        assert_eq!(
            doubled.alignments_original(),
            [vec![(0, 2)], unit_spans(2, 13)].concat()
        );
    }

    // Adding in the middle: original byte 5 expands to the four bytes "_my_".
    {
        let mut widened = base.clone();
        widened.transform_range(
            Range::Original(5..6),
            vec![('_', 0), ('m', 1), ('y', 1), ('_', 1)],
            0,
        );
        assert_eq!(
            widened,
            expected(
                "Hello_my_friend",
                [unit_spans(0, 5), vec![(5, 6); 4], unit_spans(6, 12)].concat()
            )
        );
        assert_eq!(
            widened.alignments_original(),
            [unit_spans(0, 5), vec![(5, 9)], unit_spans(9, 15)].concat()
        );
    }

    // Adding at the end: the final 'd' expands to "d_!". This is the last
    // scenario, so it consumes the base string instead of cloning it.
    {
        let mut appended = base;
        appended.transform_range(Range::Original(11..), vec![('d', 0), ('_', 1), ('!', 1)], 0);
        assert_eq!(
            appended,
            expected(
                "Hello friend_!",
                [unit_spans(0, 11), vec![(11, 12); 3]].concat()
            )
        );
        assert_eq!(
            appended.alignments_original(),
            [unit_spans(0, 11), vec![(11, 14)]].concat()
        );
    }
}