in tokenizers/src/pre_tokenizers/metaspace.rs [268:355]
/// Checks `Metaspace` when the prepend scheme is given explicitly: first the
/// `set_prepend_scheme` setter, then pre-tokenization of inputs in which the
/// `<s>` marker has been isolated into its own section beforehand.
fn non_legacy_meta_space() {
    // After each call to the setter, the instance must compare equal to one
    // built directly with the same scheme.
    let mut metaspace = Metaspace::new('▁', PrependScheme::Always, true);
    metaspace.set_prepend_scheme(PrependScheme::Always);
    assert_eq!(metaspace, Metaspace::new('▁', PrependScheme::Always, true));
    metaspace.set_prepend_scheme(PrependScheme::Never);
    assert_eq!(metaspace, Metaspace::new('▁', PrependScheme::Never, true));
    metaspace.set_prepend_scheme(PrependScheme::First);
    assert_eq!(metaspace, Metaspace::new('▁', PrependScheme::First, true));

    // Pattern used to cut every `<s>` out as a standalone section.
    let special_token = Regex::new(r"(<s>)").unwrap();

    // Case 1: `First` scheme with the third constructor argument set to `false`
    // (NOTE(review): presumably the "split" flag; confirm against `Metaspace::new`).
    let first_only = Metaspace::new('▁', PrependScheme::First, false);
    let mut sections = PreTokenizedString::from("Hey my friend <s>how▁are you");
    sections
        .split(|_, section| section.split(&special_token, SplitDelimiterBehavior::Isolated))
        .expect("Bad split");
    first_only.pre_tokenize(&mut sections).unwrap();
    let actual: Vec<_> = sections
        .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
        .into_iter()
        .map(|(piece, offsets, _)| (piece, offsets))
        .collect();
    // Expected: the three sections stay whole, spaces become `▁`, and only the
    // very first section gains a leading `▁`. Offsets are in bytes, and `▁`
    // takes 3 bytes in UTF-8.
    assert_eq!(
        actual,
        vec![
            ("▁Hey▁my▁friend▁", (0, 23)),
            ("<s>", (23, 26)),
            ("how▁are▁you", (26, 41))
        ]
    );

    // Case 2: run an `Always` / `true` instance over the SAME, already
    // pre-tokenized value.
    let always = Metaspace::new('▁', PrependScheme::Always, true);
    always.pre_tokenize(&mut sections).unwrap();
    let actual: Vec<_> = sections
        .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
        .into_iter()
        .map(|(piece, offsets, _)| (piece, offsets))
        .collect();
    // Expected: one piece per `▁`-led chunk, with `<s>` and `how` now carrying
    // a `▁` prefix of their own (hence the larger end offset).
    assert_eq!(
        actual,
        vec![
            ("▁Hey", (0, 6)),
            ("▁my", (6, 11)),
            ("▁friend", (11, 20)),
            ("▁", (20, 23)),
            ("▁<s>", (23, 29)),
            ("▁how", (29, 35)),
            ("▁are", (35, 41)),
            ("▁you", (41, 47))
        ]
    );

    // Case 3: `First` / `false` again, on an input that already begins with a
    // space.
    let first_only = Metaspace::new('▁', PrependScheme::First, false);
    let mut sections = PreTokenizedString::from(" Hey <s>how");
    sections
        .split(|_, section| section.split(&special_token, SplitDelimiterBehavior::Isolated))
        .expect("Bad split");
    first_only.pre_tokenize(&mut sections).unwrap();
    let actual: Vec<_> = sections
        .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
        .into_iter()
        .map(|(piece, offsets, _)| (piece, offsets))
        .collect();
    // Expected: a single leading `▁` on the first piece (no doubled marker),
    // and nothing prepended to the sections that follow.
    assert_eq!(
        actual,
        vec![("▁Hey▁", (0, 9)), ("<s>", (9, 12)), ("how", (12, 15))]
    );

    // Case 4: same pre-tokenizer, input with several `<s>` markers.
    let mut sections = PreTokenizedString::from(" Hey <s>how <s>are <s> you");
    sections
        .split(|_, section| section.split(&special_token, SplitDelimiterBehavior::Isolated))
        .expect("Bad split");
    first_only.pre_tokenize(&mut sections).unwrap();
    let actual: Vec<_> = sections
        .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
        .into_iter()
        .map(|(piece, offsets, _)| (piece, offsets))
        .collect();
    // Expected: every `▁` in the output corresponds to a space in the input;
    // `how` and `are` directly follow `<s>` and receive no prefix.
    assert_eq!(
        actual,
        vec![
            ("▁Hey▁", (0, 9)),
            ("<s>", (9, 12)),
            ("how▁", (12, 18)),
            ("<s>", (18, 21)),
            ("are▁", (21, 27)),
            ("<s>", (27, 30)),
            ("▁you", (30, 36))
        ]
    );
}