in Runtime/Tokenizers/PreTokenizers/PreTokenizers.cs [116:124]
/// <summary>
/// Creates a BERT pre-tokenizer and builds the regular expression used to split text.
/// </summary>
/// <param name="config">Pre-tokenizer configuration; forwarded unchanged to the base constructor.</param>
public BertPreTokenizer(JObject config) : base(config)
{
    // Construct a pattern which matches the rust implementation:
    // https://github.com/huggingface/tokenizers/blob/b4fcc9ce6e4ad5806e82826f816acfdfdc4fcc67/tokenizers/src/pre_tokenizers/bert.rs#L11
    // Equivalent to removing whitespace and splitting on punctuation (both \p{P} and other ASCII characters).
    //
    // \p{P} alone is not sufficient: ASCII characters such as $ + < = > ^ ` | ~ are classified
    // as symbols (\p{S}) rather than punctuation, yet the rust implementation treats every
    // ASCII punctuation character as a split point. The four explicit ranges below cover the
    // full ASCII punctuation set: U+0021-U+002F, U+003A-U+0040, U+005B-U+0060, U+007B-U+007E.
    string punctuationRegex = "\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E";

    // First alternative: a run of characters that are neither whitespace nor punctuation.
    // Second alternative: exactly one punctuation character, so each one becomes its own match.
    this.pattern = new Regex($"[^\\s{punctuationRegex}]+|[{punctuationRegex}]", RegexOptions.Compiled | RegexOptions.Multiline);
}