public BertPreTokenizer()

in Runtime/Tokenizers/PreTokenizers/PreTokenizers.cs [116:124]


        /// <summary>
        /// Creates a BERT pre-tokenizer and builds the regular expression it matches with:
        /// runs of characters that are neither whitespace nor punctuation, or a single
        /// punctuation character on its own.
        /// </summary>
        /// <param name="config">Pre-tokenizer configuration, forwarded to the base constructor.</param>
        public BertPreTokenizer(JObject config) : base(config)
        {
            // Construct a pattern which matches the rust implementation:
            // https://github.com/huggingface/tokenizers/blob/b4fcc9ce6e4ad5806e82826f816acfdfdc4fcc67/tokenizers/src/pre_tokenizers/bert.rs#L11
            // Equivalent to removing whitespace and splitting on punctuation (both \p{P} and other ASCII characters).
            //
            // \p{P} alone is not enough: several ASCII punctuation characters are classified by
            // Unicode as symbols (\p{S}) rather than punctuation, e.g. $ + < = > ^ ` | ~.
            // The four ASCII punctuation ranges are therefore listed explicitly:
            //   \u0021-\u002F  ! " # $ % & ' ( ) * + , - . /
            //   \u003A-\u0040  : ; < = > ? @
            //   \u005B-\u0060  [ \ ] ^ _ `
            //   \u007B-\u007E  { | } ~
            string punctuationRegex = "\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E";
            this.pattern = new Regex($"[^\\s{punctuationRegex}]+|[{punctuationRegex}]", RegexOptions.Compiled | RegexOptions.Multiline);
        }