in packages/core/src/text/tokenizer.ts [6:81]
/**
 * Splits a string into tokens, where each token is a run of characters that
 * must be kept together.
 *
 * The string is walked code point by code point. The current character is
 * appended to the running token when any of the following holds; otherwise
 * the running token is emitted and a new one starts at the current character:
 * - the previous character satisfies `Tokenizer.isPunctuationStart`;
 * - the current character satisfies `Tokenizer.isPunctuationEnd`;
 * - both characters satisfy `Tokenizer.isDigitOrLetter`;
 * - the current character satisfies `Tokenizer.isBreakingSpace`
 *   (spaces always trail the token that precedes them);
 * - the current character is a variation selector, a ZWJ or an emoji modifier;
 * - the previous character is a ZWJ (so the joined character stays attached);
 * - the current character is the second regional indicator of a flag pair.
 *
 * @param str - Text to tokenize.
 * @returns The tokens in source order. Concatenating them yields `str`.
 *   An empty string yields an empty array.
 */
static parse(str: string): string[] {
  if (str.length === 0) {
    return []
  }
  const tokens: string[] = []
  // Iterate by code point rather than UTF-16 code unit so surrogate pairs stay whole.
  const chars = Array.from(str)
  const n = chars.length

  // State describing the previous character.
  let prevChar = chars[0]
  // Array.from(str) only yields non-empty strings, so codePointAt(0) is always defined.
  const firstCharCode = prevChar.codePointAt(0)!
  let prevCharIsDigitOrLetter = Tokenizer.isDigitOrLetter(prevChar)
  let prevCharIsZWJ = Tokenizer.isZWJ(firstCharCode)
  // Number of consecutive regional indicators ending at the previous character.
  let regionalIndicatorRun = Tokenizer.isRegionalIndicator(firstCharCode) ? 1 : 0
  // Token currently being accumulated.
  let currentToken = prevChar

  for (let i = 1; i < n; i++) {
    const currentChar = chars[i]
    const currentCharCode = currentChar.codePointAt(0)!
    const currentCharIsDigitOrLetter = Tokenizer.isDigitOrLetter(currentChar)
    const currentCharIsWhitespace = Tokenizer.isBreakingSpace(currentCharCode)
    const currentCharIsRegionalIndicator = Tokenizer.isRegionalIndicator(currentCharCode)
    const currentCharIsZWJ = Tokenizer.isZWJ(currentCharCode)

    const shouldKeep =
      Tokenizer.isPunctuationStart(prevChar) // previous char is opening punctuation
      || Tokenizer.isPunctuationEnd(currentChar) // current char is closing punctuation
      || (prevCharIsDigitOrLetter && currentCharIsDigitOrLetter) // run of letters/digits
      || currentCharIsWhitespace // first or repeated space trails the preceding token
      || Tokenizer.isVariationSelector(currentCharCode) // emoji variation selector
      // Flags are PAIRS of regional indicators: join only when the previous
      // indicator is still unpaired, so adjacent flags become separate tokens.
      || (currentCharIsRegionalIndicator && regionalIndicatorRun % 2 === 1)
      || currentCharIsZWJ // joiner attaches to what precedes it
      // ...and the character after a joiner attaches too; without this a
      // sequence such as U+1F468 U+200D U+1F469 is cut right after the ZWJ.
      || prevCharIsZWJ
      || Tokenizer.isEmojiModifier(currentCharCode) // emoji modifier (e.g. skin tone)

    prevChar = currentChar
    prevCharIsDigitOrLetter = currentCharIsDigitOrLetter
    prevCharIsZWJ = currentCharIsZWJ
    regionalIndicatorRun = currentCharIsRegionalIndicator ? regionalIndicatorRun + 1 : 0

    if (shouldKeep) {
      currentToken += currentChar
    } else {
      tokens.push(currentToken)
      currentToken = currentChar
    }
  }

  // currentToken always holds at least one character here, so flush unconditionally.
  tokens.push(currentToken)
  return tokens
}