static parse()

in packages/core/src/text/tokenizer.ts [6:81]


  /**
   * Splits `str` into an ordered list of tokens.
   *
   * The string is walked code point by code point (via `Array.from`, so
   * surrogate pairs stay intact). Each code point is either appended to the
   * token currently being built or starts a new token. Concatenating the
   * returned tokens yields the original string.
   *
   * A code point is appended to the running token when any of these holds:
   * - the previous code point is leading punctuation (`isPunctuationStart`)
   * - the current code point is trailing punctuation (`isPunctuationEnd`)
   * - previous and current are both digits/letters (`isDigitOrLetter`)
   * - the current code point is breaking whitespace (`isBreakingSpace`),
   *   whether it is the first space after a token or one in a run of spaces
   * - the current code point is an emoji variation selector
   * - previous and current are both regional indicators (emoji flags)
   * - the current code point is a zero-width joiner
   * - the current code point is an emoji modifier
   *
   * NOTE(review): nothing here looks at whether the *previous* code point is
   * a ZWJ, so the code point following a ZWJ is only joined if another rule
   * applies (e.g. if `isPunctuationStart` accepts ZWJ) — confirm against the
   * helper definitions.
   *
   * @param str Text to tokenize.
   * @returns The tokens in source order; an empty array for an empty string.
   */
  static parse(str: string): string[] {
    if (str.length === 0) {
      return []
    }

    // `str` is non-empty, so there is always a first code point.
    const [head, ...rest] = Array.from(str)
    const headCode = head.codePointAt(0)!

    const result: string[] = []

    // Token currently being accumulated.
    let pending = head

    // Cached facts about the most recently consumed code point.
    let last = head
    let lastIsAlnum = Tokenizer.isDigitOrLetter(head)
    let lastIsSpace = Tokenizer.isBreakingSpace(headCode)
    let lastIsFlagHalf = Tokenizer.isRegionalIndicator(headCode)

    for (const ch of rest) {
      const code = ch.codePointAt(0)!
      const isAlnum = Tokenizer.isDigitOrLetter(ch)
      const isSpace = Tokenizer.isBreakingSpace(code)
      const isFlagHalf = Tokenizer.isRegionalIndicator(code)

      const joinsPending =
        Tokenizer.isPunctuationStart(last) // previous is leading punctuation
        || Tokenizer.isPunctuationEnd(ch) // current is trailing punctuation
        || (lastIsAlnum && isAlnum) // run of digits/letters
        || isSpace // whitespace always trails the token before it
        || Tokenizer.isVariationSelector(code) // emoji variation selector
        || (lastIsFlagHalf && isFlagHalf) // emoji flag (regional indicator pair)
        || Tokenizer.isZWJ(code) // zero-width joiner
        || Tokenizer.isEmojiModifier(code) // emoji modifier

      if (joinsPending) {
        pending += ch
      } else {
        result.push(pending)
        pending = ch
      }

      last = ch
      lastIsAlnum = isAlnum
      lastIsSpace = isSpace
      lastIsFlagHalf = isFlagHalf
    }

    // `pending` always holds at least one code point here, so flush it unconditionally.
    result.push(pending)

    return result
  }