inline bool IsPunctuationOrChineseChar()

in tensorflow_text/core/kernels/fast_wordpiece_tokenizer_utils.h [251:267]

14 lines of code
19 McCabe index (conditional complexity)


// Returns true when `char_value` should be split off as its own token the way
// Bert treats punctuation. That is the case when the code point is either:
//   * inside one of the CJK ideograph ranges listed below,
//   * one of the ASCII symbol characters listed below, or
//   * classified as punctuation by ICU's u_ispunct().
inline bool IsPunctuationOrChineseChar(UChar32 char_value) {
  // Compare as unsigned so that a negative input can never land inside any of
  // the ranges; such values simply fall through to u_ispunct().
  const uint32_t code_point = static_cast<uint32_t>(char_value);
  const auto in_range = [code_point](uint32_t first, uint32_t last) {
    return first <= code_point && code_point <= last;
  };

  // Chinese characters, which Bert handles like punctuation.
  const bool is_chinese_char =
      in_range(0x4E00, 0x9FFF) ||    // CJK Unified Ideographs
      in_range(0x3400, 0x4DBF) ||    // CJK Unified Ideographs Extension A
      in_range(0x20000, 0x2A6DF) ||  // CJK Unified Ideographs Extension B
      in_range(0x2A700, 0x2B73F) ||  // CJK Unified Ideographs Extension C
      in_range(0x2B740, 0x2B81F) ||  // CJK Unified Ideographs Extension D
      in_range(0x2B820, 0x2CEAF) ||  // CJK Unified Ideographs Extension E
      in_range(0xF900, 0xFAFF) ||    // CJK Compatibility Ideographs
      in_range(0x2F800, 0x2FA1F);    // CJK Compatibility Ideographs Supplement

  // ASCII symbols such as ">" and "$" count as punctuation here even though
  // u_ispunct() does not report all of them.
  const bool is_ascii_symbol = in_range(33, 47) ||    // '!' .. '/'
                               in_range(58, 64) ||    // ':' .. '@'
                               in_range(91, 96) ||    // '[' .. '`'
                               in_range(123, 126);    // '{' .. '~'

  // Short-circuit evaluation keeps the ICU lookup as the last resort.
  return is_chinese_char || is_ascii_symbol || u_ispunct(char_value);
}