Status UnicodeUtil::IsPunctuationWord()

in tensorflow_text/core/kernels/sentence_breaking_utils.cc [186:214]


// Reports through `result` whether `input` is a single character that counts
// as punctuation.
//
// `*result` is set to true when the character is one of the explicitly listed
// extras below, or when ICU classifies it as punctuation, a dash, or a hyphen.
// It is set to false when `input` holds more than one character, and it is
// also left false when GetOneUChar fails.
//
// Returns the non-OK status from GetOneUChar unchanged if that call fails;
// otherwise returns Status::OK().
Status UnicodeUtil::IsPunctuationWord(const absl::string_view& input,
                                      bool* result) const {
  // Start from "not punctuation" so every early exit leaves a defined answer.
  *result = false;

  bool is_multi_char = false;
  UChar32 code_point;
  const auto decode_status = GetOneUChar(input, &is_multi_char, &code_point);
  if (!decode_status.ok()) {
    return decode_status;
  }

  // A word made of several characters is never treated as punctuation here.
  if (is_multi_char) {
    return Status::OK();
  }

  // Characters that belong in this category even though none of the ICU
  // properties queried below cover them. 5741 is U+166D (believed to be
  // CANADIAN SYLLABICS CHI SIGN -- confirm against the Unicode charts).
  const bool is_uncovered_punctuation =
      code_point == '`' || code_point == '<' || code_point == '>' ||
      code_point == '~' || code_point == 5741;

  // Short-circuiting keeps the ICU lookups from running for the extras above.
  *result = is_uncovered_punctuation || u_ispunct(code_point) ||
            u_hasBinaryProperty(code_point, UCHAR_DASH) ||
            u_hasBinaryProperty(code_point, UCHAR_HYPHEN);
  return Status::OK();
}