Status UnicodeUtil::IsTerminalPunc()

in tensorflow_text/core/kernels/sentence_breaking_utils.cc [57:90]


// Reports through `*result` whether `input` is terminal punctuation.
//
// `*result` is set to true when IsEllipsis() recognizes `input`, or when
// `input` is a single character that is either one of the hand-listed code
// points below or has an ICU sentence-break property of ATerm or STerm.
// Input that GetOneUChar() reports as more than one character yields false.
//
// Returns the non-OK status from IsEllipsis() or GetOneUChar() if either
// fails; otherwise returns Status::OK().
Status UnicodeUtil::IsTerminalPunc(const absl::string_view& input,
                                   bool* result) const {
  *result = false;

  // Ellipsis check first: propagate a failure, and stop early on a match.
  const auto ellipsis_check = IsEllipsis(input, result);
  if (!ellipsis_check.ok()) {
    return ellipsis_check;
  }
  if (*result) {
    return Status::OK();
  }

  // Everything past this point applies to a single character only.
  bool multiple_chars = false;
  UChar32 code_point;
  const auto decode_status = GetOneUChar(input, &multiple_chars, &code_point);
  if (!decode_status.ok()) {
    return decode_status;
  }
  if (multiple_chars) {
    *result = false;
    return Status::OK();
  }

  switch (code_point) {
    // Code points that belong in this category even though no ICU property
    // covers them.
    case 0x037E:  // Greek question mark
    case 0x055C:  // Armenian exclamation mark
    case 0x055E:  // Armenian question mark
    case 0x17d4:  // Khmer sign khan
    case 0x2026:  // ellipsis
      *result = true;
      break;

    // Otherwise defer to ICU's sentence-break classification.
    default: {
      const auto sentence_break = static_cast<USentenceBreak>(
          u_getIntPropertyValue(code_point, UCHAR_SENTENCE_BREAK));
      *result =
          (sentence_break == U_SB_ATERM || sentence_break == U_SB_STERM);
      break;
    }
  }
  return Status::OK();
}