Status UnicodeUtil::IsClosePunc()

in tensorflow_text/core/kernels/sentence_breaking_utils.cc [92:129]


// Decides whether `input` is a closing-punctuation token and stores the
// answer in `*result`.
//
// `*result` is set to true when `input` is:
//   * exactly two ASCII apostrophes (''), or
//   * a single character that is one of the explicitly listed code points
//     below, or whose ICU line-break property is close punctuation, close
//     parenthesis, or quotation.
// In every other case `*result` is false, including when GetOneUChar fails
// or reports more than one character.
//
// Returns the non-OK status from GetOneUChar unchanged if it fails;
// otherwise returns Status::OK().
Status UnicodeUtil::IsClosePunc(const absl::string_view& input,
                                bool* result) const {
  // Pessimistic default: every early exit below that does not explicitly set
  // the flag leaves it at false.
  *result = false;

  // A pair of apostrophes is the only multi-character input accepted.
  if (input == "''") {
    *result = true;
    return Status::OK();
  }

  UChar32 code_point = 0;
  bool multiple_chars = false;
  const auto& decode_status =
      GetOneUChar(input, &multiple_chars, &code_point);
  if (!decode_status.ok()) {
    return decode_status;
  }
  if (multiple_chars) {
    // Not a single character, so not close punctuation; the flag is already
    // false.
    return Status::OK();
  }

  // Characters that belong to this category but are accepted explicitly
  // instead of through the ICU line-break property lookup further down.
  const bool is_listed_exception =
      code_point == '>' || code_point == ']' || code_point == '`' ||
      code_point == 0xFD3F ||  // U+FD3F (64831): ornate right parenthesis
      code_point == 0xFF02 ||  // U+FF02 (65282): fullwidth quotation mark
      code_point == 0xFF07;    // U+FF07 (65287): fullwidth apostrophe
  if (is_listed_exception) {
    *result = true;
    return Status::OK();
  }

  // Otherwise classify by the character's ICU line-break property.
  switch (static_cast<ULineBreak>(
      u_getIntPropertyValue(code_point, UCHAR_LINE_BREAK))) {
    case U_LB_CLOSE_PUNCTUATION:
    case U_LB_CLOSE_PARENTHESIS:
    case U_LB_QUOTATION:
      *result = true;
      break;
    default:
      *result = false;
      break;
  }
  return Status::OK();
}