in tensorflow_text/core/kernels/sentence_breaking_utils.cc [92:129]
// Reports through `*result` whether `input` counts as close punctuation.
//
// `*result` is set to true when `input` is exactly the two-apostrophe string
// "''", when it is one of a few explicitly listed code points, or when the
// code point's ICU line-break property is close punctuation, close
// parenthesis, or quotation. In every other case `*result` is false.
//
// Returns the status produced by GetOneUChar when that status is not OK
// (`*result` is left false in that case); otherwise returns Status::OK().
Status UnicodeUtil::IsClosePunc(const absl::string_view& input,
                                bool* result) const {
  *result = false;

  // The two-apostrophe string is accepted directly, before any decoding.
  if (input == "''") {
    *result = true;
    return Status::OK();
  }

  bool multiple_chars = false;
  UChar32 code_point;
  const auto& decode_status = GetOneUChar(input, &multiple_chars, &code_point);
  if (!decode_status.ok()) {
    return decode_status;
  }
  if (multiple_chars) {
    // `*result` is already false at this point.
    return Status::OK();
  }

  // These are unicode characters that should be considered in this category
  // but are not covered by any of the ICU properties.
  const bool listed_explicitly =
      code_point == '>' || code_point == ']' || code_point == '`' ||
      code_point == 0xFD3F ||  // U+FD3F (64831): ornate right parenthesis
      code_point == 0xFF02 ||  // U+FF02 (65282): fullwidth quotation mark
      code_point == 0xFF07;    // U+FF07 (65287): fullwidth apostrophe
  if (listed_explicitly) {
    *result = true;
    return Status::OK();
  }

  // Otherwise decide from the ICU line-break property of the code point.
  const auto line_break = static_cast<ULineBreak>(
      u_getIntPropertyValue(code_point, UCHAR_LINE_BREAK));
  switch (line_break) {
    case U_LB_CLOSE_PUNCTUATION:
    case U_LB_CLOSE_PARENTHESIS:
    case U_LB_QUOTATION:
      *result = true;
      break;
    default:
      *result = false;
      break;
  }
  return Status::OK();
}