in tensorflow_text/core/kernels/sentence_breaking_utils.cc [57:90]
// Decides whether `input` is terminal (sentence-ending) punctuation and writes
// the answer to `*result`.
//
// `*result` is set to true when any of the following holds:
//   * IsEllipsis() reports `input` as an ellipsis;
//   * `input` is a single character that is one of the code points listed
//     explicitly below;
//   * `input` is a single character whose ICU sentence-break property is
//     ATerm or STerm.
// It is set to false when GetOneUChar() reports more than one character.
//
// Returns the non-OK status from IsEllipsis() or GetOneUChar() if either
// fails; otherwise returns Status::OK().
Status UnicodeUtil::IsTerminalPunc(const absl::string_view& input,
                                   bool* result) const {
  *result = false;

  // An ellipsis is terminal punctuation: stop here if one was recognised, or
  // if the ellipsis check itself reported an error.
  const auto ellipsis_check = IsEllipsis(input, result);
  if (!ellipsis_check.ok()) return ellipsis_check;
  if (*result) return Status::OK();

  bool multiple_chars = false;
  UChar32 code_point;
  const auto decode_check = GetOneUChar(input, &multiple_chars, &code_point);
  if (!decode_check.ok()) return decode_check;

  // Beyond the ellipsis case handled above, only a single character can
  // qualify.
  if (multiple_chars) {
    *result = false;
    return Status::OK();
  }

  switch (code_point) {
    // Code points that belong in this category even though no ICU property
    // covers them.
    case 0x055C:  // Armenian exclamation mark
    case 0x055E:  // Armenian question mark
    case 0x17d4:  // Khmer sign khan
    case 0x037E:  // Greek question mark
    case 0x2026:  // ellipsis
      *result = true;
      break;
    default: {
      // Everything else is classified by its ICU sentence-break property.
      const auto sentence_break = static_cast<USentenceBreak>(
          u_getIntPropertyValue(code_point, UCHAR_SENTENCE_BREAK));
      *result =
          (sentence_break == U_SB_ATERM) || (sentence_break == U_SB_STERM);
      break;
    }
  }
  return Status::OK();
}