in tensorflow_text/core/kernels/sentence_breaking_utils.cc [186:214]
// Reports through `*result` whether `input` is a single character that is
// treated as punctuation.
//
// `*result` is set to false up front and is only changed when GetOneUChar
// succeeds and does not flag the input as having more than one character.
// In that case `*result` is true if the character is one of the hard-coded
// extras below, or if ICU classifies it as punctuation, a dash, or a hyphen.
//
// Returns the non-OK status from GetOneUChar if it fails; otherwise
// Status::OK().
Status UnicodeUtil::IsPunctuationWord(const absl::string_view& input,
bool* result) const {
// Default answer; it stands on every early-exit path.
*result = false;

bool multiple_chars = false;
UChar32 code_point;
const auto decode_status = GetOneUChar(input, &multiple_chars, &code_point);
if (!decode_status.ok()) return decode_status;

// Only a lone character can qualify as a punctuation word.
if (multiple_chars) return Status::OK();

// Characters that belong in this category but are not covered by any of the
// ICU properties queried below. 5741 is code point U+166D.
const bool is_extra_punctuation = code_point == '`' || code_point == '<' ||
code_point == '>' || code_point == '~' ||
code_point == 5741;

// Short-circuit evaluation skips the ICU lookups for the hard-coded extras.
*result = is_extra_punctuation || u_ispunct(code_point) ||
u_hasBinaryProperty(code_point, UCHAR_DASH) ||
u_hasBinaryProperty(code_point, UCHAR_HYPHEN);
return Status::OK();
}