sql_utils/common/utf_util.cc (200 lines of code) (raw):

/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "sql_utils/common/utf_util.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <optional>
#include <string>

#include "sql_utils/base/logging.h"
#include "absl/strings/ascii.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/string_view.h"
#include "unicode/utf8.h"
#include "sql_utils/base/ret_check.h"

namespace bigquery_ml_utils {

// Text appended by CoerceToWellFormedUTF8() in place of every ill-formed
// sequence: U+FFFD REPLACEMENT CHARACTER.
constexpr absl::string_view kReplacementCharacter = "\uFFFD";

// Returns the byte offset of the first ill-formed sequence in s[0, length),
// or <length> when U8_NEXT decodes the whole range without error.
//
// Offsets are size_t (as in CoerceToWellFormedUTF8) so that inputs longer
// than INT_MAX bytes are not truncated.
static size_t SpanWellFormedUTF8(const char* s, size_t length) {
  for (size_t i = 0; i < length;) {
    const size_t start = i;
    UChar32 c;
    U8_NEXT(s, i, length, c);
    if (c < 0) {
      // U8_NEXT reports an ill-formed sequence with a negative code point.
      return start;
    }
  }
  return length;
}

// Returns the length in bytes of the longest well-formed UTF-8 prefix of <s>.
absl::string_view::size_type SpanWellFormedUTF8(absl::string_view s) {
  return SpanWellFormedUTF8(s.data(), s.length());
}

// Returns true iff all of <s> is well-formed UTF-8.
bool IsWellFormedUTF8(absl::string_view s) {
  return SpanWellFormedUTF8(s) == s.length();
}

// Returns a copy of <input> in which every ill-formed sequence (as delimited
// by U8_NEXT) is replaced by kReplacementCharacter. Well-formed spans are
// copied through unchanged.
std::string CoerceToWellFormedUTF8(absl::string_view input) {
  const char* s = input.data();
  const size_t length = input.length();
  size_t prev = 0;  // Start of the well-formed span not yet copied to <out>.
  std::string out;
  for (size_t i = 0; i < length;) {
    const size_t start = i;
    UChar32 c;
    U8_NEXT(s, i, length, c);
    if (c < 0) {
      if (prev < start) {
        // Append the well-formed span between the last ill-formed sequence
        // (or start of input), and the point just before the current one.
        out.append(s + prev, start - prev);
      }
      out.append(kReplacementCharacter.data(), kReplacementCharacter.size());
      prev = i;
    }
  }
  if (prev < length) {
    // Append any remaining well-formed span.
    out.append(s + prev, length - prev);
  }
  return out;
}

// Truncates <input> to at most <max_bytes> bytes.
//
// - Returns "" when <max_bytes> <= 0.
// - Returns <input> unchanged when it already fits.
// - Otherwise cuts the string and, when <max_bytes> > 3, ends the result with
//   "..." (the three dots count towards <max_bytes>).
//
// The cut position is moved back to a code point start so that a multi-byte
// character is not split in half.
std::string PrettyTruncateUTF8(absl::string_view input, int max_bytes) {
  if (max_bytes <= 0) {
    return "";
  }
  // <max_bytes> is known to be positive here, so the cast is lossless.
  if (input.size() <= static_cast<size_t>(max_bytes)) {
    // Already small enough, take no action.
    return std::string(input);
  }

  const bool append_ellipsis = max_bytes > 3;
  int new_width = append_ellipsis ? max_bytes - 3 : max_bytes;
  const uint8_t* str_ptr = reinterpret_cast<const uint8_t*>(input.data());
  // Handles the edge case that a unicode character is cut in half. Finds
  // a safe width that has complete unicode characters. No (further) truncation
  // occurs for totally invalid unicode.
  U8_SET_CP_START(str_ptr, 0, new_width);

  if (append_ellipsis) {
    return absl::StrCat(input.substr(0, new_width), "...");
  }
  return std::string(input.substr(0, new_width));
}

// Stores the length of <str> in <*str_length32> and returns true when the
// length fits in an int32_t. Otherwise returns false and leaves
// <*str_length32> untouched.
bool CheckAndCastStrLength(absl::string_view str, int32_t* str_length32) {
  if (str.length() >
      static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
    return false;
  }
  *str_length32 = static_cast<int32_t>(str.length());
  return true;
}

// Decodes up to <num_code_points> code points from the start of <str>, never
// reading past byte offset <str_length32>, and returns the byte offset reached.
// Returns std::nullopt as soon as an ill-formed sequence is decoded.
std::optional<int32_t> ForwardN(absl::string_view str, int32_t str_length32,
                                int64_t num_code_points) {
  int32_t str_offset = 0;
  for (int64_t i = 0; i < num_code_points && str_offset < str_length32; ++i) {
    UChar32 character;
    U8_NEXT(str, str_offset, str_length32, character);
    if (character < 0) {
      return std::nullopt;
    }
  }
  return str_offset;
}

// Returns the number of code points in <str>.
//
// Fails the SQL_RET_CHECK when <str> is longer than INT32_MAX bytes, and
// returns InvalidArgumentError("Invalid utf8") when <str> contains an
// ill-formed sequence.
absl::StatusOr<int32_t> LengthUtf8(absl::string_view str) {
  SQL_RET_CHECK_LE(str.size(), std::numeric_limits<int32_t>::max());
  const int32_t str_length32 = static_cast<int32_t>(str.size());

  int utf8_length = 0;
  int32_t offset = 0;
  while (offset < str_length32) {
    UChar32 character;
    U8_NEXT(str.data(), offset, str_length32, character);
    if (character < 0) {
      return absl::InvalidArgumentError("Invalid utf8");
    }
    utf8_length++;
  }
  return utf8_length;
}
namespace { // Helper function for GetStringWithEllipses(). // Invoked when the caller has already determined that // 1) <str> is a valid UTF-8 string and // 2) <str> is long enough that truncation is required. // // Computes the string to appear before the "...", along with its character // length. absl::Status ComputePrefixBeforeEllipses(absl::string_view str, int min_prefix_code_points, int max_total_code_points, std::string& prefix, int& prefix_char_len) { // Set the maximum prefix length to allow enough room for "..." and a suffix // of equal length to come after it, without exceeding <max_chars>. int max_prefix_chars = max_total_code_points - 3 - min_prefix_code_points; int32_t offset = 0; int32_t str_len = static_cast<int32_t>(str.size()); prefix.clear(); prefix_char_len = 0; bool in_word = false; int num_trailing_spaces = 0; while (offset < str_len) { int32_t prev_offset = offset; bool prev_in_word = in_word; // Fetch the next Unicode character. We already checked that <str> is // valid UTF-8, so this call should never fail. UChar32 character; U8_NEXT(str.data(), offset, str_len, character); SQL_RET_CHECK_GE(character, 0); in_word = absl::ascii_isalnum(character) || character == '_'; if (prefix_char_len >= min_prefix_code_points && (!prev_in_word || !in_word)) { break; } // Character is part of prefix absl::StrAppend(&prefix, str.substr(prev_offset, offset - prev_offset)); ++prefix_char_len; if (absl::ascii_isspace(character)) { ++num_trailing_spaces; } else { num_trailing_spaces = 0; } if (prefix_char_len >= max_prefix_chars) { break; } } absl::StripTrailingAsciiWhitespace(&prefix); prefix_char_len -= num_trailing_spaces; return absl::OkStatus(); } // Helper function for GetStringWithEllipses(). // Invoked when the caller has already determined that // 1) <str> is a valid UTF-8 string and // 2) <str> is long enough that truncation is required. // // Computes the string to appear after the "...". 
absl::StatusOr<std::string> ComputeSuffixAfterEllipses( absl::string_view str, int min_suffix_code_points, int max_total_code_points) { // Build the suffix in reverse order, then reverse it at the end. std::string suffix; int suffix_char_len = 0; int offset = static_cast<int>(str.size()); bool in_word = false; int max_suffix_chars = max_total_code_points - 3; while (offset >= 0) { int32_t prev_offset = offset; bool prev_in_word = in_word; UChar32 character; U8_PREV(str.data(), 0, offset, character); in_word = absl::ascii_isalnum(character) || character == '_'; if (suffix_char_len >= min_suffix_code_points && (!prev_in_word || !in_word)) { break; } int char_bytelen = prev_offset - offset; absl::StrAppend(&suffix, str.substr(offset, char_bytelen)); ++suffix_char_len; // Reverse the order of the bytes of the just-appended character in 'suffix' // so that the bytes will be in the correct order after the final call to // std::reverse(), below. std::reverse(suffix.end() - char_bytelen, suffix.end()); if (suffix_char_len >= max_suffix_chars) { break; } } std::reverse(suffix.begin(), suffix.end()); absl::StripLeadingAsciiWhitespace(&suffix); return suffix; } } // namespace absl::StatusOr<std::string> GetSummaryString(absl::string_view str, int max_code_points) { SQL_RET_CHECK_LE(str.size(), std::numeric_limits<int32_t>::max()); SQL_RET_CHECK_GE(max_code_points, 5); // minimum length to hold "a...b" // Simplify whitespace: // - Strip leading/trailing whitespace completely // - Replace all other whitespace characters with " ", replacing the "\r\n" // newline combination with a single space. std::string str_normalized = absl::StrReplaceAll(absl::StripAsciiWhitespace(str), {{"\r\n", " "}}); std::replace_if(str_normalized.begin(), str_normalized.end(), absl::ascii_isspace, ' '); // Check that <str> is valid UTF-8 and compute total UTF-8 length of input // string to see if we need to truncate it. 
SQL_ASSIGN_OR_RETURN(int32_t total_code_points, LengthUtf8(str_normalized)); std::string result; if (total_code_points <= max_code_points) { return std::string(str_normalized); // No truncation necessary } // Determine the minimum number of characters to include before and after // the "...". It must be small enough to allow two strings of that length, // plus "..." to not exceed <max_chars>. But, hueristically, it is good to // have it even smaller (assuming max_chars is not very small) so that we // have more flexibility to choose a prefix and length that doesn't break in // the middle of a word. int min_prefix_suffix_chars = std::min((max_code_points - 3) / 2, max_code_points / 3); std::string prefix; int prefix_char_len; SQL_RETURN_IF_ERROR( ComputePrefixBeforeEllipses(str_normalized, min_prefix_suffix_chars, max_code_points, prefix, prefix_char_len)); SQL_ASSIGN_OR_RETURN( std::string suffix, ComputeSuffixAfterEllipses(str_normalized, min_prefix_suffix_chars, max_code_points - prefix_char_len)); return absl::StrCat(prefix, "...", suffix); } } // namespace bigquery_ml_utils