sql_utils/common/utf_util.h (22 lines of code) (raw):
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_COMMON_UTF_UTIL_H_
#define THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_COMMON_UTF_UTIL_H_
#include <cstdint>
#include <string>
#include "absl/status/statusor.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "unicode/utf8.h"
namespace bigquery_ml_utils {
// Returns the length of `s` that is well formed UTF8. This will return
// `s.length()` if it is completely well formed UTF8.
absl::string_view::size_type SpanWellFormedUTF8(absl::string_view s);
bool IsWellFormedUTF8(absl::string_view s);
// Returns a well-formed Unicode string. Replaces any ill-formed
// subsequences with the Unicode REPLACEMENT CHARACTER (U+FFFD).
// This is usually rendered as a diamond with a question mark in the middle.
std::string CoerceToWellFormedUTF8(absl::string_view input);
// Truncate the given UTF8 string to ensure it is no more than max_bytes.
// If truncated, attempts to create a well formed unicode string, and append an
// (ascii) ellipsis. If max_bytes is < 3, no ellipsis is appended.
std::string PrettyTruncateUTF8(absl::string_view input, int max_bytes);
// Verifies that the string length can be represented in a 32-bit signed int and
// returns that value. Fitting in an int32_t is a requirement for icu methods.
ABSL_MUST_USE_RESULT bool CheckAndCastStrLength(absl::string_view str,
int32_t* str_length32);
// Returns the offset needed to forward `str` by `num_code_points` or an empty
// optional if an invalid UTF-8 codepoint is detected.
// Similar to U8_FWD_N, but will detect bad utf codepoints.
std::optional<int32_t> ForwardN(absl::string_view str, int32_t str_length32,
int64_t num_code_points);
// Returns the number of code points in the given UTF-8 string, or a failed
// status if <str> is not a valid utf-8 string.
absl::StatusOr<int32_t> LengthUtf8(absl::string_view str);
// Transforms <str> into a single line string, guaranteed to fit within
// <max_code_points> UTF-8 code points, while preserving as many useful parts of
// the input string as possible.
//
// The following describes the transformations performed:
// 1) Leading and trailing whitespace is skipped.
// 2) All whitespace characters are replaced with " ". The "\r\n" newline
// combination is replaced with a single " ".
// 3) If the resultant string has no more than <hard_max_chars> UTF-8
// characters, it is returned as is. If not, the returned string contains
// the first few characters of the string, followed by "...", followed by
// the last few characters.
//
// The number of characters to include before and after the "..." is
// determined heuristically, with goals of minimizing whitespace and
// avoiding breaking up words.
//
// A failed status is returned if <str> is not a valid UTF-8 string, or if
// <max_code_points> is not at least 5 (the minimum length to hold "...", plus
// one character before and after).
absl::StatusOr<std::string> GetSummaryString(absl::string_view str,
int max_code_points);
} // namespace bigquery_ml_utils
#endif // THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_COMMON_UTF_UTIL_H_