velox/functions/lib/string/StringCore.h (270 lines of code) (raw):
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstring>
#include <string>
#include <string_view>
#include "folly/CPortability.h"
#include "velox/external/utf8proc/utf8procImpl.h"
#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
#if defined(__clang__) && (__clang_major__ > 7)
#define IS_SANITIZER \
((__has_feature(address_sanitizer) == 1) || \
(__has_feature(memory_sanitizer) == 1) || \
(__has_feature(thread_sanitizer) == 1) || \
(__has_feature(undefined_sanitizer) == 1))
#if IS_SANITIZER == 0
#define VECTORIZE_LOOP_IF_POSSIBLE _Pragma("clang loop vectorize(enable)")
#endif
#endif
#endif
#ifndef VECTORIZE_LOOP_IF_POSSIBLE
// Not supported
#define VECTORIZE_LOOP_IF_POSSIBLE
#endif
namespace facebook::velox::functions {
namespace stringCore {
/// Check if a given string is ascii
static bool isAscii(const char* str, size_t length);
FOLLY_ALWAYS_INLINE bool isAscii(const char* str, size_t length) {
for (auto i = 0; i < length; i++) {
if (str[i] & 0x80) {
return false;
}
}
return true;
}
/// Perform reverse for ascii string input
FOLLY_ALWAYS_INLINE static void
reverseAscii(char* output, const char* input, size_t length) {
auto j = length - 1;
VECTORIZE_LOOP_IF_POSSIBLE for (auto i = 0; i < length; ++i, --j) {
output[i] = input[j];
}
}
/// Perform reverse for utf8 string input
FOLLY_ALWAYS_INLINE static void
reverseUnicode(char* output, const char* input, size_t length) {
auto inputIdx = 0;
auto outputIdx = length;
while (inputIdx < length) {
int size;
utf8proc_codepoint(&input[inputIdx], size);
// invalid utf8 gets byte sequence with nextCodePoint==-1 and size==1,
// continue reverse invalid sequence byte by byte.
assert(
size > 0 && "UNLIKELY: could not get size of invalid utf8 code point");
outputIdx -= size;
assert(outputIdx >= 0 && outputIdx < length && "access out of bound");
std::memcpy(&output[outputIdx], &input[inputIdx], size);
inputIdx += size;
}
}
/// Perform upper for ascii string input
FOLLY_ALWAYS_INLINE static void
upperAscii(char* output, const char* input, size_t length) {
VECTORIZE_LOOP_IF_POSSIBLE for (auto i = 0; i < length; i++) {
if (input[i] >= 'a' && input[i] <= 'z') {
output[i] = input[i] - 32;
} else {
output[i] = input[i];
}
}
}
/// Perform lower for ascii string input
FOLLY_ALWAYS_INLINE static void
lowerAscii(char* output, const char* input, size_t length) {
VECTORIZE_LOOP_IF_POSSIBLE for (auto i = 0; i < length; i++) {
if (input[i] >= 'A' && input[i] <= 'Z') {
output[i] = input[i] + 32;
} else {
output[i] = input[i];
}
}
}
/// Perform upper for utf8 string input, output should be pre-allocated and
/// large enough for the results. outputLength refers to the number of bytes
/// available in the output buffer, and inputLength is the number of bytes in
/// the input string
FOLLY_ALWAYS_INLINE size_t upperUnicode(
char* output,
size_t outputLength,
const char* input,
size_t inputLength) {
auto inputIdx = 0;
auto outputIdx = 0;
while (inputIdx < inputLength) {
utf8proc_int32_t nextCodePoint;
int size;
nextCodePoint = utf8proc_codepoint(&input[inputIdx], size);
if (UNLIKELY(nextCodePoint == -1)) {
// invalid input string, copy the remaining of the input string as is to
// the output.
std::memcpy(&output[outputIdx], &input[inputIdx], inputLength - inputIdx);
outputIdx += inputLength - inputIdx;
return outputIdx;
}
inputIdx += size;
auto upperCodePoint = utf8proc_toupper(nextCodePoint);
assert(
(outputIdx + utf8proc_codepoint_length(upperCodePoint)) <
outputLength &&
"access out of bound");
auto newSize = utf8proc_encode_char(
upperCodePoint, reinterpret_cast<unsigned char*>(&output[outputIdx]));
outputIdx += newSize;
}
return outputIdx;
}
/// Perform lower for utf8 string input, output should be pre-allocated and
/// large enough for the results outputLength refers to the number of bytes
/// available in the output buffer, and inputLength is the number of bytes in
/// the input string
FOLLY_ALWAYS_INLINE size_t lowerUnicode(
char* output,
size_t outputLength,
const char* input,
size_t inputLength) {
auto inputIdx = 0;
auto outputIdx = 0;
while (inputIdx < inputLength) {
utf8proc_int32_t nextCodePoint;
int size;
nextCodePoint = utf8proc_codepoint(&input[inputIdx], size);
if (UNLIKELY(nextCodePoint == -1)) {
// invalid input string, copy the remaining of the input string as is to
// the output.
std::memcpy(&output[outputIdx], &input[inputIdx], inputLength - inputIdx);
outputIdx += inputLength - inputIdx;
return outputIdx;
}
inputIdx += size;
auto lowerCodePoint = utf8proc_tolower(nextCodePoint);
assert(
(outputIdx + utf8proc_codepoint_length(lowerCodePoint)) <
outputLength &&
"access out of bound");
auto newSize = utf8proc_encode_char(
lowerCodePoint, reinterpret_cast<unsigned char*>(&output[outputIdx]));
outputIdx += newSize;
}
return outputIdx;
}
/// Apply a sequence of appenders to the output string sequentially.
/// @param output the output string that appenders are applied to
/// @param appenderFunc a function that appends some string to an input string
/// of type TOutStr
template <typename TOutStr, typename Func>
static void applyAppendersRecursive(TOutStr& output, Func appenderFunc) {
appenderFunc(output);
}
template <typename TOutStr, typename Func, typename... Funcs>
static void
applyAppendersRecursive(TOutStr& output, Func appenderFunc, Funcs... funcs) {
appenderFunc(output);
applyAppendersRecursive(output, funcs...);
}
/**
* Return the length in chars of a utf8 string stored in the input buffer
* @param inputBuffer input buffer that hold the string
* @param numBytes size of input buffer
* @return the number of characters represented by the input utf8 string
*/
FOLLY_ALWAYS_INLINE int64_t
lengthUnicode(const char* inputBuffer, size_t bufferLength) {
// First address after the last byte in the buffer
auto buffEndAddress = inputBuffer + bufferLength;
auto currentChar = inputBuffer;
int64_t size = 0;
while (currentChar < buffEndAddress) {
auto chrOffset = utf8proc_char_length(currentChar);
// Skip bad byte if we get utf length < 0.
currentChar += UNLIKELY(chrOffset < 0) ? 1 : chrOffset;
size++;
}
return size;
}
/// Returns the start byte index of the Nth instance of subString in
/// string. Search starts from startPosition. Positions start with 0. If not
/// found, -1 is returned.
/// stringPosition for Unicode uses this by first finding the byte index of
/// substring and then computing the length of substring[0, byteIndex). This is
/// safe because in UTF8 a char can not be a subset of another char (in bytes
/// representation).
static int64_t findNthInstanceByteIndex(
const std::string_view& string,
const std::string_view subString,
const size_t instance = 1,
const size_t startPosition = 0) {
assert(instance > 0);
if (startPosition >= string.size()) {
return -1;
}
auto byteIndex = string.find(subString, startPosition);
// Not found
if (byteIndex == std::string_view::npos) {
return -1;
}
// Search done
if (instance == 1) {
return byteIndex;
}
// Find next occurrence
return findNthInstanceByteIndex(
string, subString, instance - 1, byteIndex + subString.size());
}
/// Replace replaced with replacement in inputString and write results in
/// outputString. If inPlace=true inputString and outputString are assumed to
/// tbe the same. When replaced is empty, replacement is added before and after
/// each charecter. When inputString is empty results is empty.
/// replace("", "", "x") = ""
/// replace("aa", "", "x") = "xaxax"
inline static size_t replace(
char* outputString,
const std::string_view& inputString,
const std::string_view& replaced,
const std::string_view& replacement,
bool inPlace = false) {
if (inputString.size() == 0) {
return 0;
}
size_t readPosition = 0;
size_t writePosition = 0;
// Copy needed in out of place replace, and when replaced and replacement are
// of different sizes.
bool doCopyUnreplaced = !inPlace || (replaced.size() != replacement.size());
auto findNextReplaced = [&]() {
return findNthInstanceByteIndex(inputString, replaced, 1, readPosition);
};
auto writeUnchanged = [&](ssize_t size) {
assert(size >= 0 && "Probable math error?");
if (size <= 0) {
return;
}
if (inPlace) {
if (doCopyUnreplaced) {
// memcpy does not allow overllapping
std::memmove(
&outputString[writePosition],
&inputString.data()[readPosition],
size);
}
} else {
std::memcpy(
&outputString[writePosition],
&inputString.data()[readPosition],
size);
}
writePosition += size;
readPosition += size;
};
auto writeReplacement = [&]() {
if (replacement.size() > 0) {
std::memcpy(
&outputString[writePosition], replacement.data(), replacement.size());
writePosition += replacement.size();
}
readPosition += replaced.size();
};
// Special case when size of replaced is 0
if (replaced.size() == 0) {
if (replacement.size() == 0) {
if (!inPlace) {
std::memcpy(outputString, inputString.data(), inputString.size());
}
return inputString.size();
}
// Can never be in place since replacement.size()>replaced.size()
assert(!inPlace && "wrong inplace replace usage");
// add replacement before and after each char in inputString
for (auto i = 0; i < inputString.size(); i++) {
writeReplacement();
outputString[writePosition] = inputString[i];
writePosition++;
}
writeReplacement();
return writePosition;
}
while (readPosition < inputString.size()) {
// Find next token to replace
auto position = findNextReplaced();
if (position == -1) {
break;
}
assert(position >= 0 && "invalid position found");
auto unchangedSize = position - readPosition;
writeUnchanged(unchangedSize);
writeReplacement();
}
auto unchangedSize = inputString.size() - readPosition;
writeUnchanged(unchangedSize);
return writePosition;
}
/// Given a utf8 string, a starting position and length returns the
/// corresponding underlying byte range [startByteIndex, endByteIndex).
/// Byte indicies starts from 0, UTF8 character positions starts from 1.
template <bool isAscii>
static inline std::pair<size_t, size_t>
getByteRange(const char* str, size_t startCharPosition, size_t length) {
if (startCharPosition < 1 && length > 0) {
throw std::invalid_argument(
"start position must be >= 1 and length must be > 0");
}
if constexpr (isAscii) {
return std::make_pair(
startCharPosition - 1, startCharPosition + length - 1);
} else {
size_t startByteIndex = 0;
size_t nextCharOffset = 0;
// Find startByteIndex
for (auto i = 0; i < startCharPosition - 1; i++) {
nextCharOffset += utf8proc_char_length(&str[nextCharOffset]);
}
startByteIndex = nextCharOffset;
// Find endByteIndex
for (auto i = 0; i < length; i++) {
nextCharOffset += utf8proc_char_length(&str[nextCharOffset]);
}
return std::make_pair(startByteIndex, nextCharOffset);
}
}
} // namespace stringCore
} // namespace facebook::velox::functions