be/src/vec/functions/function_string.cpp (1,160 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "vec/functions/function_string.h" #include <ctype.h> #include <math.h> #include <re2/stringpiece.h> #include <unicode/schriter.h> #include <unicode/uchar.h> #include <unicode/unistr.h> #include <unicode/ustream.h> #include <bitset> #include <cstddef> #include <cstdint> #include <string_view> #include "common/cast_set.h" #include "common/status.h" #include "runtime/string_search.hpp" #include "util/url_coding.h" #include "vec/columns/column.h" #include "vec/columns/column_string.h" #include "vec/common/pod_array_fwd.h" #include "vec/common/string_ref.h" #include "vec/functions/function_reverse.h" #include "vec/functions/function_string_to_string.h" #include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { #include "common/compile_check_begin.h" struct NameStringASCII { static constexpr auto name = "ascii"; }; struct StringASCII { using ReturnType = DataTypeInt32; static constexpr auto TYPE_INDEX = TypeIndex::String; using Type = String; using ReturnColumnType = ColumnVector<Int32>; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, PaddedPODArray<Int32>& res) { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); res[i] = (offsets[i] == offsets[i - 1]) ? 0 : static_cast<uint8_t>(raw_str[0]); } return Status::OK(); } }; struct NameQuote { static constexpr auto name = "quote"; }; struct NameQuoteImpl { static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { size_t offset_size = offsets.size(); ColumnString::Offset pos = 0; res_offsets.resize(offset_size); res_data.resize(data.size() + offset_size * 2); for (int i = 0; i < offset_size; i++) { const unsigned char* raw_str = &data[offsets[i - 1]]; ColumnString::Offset size = offsets[i] - offsets[i - 1]; res_data[pos] = '\''; std::memcpy(res_data.data() + pos + 1, raw_str, size); res_data[pos + size + 1] = '\''; pos += size + 2; res_offsets[i] = pos; } return Status::OK(); } }; struct NameStringLength { static constexpr auto name = "length"; }; struct StringLengthImpl { using ReturnType = DataTypeInt32; static constexpr auto TYPE_INDEX = TypeIndex::String; using Type = String; using ReturnColumnType = ColumnVector<Int32>; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, PaddedPODArray<Int32>& res) { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { int str_size = offsets[i] - offsets[i - 1]; res[i] = str_size; } return Status::OK(); } }; struct NameCrc32 { static constexpr auto name = "crc32"; }; struct Crc32Impl { using ReturnType = DataTypeInt64; static constexpr auto TYPE_INDEX = TypeIndex::String; using Type = String; using ReturnColumnType = ColumnVector<Int64>; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, PaddedPODArray<Int64>& res) { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { res[i] = crc32_z(0L, (const unsigned char*)data.data() + offsets[i - 1], offsets[i] - offsets[i - 1]); } return Status::OK(); } }; struct NameStringUtf8Length { static constexpr auto name = "char_length"; }; struct StringUtf8LengthImpl { using ReturnType = DataTypeInt32; static constexpr auto TYPE_INDEX = TypeIndex::String; using Type = String; using ReturnColumnType = ColumnVector<Int32>; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, PaddedPODArray<Int32>& res) { auto size = offsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); int str_size = offsets[i] - offsets[i - 1]; res[i] = simd::VStringFunctions::get_char_len(raw_str, str_size); } return Status::OK(); } }; struct NameStartsWith { static constexpr auto name = "starts_with"; }; struct StartsWithOp { using ResultDataType = DataTypeUInt8; using ResultPaddedPODArray = PaddedPODArray<UInt8>; static void execute(const std::string_view& strl, const std::string_view& strr, uint8_t& res) { re2::StringPiece str_sp(const_cast<char*>(strl.data()), strl.length()); re2::StringPiece prefix_sp(const_cast<char*>(strr.data()), strr.length()); res = str_sp.starts_with(prefix_sp); } }; struct NameEndsWith { static constexpr auto name = "ends_with"; }; struct EndsWithOp { using ResultDataType = DataTypeUInt8; using ResultPaddedPODArray = PaddedPODArray<UInt8>; static void execute(const std::string_view& strl, const std::string_view& strr, uint8_t& res) { re2::StringPiece str_sp(const_cast<char*>(strl.data()), strl.length()); re2::StringPiece prefix_sp(const_cast<char*>(strr.data()), strr.length()); res = str_sp.ends_with(prefix_sp); } }; struct NameFindInSet { static constexpr auto name = "find_in_set"; }; struct FindInSetOp { using ResultDataType = DataTypeInt32; using ResultPaddedPODArray = PaddedPODArray<Int32>; static void execute(const std::string_view& strl, const std::string_view& strr, int32_t& res) { for (const auto& c : strl) { if (c == ',') { res = 0; return; } } int32_t token_index = 1; int32_t start = 0; int32_t end; do { end = start; // Position end. while (end < strr.length() && strr[end] != ',') { ++end; } if (strl == std::string_view {strr.data() + start, (size_t)end - start}) { res = token_index; return; } // Re-position start and end past ',' start = end + 1; ++token_index; } while (start < strr.length()); res = 0; } }; struct NameInstr { static constexpr auto name = "instr"; }; // LeftDataType and RightDataType are DataTypeString template <typename LeftDataType, typename RightDataType> struct StringInStrImpl { using ResultDataType = DataTypeInt32; using ResultPaddedPODArray = PaddedPODArray<Int32>; static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { StringRef lstr_ref(ldata.data, ldata.size); auto size = roffsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; StringRef rstr_ref(r_raw_str, r_str_size); res[i] = execute(lstr_ref, rstr_ref); } return Status::OK(); } static Status vector_scalar(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const StringRef& rdata, ResultPaddedPODArray& res) { auto size = loffsets.size(); res.resize(size); if (rdata.size == 0) { std::fill(res.begin(), res.end(), 1); return Status::OK(); } const UInt8* begin = ldata.data(); const UInt8* end = begin + ldata.size(); const UInt8* pos = begin; /// Current index in the array of strings. size_t i = 0; std::fill(res.begin(), res.end(), 0); StringRef rstr_ref(rdata.data, rdata.size); StringSearch search(&rstr_ref); while (pos < end) { // search return matched substring start offset pos = (UInt8*)search.search((char*)pos, end - pos); if (pos >= end) { break; } /// Determine which index it refers to. /// begin + value_offsets[i] is the start offset of string at i+1 while (begin + loffsets[i] < pos) { ++i; } /// We check that the entry does not pass through the boundaries of strings. if (pos + rdata.size <= begin + loffsets[i]) { int loc = (int)(pos - begin) - loffsets[i - 1]; int l_str_size = loffsets[i] - loffsets[i - 1]; auto len = std::min(l_str_size, loc); loc = simd::VStringFunctions::get_char_len((char*)(begin + loffsets[i - 1]), len); res[i] = loc + 1; } // move to next string offset pos = begin + loffsets[i]; ++i; } return Status::OK(); } static Status vector_vector(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { DCHECK_EQ(loffsets.size(), roffsets.size()); auto size = loffsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); int l_str_size = loffsets[i] - loffsets[i - 1]; StringRef lstr_ref(l_raw_str, l_str_size); const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; StringRef rstr_ref(r_raw_str, r_str_size); res[i] = execute(lstr_ref, rstr_ref); } return Status::OK(); } static int execute(const StringRef& strl, const StringRef& strr) { if (strr.size == 0) { return 1; } StringSearch search(&strr); // Hive returns positions starting from 1. int loc = search.search(&strl); if (loc > 0) { int len = std::min(loc, (int)strl.size); loc = simd::VStringFunctions::get_char_len(strl.data, len); } return loc + 1; } }; // the same impl as instr struct NameLocate { static constexpr auto name = "locate"; }; // LeftDataType and RightDataType are DataTypeString template <typename LeftDataType, typename RightDataType> struct StringLocateImpl { using ResultDataType = DataTypeInt32; using ResultPaddedPODArray = PaddedPODArray<Int32>; static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { return StringInStrImpl<LeftDataType, RightDataType>::vector_scalar(rdata, roffsets, ldata, res); } static Status vector_scalar(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const StringRef& rdata, ResultPaddedPODArray& res) { return StringInStrImpl<LeftDataType, RightDataType>::scalar_vector(rdata, ldata, loffsets, res); } static Status vector_vector(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { return StringInStrImpl<LeftDataType, RightDataType>::vector_vector(rdata, roffsets, ldata, loffsets, res); } }; // LeftDataType and RightDataType are DataTypeString template <typename LeftDataType, typename RightDataType, typename OP> struct StringFunctionImpl { using ResultDataType = typename OP::ResultDataType; using ResultPaddedPODArray = typename OP::ResultPaddedPODArray; static void vector_vector(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { DCHECK_EQ(loffsets.size(), roffsets.size()); auto size = loffsets.size(); res.resize(size); for (int i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); int l_str_size = loffsets[i] - loffsets[i - 1]; const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; std::string_view lview(l_raw_str, l_str_size); std::string_view rview(r_raw_str, r_str_size); OP::execute(lview, rview, res[i]); } } static void vector_scalar(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const StringRef& rdata, ResultPaddedPODArray& res) { auto size = loffsets.size(); res.resize(size); std::string_view rview(rdata.data, rdata.size); for (int i = 0; i < size; ++i) { const char* l_raw_str = reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]); int l_str_size = loffsets[i] - loffsets[i - 1]; std::string_view lview(l_raw_str, l_str_size); OP::execute(lview, rview, res[i]); } } static void scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { auto size = roffsets.size(); res.resize(size); std::string_view lview(ldata.data, ldata.size); for (int i = 0; i < size; ++i) { const char* r_raw_str = reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]); int r_str_size = roffsets[i] - roffsets[i - 1]; std::string_view rview(r_raw_str, r_str_size); OP::execute(lview, rview, res[i]); } } }; struct NameToLower { static constexpr auto name = "lower"; }; struct NameToUpper { static constexpr auto name = "upper"; }; template <typename OpName> struct TransferImpl { static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { size_t offset_size = offsets.size(); if (UNLIKELY(!offset_size)) { return Status::OK(); } const bool is_ascii = simd::VStringFunctions::is_ascii({data.data(), data.size()}); res_offsets.resize(offset_size); if (is_ascii) { memcpy_small_allow_read_write_overflow15( res_offsets.data(), offsets.data(), offset_size * sizeof(ColumnString::Offsets::value_type)); size_t data_length = data.size(); res_data.resize(data_length); if constexpr (std::is_same_v<OpName, NameToUpper>) { simd::VStringFunctions::to_upper(data.data(), data_length, res_data.data()); } else if constexpr (std::is_same_v<OpName, NameToLower>) { simd::VStringFunctions::to_lower(data.data(), data_length, res_data.data()); } } else { execute_utf8(data, offsets, res_data, res_offsets); } return Status::OK(); } static void execute_utf8(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { std::string result; for (int64_t i = 0; i < offsets.size(); ++i) { const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]); uint32_t size = offsets[i] - offsets[i - 1]; result.clear(); if constexpr (std::is_same_v<OpName, NameToUpper>) { to_upper_utf8(begin, size, result); } else if constexpr (std::is_same_v<OpName, NameToLower>) { to_lower_utf8(begin, size, result); } StringOP::push_value_string(result, i, res_data, res_offsets); } } static void to_upper_utf8(const char* data, uint32_t size, std::string& result) { icu::StringPiece sp; sp.set(data, size); icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp); unicode_str.toUpper(); unicode_str.toUTF8String(result); } static void to_lower_utf8(const char* data, uint32_t size, std::string& result) { icu::StringPiece sp; sp.set(data, size); icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp); unicode_str.toLower(); unicode_str.toUTF8String(result); } }; // Capitalize first letter struct NameToInitcap { static constexpr auto name = "initcap"; }; struct InitcapImpl { static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { res_offsets.resize(offsets.size()); const bool is_ascii = simd::VStringFunctions::is_ascii({data.data(), data.size()}); if (is_ascii) { impl_vectors_ascii(data, offsets, res_data, res_offsets); } else { impl_vectors_utf8(data, offsets, res_data, res_offsets); } return Status::OK(); } static void impl_vectors_ascii(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { size_t offset_size = offsets.size(); memcpy_small_allow_read_write_overflow15( res_offsets.data(), offsets.data(), offset_size * sizeof(ColumnString::Offsets::value_type)); size_t data_length = data.size(); res_data.resize(data_length); simd::VStringFunctions::to_lower(data.data(), data_length, res_data.data()); bool need_capitalize = true; for (size_t offset_index = 0, start_index = 0; offset_index < offset_size; ++offset_index) { auto end_index = res_offsets[offset_index]; need_capitalize = true; for (size_t i = start_index; i < end_index; ++i) { if (!::isalnum(res_data[i])) { need_capitalize = true; } else if (need_capitalize) { /* https://en.cppreference.com/w/cpp/string/byte/toupper Like all other functions from <cctype>, the behavior of std::toupper is undefined if the argument's value is neither representable as unsigned char nor equal to EOF. To use these functions safely with plain chars (or signed chars), the argument should first be converted to unsigned char: char my_toupper(char ch) { return static_cast<char>(std::toupper(static_cast<unsigned char>(ch))); } */ res_data[i] = static_cast<unsigned char>(::toupper(res_data[i])); need_capitalize = false; } } start_index = end_index; } } static void impl_vectors_utf8(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { std::string result; for (int64_t i = 0; i < offsets.size(); ++i) { const char* begin = reinterpret_cast<const char*>(&data[offsets[i - 1]]); uint32_t size = offsets[i] - offsets[i - 1]; result.clear(); to_initcap_utf8(begin, size, result); StringOP::push_value_string(result, i, res_data, res_offsets); } } static void to_initcap_utf8(const char* data, uint32_t size, std::string& result) { icu::StringPiece sp; sp.set(data, size); icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp); unicode_str.toLower(); icu::UnicodeString output_str; bool need_capitalize = true; icu::StringCharacterIterator iter(unicode_str); for (UChar32 ch = iter.first32(); ch != icu::CharacterIterator::DONE; ch = iter.next32()) { if (!u_isalnum(ch)) { need_capitalize = true; } else if (need_capitalize) { ch = u_toupper(ch); need_capitalize = false; } output_str.append(ch); } output_str.toUTF8String(result); } }; struct NameTrim { static constexpr auto name = "trim"; }; struct NameLTrim { static constexpr auto name = "ltrim"; }; struct NameRTrim { static constexpr auto name = "rtrim"; }; struct NameTrimIn { static constexpr auto name = "trim_in"; }; struct NameLTrimIn { static constexpr auto name = "ltrim_in"; }; struct NameRTrimIn { static constexpr auto name = "rtrim_in"; }; template <bool is_ltrim, bool is_rtrim, bool trim_single> struct TrimUtil { static Status vector(const ColumnString::Chars& str_data, const ColumnString::Offsets& str_offsets, const StringRef& remove_str, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { const size_t offset_size = str_offsets.size(); res_offsets.resize(offset_size); res_data.reserve(str_data.size()); for (size_t i = 0; i < offset_size; ++i) { const auto* str_begin = str_data.data() + str_offsets[i - 1]; const auto* str_end = str_data.data() + str_offsets[i]; if constexpr (is_ltrim) { str_begin = simd::VStringFunctions::ltrim<trim_single>(str_begin, str_end, remove_str); } if constexpr (is_rtrim) { str_end = simd::VStringFunctions::rtrim<trim_single>(str_begin, str_end, remove_str); } res_data.insert_assume_reserved(str_begin, str_end); // The length of the result of the trim function will never exceed the length of the input. res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); } }; template <bool is_ltrim, bool is_rtrim, bool trim_single> struct TrimInUtil { static Status vector(const ColumnString::Chars& str_data, const ColumnString::Offsets& str_offsets, const StringRef& remove_str, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { const size_t offset_size = str_offsets.size(); res_offsets.resize(offset_size); res_data.reserve(str_data.size()); bool all_ascii = simd::VStringFunctions::is_ascii(remove_str) && simd::VStringFunctions::is_ascii(StringRef( reinterpret_cast<const char*>(str_data.data()), str_data.size())); if (all_ascii) { return impl_vectors_ascii(str_data, str_offsets, remove_str, res_data, res_offsets); } else { return impl_vectors_utf8(str_data, str_offsets, remove_str, res_data, res_offsets); } } private: static Status impl_vectors_ascii(const ColumnString::Chars& str_data, const ColumnString::Offsets& str_offsets, const StringRef& remove_str, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { const size_t offset_size = str_offsets.size(); std::bitset<128> char_lookup; const char* remove_begin = remove_str.data; const char* remove_end = remove_str.data + remove_str.size; while (remove_begin < remove_end) { char_lookup.set(static_cast<unsigned char>(*remove_begin)); remove_begin += 1; } for (size_t i = 0; i < offset_size; ++i) { const char* str_begin = reinterpret_cast<const char*>(str_data.data() + str_offsets[i - 1]); const char* str_end = reinterpret_cast<const char*>(str_data.data() + str_offsets[i]); const char* left_trim_pos = str_begin; const char* right_trim_pos = str_end; if constexpr (is_ltrim) { while (left_trim_pos < str_end) { if (!char_lookup.test(static_cast<unsigned char>(*left_trim_pos))) { break; } ++left_trim_pos; } } if constexpr (is_rtrim) { while (right_trim_pos > left_trim_pos) { --right_trim_pos; if (!char_lookup.test(static_cast<unsigned char>(*right_trim_pos))) { ++right_trim_pos; break; } } } res_data.insert_assume_reserved(left_trim_pos, right_trim_pos); // The length of the result of the trim function will never exceed the length of the input. res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); } static Status impl_vectors_utf8(const ColumnString::Chars& str_data, const ColumnString::Offsets& str_offsets, const StringRef& remove_str, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { const size_t offset_size = str_offsets.size(); res_offsets.resize(offset_size); res_data.reserve(str_data.size()); std::unordered_set<std::string_view> char_lookup; const char* remove_begin = remove_str.data; const char* remove_end = remove_str.data + remove_str.size; while (remove_begin < remove_end) { size_t byte_len, char_len; std::tie(byte_len, char_len) = simd::VStringFunctions::iterate_utf8_with_limit_length( remove_begin, remove_end, 1); char_lookup.insert(std::string_view(remove_begin, byte_len)); remove_begin += byte_len; } for (size_t i = 0; i < offset_size; ++i) { const char* str_begin = reinterpret_cast<const char*>(str_data.data() + str_offsets[i - 1]); const char* str_end = reinterpret_cast<const char*>(str_data.data() + str_offsets[i]); const char* left_trim_pos = str_begin; const char* right_trim_pos = str_end; if constexpr (is_ltrim) { while (left_trim_pos < str_end) { size_t byte_len, char_len; std::tie(byte_len, char_len) = simd::VStringFunctions::iterate_utf8_with_limit_length(left_trim_pos, str_end, 1); if (char_lookup.find(std::string_view(left_trim_pos, byte_len)) == char_lookup.end()) { break; } left_trim_pos += byte_len; } } if constexpr (is_rtrim) { while (right_trim_pos > left_trim_pos) { const char* prev_char_pos = right_trim_pos; do { --prev_char_pos; } while ((*prev_char_pos & 0xC0) == 0x80); size_t byte_len = right_trim_pos - prev_char_pos; if (char_lookup.find(std::string_view(prev_char_pos, byte_len)) == char_lookup.end()) { break; } right_trim_pos = prev_char_pos; } } res_data.insert_assume_reserved(left_trim_pos, right_trim_pos); // The length of the result of the trim function will never exceed the length of the input. res_offsets[i] = (ColumnString::Offset)res_data.size(); } return Status::OK(); } }; // This is an implementation of a parameter for the Trim function. template <bool is_ltrim, bool is_rtrim, typename Name> struct Trim1Impl { static constexpr auto name = Name::name; static DataTypes get_variadic_argument_types() { return {std::make_shared<DataTypeString>()}; } static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) { const ColumnPtr column = block.get_by_position(arguments[0]).column; if (const auto* col = assert_cast<const ColumnString*>(column.get())) { auto col_res = ColumnString::create(); char blank[] = " "; const StringRef remove_str(blank, 1); RETURN_IF_ERROR((TrimUtil<is_ltrim, is_rtrim, true>::vector( col->get_chars(), col->get_offsets(), remove_str, col_res->get_chars(), col_res->get_offsets()))); block.replace_by_position(result, std::move(col_res)); } else { return Status::RuntimeError("Illegal column {} of argument of function {}", block.get_by_position(arguments[0]).column->get_name(), name); } return Status::OK(); } }; // This is an implementation of two parameters for the Trim function. template <bool is_ltrim, bool is_rtrim, typename Name> struct Trim2Impl { static constexpr auto name = Name::name; static DataTypes get_variadic_argument_types() { return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; } static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) { const ColumnPtr column = block.get_by_position(arguments[0]).column; const auto& rcol = assert_cast<const ColumnConst*>(block.get_by_position(arguments[1]).column.get()) ->get_data_column_ptr(); if (const auto* col = assert_cast<const ColumnString*>(column.get())) { if (const auto* col_right = assert_cast<const ColumnString*>(rcol.get())) { auto col_res = ColumnString::create(); const auto* remove_str_raw = col_right->get_chars().data(); const ColumnString::Offset remove_str_size = col_right->get_offsets()[0]; const StringRef remove_str(remove_str_raw, remove_str_size); if (remove_str.size == 1) { RETURN_IF_ERROR((TrimUtil<is_ltrim, is_rtrim, true>::vector( col->get_chars(), col->get_offsets(), remove_str, col_res->get_chars(), col_res->get_offsets()))); } else { if constexpr (std::is_same<Name, NameTrimIn>::value || std::is_same<Name, NameLTrimIn>::value || std::is_same<Name, NameRTrimIn>::value) { RETURN_IF_ERROR((TrimInUtil<is_ltrim, is_rtrim, false>::vector( col->get_chars(), col->get_offsets(), remove_str, col_res->get_chars(), col_res->get_offsets()))); } else { RETURN_IF_ERROR((TrimUtil<is_ltrim, is_rtrim, false>::vector( col->get_chars(), col->get_offsets(), remove_str, col_res->get_chars(), col_res->get_offsets()))); } } block.replace_by_position(result, std::move(col_res)); } else { return Status::RuntimeError("Illegal column {} of argument of function {}", block.get_by_position(arguments[1]).column->get_name(), name); } } else { return Status::RuntimeError("Illegal column {} of argument of function {}", block.get_by_position(arguments[0]).column->get_name(), name); } return Status::OK(); } }; template <typename impl> class FunctionTrim : public IFunction { public: static constexpr auto name = impl::name; static FunctionPtr create() { return std::make_shared<FunctionTrim<impl>>(); } String get_name() const override { return impl::name; } size_t get_number_of_arguments() const override { return get_variadic_argument_types_impl().size(); } DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { if (!is_string_or_fixed_string(arguments[0])) { throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->get_name(), get_name()); } return arguments[0]; } // The second parameter of "trim" is a constant. ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; } DataTypes get_variadic_argument_types_impl() const override { return impl::get_variadic_argument_types(); } Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) const override { return impl::execute(context, block, arguments, result, input_rows_count); } }; struct UnHexImplEmpty { static constexpr auto name = "unhex"; }; struct UnHexImplNull { static constexpr auto name = "unhex_null"; }; static constexpr int MAX_STACK_CIPHER_LEN = 1024 * 64; template <typename Name> struct UnHexImpl { static constexpr auto name = Name::name; using ReturnType = DataTypeString; using ColumnType = ColumnString; static bool check_and_decode_one(char& c, const char src_c, bool flag) { int k = flag ? 16 : 1; int value = src_c - '0'; // 9 = ('9'-'0') if (value >= 0 && value <= 9) { c += value * k; return true; } value = src_c - 'A'; // 5 = ('F'-'A') if (value >= 0 && value <= 5) { c += (value + 10) * k; return true; } value = src_c - 'a'; // 5 = ('f'-'a') if (value >= 0 && value <= 5) { c += (value + 10) * k; return true; } // not in ( ['0','9'], ['a','f'], ['A','F'] ) return false; } static int hex_decode(const char* src_str, ColumnString::Offset src_len, char* dst_str) { // if str length is odd or 0, return empty string like mysql dose. if ((src_len & 1) != 0 or src_len == 0) { return 0; } //check and decode one character at the same time // character in ( ['0','9'], ['a','f'], ['A','F'] ), return 'NULL' like mysql dose. for (auto i = 0, dst_index = 0; i < src_len; i += 2, dst_index++) { char c = 0; // combine two character into dst_str one character bool left_4bits_flag = check_and_decode_one(c, *(src_str + i), true); bool right_4bits_flag = check_and_decode_one(c, *(src_str + i + 1), false); if (!left_4bits_flag || !right_4bits_flag) { return 0; } *(dst_str + dst_index) = c; } return src_len / 2; } static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets) { auto rows_count = offsets.size(); dst_offsets.resize(rows_count); for (int i = 0; i < rows_count; ++i) { const auto* source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); continue; } char dst_array[MAX_STACK_CIPHER_LEN]; char* dst = dst_array; int cipher_len = srclen / 2; std::unique_ptr<char[]> dst_uptr; if (cipher_len > MAX_STACK_CIPHER_LEN) { dst_uptr.reset(new char[cipher_len]); dst = dst_uptr.get(); } int outlen = hex_decode(source, srclen, dst); StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } return Status::OK(); } static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets, ColumnUInt8::Container* null_map_data) { auto rows_count = offsets.size(); dst_offsets.resize(rows_count); for (int i = 0; i < rows_count; ++i) { const auto* source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, *null_map_data); continue; } char dst_array[MAX_STACK_CIPHER_LEN]; char* dst = dst_array; int cipher_len = srclen / 2; std::unique_ptr<char[]> dst_uptr; if (cipher_len > MAX_STACK_CIPHER_LEN) { dst_uptr.reset(new char[cipher_len]); dst = dst_uptr.get(); } int outlen = hex_decode(source, srclen, dst); if (outlen == 0) { StringOP::push_null_string(i, dst_data, dst_offsets, *null_map_data); continue; } StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } return Status::OK(); } }; struct NameStringSpace { static constexpr auto name = "space"; }; struct StringSpace { using ReturnType = DataTypeString; static constexpr auto TYPE_INDEX = TypeIndex::Int32; using Type = Int32; using ReturnColumnType = ColumnString; static Status vector(const ColumnInt32::Container& data, ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { res_offsets.resize(data.size()); size_t input_size = res_offsets.size(); int64_t total_size = 0; for (size_t i = 0; i < input_size; ++i) { if (data[i] > 0) { total_size += data[i]; } } ColumnString::check_chars_length(total_size, input_size); res_data.reserve(total_size); for (size_t i = 0; i < input_size; ++i) { if (data[i] > 0) [[likely]] { res_data.resize_fill(res_data.size() + data[i], ' '); cast_set(res_offsets[i], res_data.size()); } else { StringOP::push_empty_string(i, res_data, res_offsets); } } return Status::OK(); } }; struct ToBase64Impl { static constexpr auto name = "to_base64"; using ReturnType = DataTypeString; using ColumnType = ColumnString; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets) { auto rows_count = offsets.size(); dst_offsets.resize(rows_count); for (int i = 0; i < rows_count; ++i) { const auto* source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); continue; } char dst_array[MAX_STACK_CIPHER_LEN]; char* dst = dst_array; int cipher_len = (int)(4.0 * ceil((double)srclen / 3.0)); std::unique_ptr<char[]> dst_uptr; if (cipher_len > MAX_STACK_CIPHER_LEN) { dst_uptr.reset(new char[cipher_len]); dst = dst_uptr.get(); } auto outlen = base64_encode((const unsigned char*)source, srclen, (unsigned char*)dst); StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } return Status::OK(); } }; struct FromBase64Impl { static constexpr auto name = "from_base64"; using ReturnType = DataTypeString; using ColumnType = ColumnString; static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets, NullMap& null_map) { auto rows_count = offsets.size(); dst_offsets.resize(rows_count); for (int i = 0; i < rows_count; ++i) { if (null_map[i]) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); continue; } const auto* source = reinterpret_cast<const char*>(&data[offsets[i - 1]]); ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; if (srclen == 0) { StringOP::push_empty_string(i, dst_data, dst_offsets); continue; } char dst_array[MAX_STACK_CIPHER_LEN]; char* dst = dst_array; int cipher_len = srclen; std::unique_ptr<char[]> dst_uptr; if (cipher_len > MAX_STACK_CIPHER_LEN) { dst_uptr.reset(new char[cipher_len]); dst = dst_uptr.get(); } auto outlen = base64_decode(source, srclen, dst); if (outlen < 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); } else { StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } } return Status::OK(); } }; struct StringAppendTrailingCharIfAbsent { static constexpr auto name = "append_trailing_char_if_absent"; using Chars = ColumnString::Chars; using Offsets = ColumnString::Offsets; using ReturnType = DataTypeString; using ColumnType = ColumnString; static bool str_end_with(const StringRef& str, const StringRef& end) { if (str.size < end.size) { return false; } // The end_with method of StringRef needs to ensure that the size of end is less than or equal to the size of str. return str.end_with(end); } static void vector_vector(FunctionContext* context, const Chars& ldata, const Offsets& loffsets, const Chars& rdata, const Offsets& roffsets, Chars& res_data, Offsets& res_offsets, NullMap& null_map_data) { DCHECK_EQ(loffsets.size(), roffsets.size()); size_t input_rows_count = loffsets.size(); res_offsets.resize(input_rows_count); fmt::memory_buffer buffer; for (size_t i = 0; i < input_rows_count; ++i) { buffer.clear(); StringRef lstr = StringRef(reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]), loffsets[i] - loffsets[i - 1]); StringRef rstr = StringRef(reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]), roffsets[i] - roffsets[i - 1]); // The iterate_utf8_with_limit_length function iterates over a maximum of two UTF-8 characters. auto [byte_len, char_len] = simd::VStringFunctions::iterate_utf8_with_limit_length( rstr.begin(), rstr.end(), 2); if (char_len != 1) { StringOP::push_null_string(i, res_data, res_offsets, null_map_data); continue; } if (str_end_with(lstr, rstr)) { StringOP::push_value_string(lstr, i, res_data, res_offsets); continue; } buffer.append(lstr.begin(), lstr.end()); buffer.append(rstr.begin(), rstr.end()); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offsets); } } static void vector_scalar(FunctionContext* context, const Chars& ldata, const Offsets& loffsets, const StringRef& rstr, Chars& res_data, Offsets& res_offsets, NullMap& null_map_data) { size_t input_rows_count = loffsets.size(); res_offsets.resize(input_rows_count); fmt::memory_buffer buffer; // The iterate_utf8_with_limit_length function iterates over a maximum of two UTF-8 characters. auto [byte_len, char_len] = simd::VStringFunctions::iterate_utf8_with_limit_length(rstr.begin(), rstr.end(), 2); if (char_len != 1) { for (size_t i = 0; i < input_rows_count; ++i) { StringOP::push_null_string(i, res_data, res_offsets, null_map_data); } return; } for (size_t i = 0; i < input_rows_count; ++i) { buffer.clear(); StringRef lstr = StringRef(reinterpret_cast<const char*>(&ldata[loffsets[i - 1]]), loffsets[i] - loffsets[i - 1]); if (str_end_with(lstr, rstr)) { StringOP::push_value_string(lstr, i, res_data, res_offsets); continue; } buffer.append(lstr.begin(), lstr.end()); buffer.append(rstr.begin(), rstr.end()); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offsets); } } static void scalar_vector(FunctionContext* context, const StringRef& lstr, const Chars& rdata, const Offsets& roffsets, Chars& res_data, Offsets& res_offsets, NullMap& null_map_data) { size_t input_rows_count = roffsets.size(); res_offsets.resize(input_rows_count); fmt::memory_buffer buffer; for (size_t i = 0; i < input_rows_count; ++i) { buffer.clear(); StringRef rstr = StringRef(reinterpret_cast<const char*>(&rdata[roffsets[i - 1]]), roffsets[i] - roffsets[i - 1]); // The iterate_utf8_with_limit_length function iterates over a maximum of two UTF-8 characters. auto [byte_len, char_len] = simd::VStringFunctions::iterate_utf8_with_limit_length( rstr.begin(), rstr.end(), 2); if (char_len != 1) { StringOP::push_null_string(i, res_data, res_offsets, null_map_data); continue; } if (str_end_with(lstr, rstr)) { StringOP::push_value_string(lstr, i, res_data, res_offsets); continue; } buffer.append(lstr.begin(), lstr.end()); buffer.append(rstr.begin(), rstr.end()); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offsets); } } }; struct StringLPad { static constexpr auto name = "lpad"; static constexpr auto is_lpad = true; }; struct StringRPad { static constexpr auto name = "rpad"; static constexpr auto is_lpad = false; }; template <typename LeftDataType, typename RightDataType> using StringStartsWithImpl = StringFunctionImpl<LeftDataType, RightDataType, StartsWithOp>; template <typename LeftDataType, typename RightDataType> using StringEndsWithImpl = StringFunctionImpl<LeftDataType, RightDataType, EndsWithOp>; template <typename LeftDataType, typename RightDataType> using StringFindInSetImpl = StringFunctionImpl<LeftDataType, RightDataType, FindInSetOp>; // ready for regist function using FunctionStringASCII = FunctionUnaryToType<StringASCII, NameStringASCII>; using FunctionStringLength = FunctionUnaryToType<StringLengthImpl, NameStringLength>; using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>; using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl, NameStringUtf8Length>; using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>; using FunctionStringStartsWith = FunctionBinaryToType<DataTypeString, DataTypeString, StringStartsWithImpl, NameStartsWith>; using FunctionStringEndsWith = FunctionBinaryToType<DataTypeString, DataTypeString, StringEndsWithImpl, NameEndsWith>; using FunctionStringInstr = FunctionBinaryToType<DataTypeString, DataTypeString, StringInStrImpl, NameInstr>; using FunctionStringLocate = FunctionBinaryToType<DataTypeString, DataTypeString, StringLocateImpl, NameLocate>; using FunctionStringFindInSet = FunctionBinaryToType<DataTypeString, DataTypeString, StringFindInSetImpl, NameFindInSet>; using FunctionQuote = FunctionStringToString<NameQuoteImpl, NameQuote>; using FunctionToLower = FunctionStringToString<TransferImpl<NameToLower>, NameToLower>; using FunctionToUpper = FunctionStringToString<TransferImpl<NameToUpper>, NameToUpper>; using FunctionToInitcap = FunctionStringToString<InitcapImpl, NameToInitcap>; using FunctionUnHex = FunctionStringEncode<UnHexImpl<UnHexImplEmpty>, false>; using FunctionUnHexNullable = FunctionStringEncode<UnHexImpl<UnHexImplNull>, true>; using FunctionToBase64 = FunctionStringEncode<ToBase64Impl, false>; using FunctionFromBase64 = FunctionStringOperateToNullType<FromBase64Impl>; using FunctionStringAppendTrailingCharIfAbsent = FunctionBinaryStringOperateToNullType<StringAppendTrailingCharIfAbsent>; using FunctionStringLPad = FunctionStringPad<StringLPad>; using FunctionStringRPad = FunctionStringPad<StringRPad>; void register_function_string(SimpleFunctionFactory& factory) { factory.register_function<FunctionStringASCII>(); factory.register_function<FunctionStringLength>(); factory.register_function<FunctionCrc32>(); factory.register_function<FunctionStringUTF8Length>(); factory.register_function<FunctionStringSpace>(); factory.register_function<FunctionStringStartsWith>(); factory.register_function<FunctionStringEndsWith>(); factory.register_function<FunctionStringInstr>(); factory.register_function<FunctionStringFindInSet>(); factory.register_function<FunctionStringLocate>(); factory.register_function<FunctionStringLocatePos>(); factory.register_function<FunctionQuote>(); factory.register_function<FunctionAutoPartitionName>(); factory.register_function<FunctionReverseCommon>(); factory.register_function<FunctionUnHex>(); factory.register_function<FunctionUnHexNullable>(); factory.register_function<FunctionToLower>(); factory.register_function<FunctionToUpper>(); factory.register_function<FunctionToInitcap>(); factory.register_function<FunctionTrim<Trim1Impl<true, true, NameTrim>>>(); factory.register_function<FunctionTrim<Trim1Impl<true, false, NameLTrim>>>(); factory.register_function<FunctionTrim<Trim1Impl<false, true, NameRTrim>>>(); factory.register_function<FunctionTrim<Trim2Impl<true, true, NameTrim>>>(); factory.register_function<FunctionTrim<Trim2Impl<true, false, NameLTrim>>>(); factory.register_function<FunctionTrim<Trim2Impl<false, true, NameRTrim>>>(); factory.register_function<FunctionTrim<Trim1Impl<true, true, NameTrimIn>>>(); factory.register_function<FunctionTrim<Trim1Impl<true, false, NameLTrimIn>>>(); factory.register_function<FunctionTrim<Trim1Impl<false, true, NameRTrimIn>>>(); factory.register_function<FunctionTrim<Trim2Impl<true, true, NameTrimIn>>>(); factory.register_function<FunctionTrim<Trim2Impl<true, false, NameLTrimIn>>>(); factory.register_function<FunctionTrim<Trim2Impl<false, true, NameRTrimIn>>>(); factory.register_function<FunctionConvertTo>(); factory.register_function<FunctionSubstring<Substr3Impl>>(); factory.register_function<FunctionSubstring<Substr2Impl>>(); factory.register_function<FunctionLeft>(); factory.register_function<FunctionRight>(); factory.register_function<FunctionNullOrEmpty>(); factory.register_function<FunctionNotNullOrEmpty>(); factory.register_function<FunctionStringConcat>(); factory.register_function<FunctionIntToChar>(); factory.register_function<FunctionStringElt>(); factory.register_function<FunctionStringConcatWs>(); factory.register_function<FunctionStringAppendTrailingCharIfAbsent>(); factory.register_function<FunctionStringRepeat>(); factory.register_function<FunctionStringLPad>(); factory.register_function<FunctionStringRPad>(); factory.register_function<FunctionToBase64>(); factory.register_function<FunctionFromBase64>(); factory.register_function<FunctionSplitPart>(); factory.register_function<FunctionSplitByString>(); factory.register_function<FunctionCountSubString>(); factory.register_function<FunctionSubstringIndex>(); factory.register_function<FunctionExtractURLParameter>(); factory.register_function<FunctionStringParseUrl>(); factory.register_function<FunctionUrlDecode>(); factory.register_function<FunctionUrlEncode>(); factory.register_function<FunctionRandomBytes>(); factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>(); factory.register_function<FunctionMoneyFormat<MoneyFormatInt64Impl>>(); factory.register_function<FunctionMoneyFormat<MoneyFormatInt128Impl>>(); factory.register_function<FunctionMoneyFormat<MoneyFormatDecimalImpl>>(); factory.register_function<FunctionStringFormatRound<FormatRoundDoubleImpl>>(); factory.register_function<FunctionStringFormatRound<FormatRoundInt64Impl>>(); factory.register_function<FunctionStringFormatRound<FormatRoundInt128Impl>>(); factory.register_function<FunctionStringFormatRound<FormatRoundDecimalImpl>>(); factory.register_function<FunctionStringDigestOneArg<SM3Sum>>(); factory.register_function<FunctionStringDigestOneArg<MD5Sum>>(); factory.register_function<FunctionStringDigestSHA1>(); factory.register_function<FunctionStringDigestSHA2>(); factory.register_function<FunctionReplace<ReplaceImpl, true>>(); factory.register_function<FunctionReplace<ReplaceEmptyImpl, false>>(); factory.register_function<FunctionTranslate>(); factory.register_function<FunctionMask>(); factory.register_function<FunctionMaskPartial<true>>(); factory.register_function<FunctionMaskPartial<false>>(); factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>(); factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>(); factory.register_function<FunctionOverlay>(); factory.register_function<FunctionStrcmp>(); factory.register_function<FunctionNgramSearch>(); factory.register_function<FunctionXPathString>(); factory.register_alias(FunctionLeft::name, "strleft"); factory.register_alias(FunctionRight::name, "strright"); factory.register_alias(SubstringUtil::name, "substr"); factory.register_alias(FunctionToLower::name, "lcase"); factory.register_alias(FunctionToUpper::name, "ucase"); factory.register_alias(FunctionStringDigestOneArg<MD5Sum>::name, "md5"); factory.register_alias(FunctionStringUTF8Length::name, "character_length"); factory.register_alias(FunctionStringDigestOneArg<SM3Sum>::name, "sm3"); factory.register_alias(FunctionStringDigestSHA1::name, "sha"); } } // namespace doris::vectorized