be/src/kudu/gutil/strings/split.cc

// Copyright 2008 and onwards Google Inc. All rights reserved. // // Maintainer: Greg Miller <jgm@google.com> #include "gutil/strings/split.h" #include <assert.h> #include <stdlib.h> #include <string.h> #include <iterator> using std::back_insert_iterator; using std::iterator_traits; #include <limits> using std::numeric_limits; using std::unordered_map; using std::unordered_set; #include "gutil/integral_types.h" #include <common/logging.h> #include "gutil/logging-inl.h" #include "gutil/macros.h" #include "gutil/strtoint.h" #include "gutil/strings/ascii_ctype.h" #include "gutil/strings/util.h" #include "gutil/hash/hash.h" // Implementations for some of the Split2 API. Much of the Split2 API is // templated so it exists in header files, either strings/split.h or // strings/split_iternal.h. namespace strings { namespace delimiter { namespace { // This GenericFind() template function encapsulates the finding algorithm // shared between the Literal and AnyOf delimiters. The FindPolicy template // parameter allows each delimiter to customize the actual find function to use // and the length of the found delimiter. For example, the Literal delimiter // will ultimately use StringPiece::find(), and the AnyOf delimiter will use // StringPiece::find_first_of(). template <typename FindPolicy> StringPiece GenericFind( StringPiece text, StringPiece delimiter, FindPolicy find_policy) { if (delimiter.empty() && text.length() > 0) { // Special case for empty string delimiters: always return a zero-length // StringPiece referring to the item at position 1. return StringPiece(text.begin() + 1, 0); } int found_pos = StringPiece::npos; StringPiece found(text.end(), 0); // By default, not found found_pos = find_policy.Find(text, delimiter); if (found_pos != StringPiece::npos) { found.set(text.data() + found_pos, find_policy.Length(delimiter)); } return found; } // Finds using StringPiece::find(), therefore the length of the found delimiter // is delimiter.length(). struct LiteralPolicy { int Find(StringPiece text, StringPiece delimiter) { return text.find(delimiter); } int Length(StringPiece delimiter) { return delimiter.length(); } }; // Finds using StringPiece::find_first_of(), therefore the length of the found // delimiter is 1. struct AnyOfPolicy { size_t Find(StringPiece text, StringPiece delimiter) { return text.find_first_of(delimiter); } int Length(StringPiece delimiter) { return 1; } }; } // namespace // // Literal // Literal::Literal(StringPiece sp) : delimiter_(sp.ToString()) { } StringPiece Literal::Find(StringPiece text) const { return GenericFind(text, delimiter_, LiteralPolicy()); } // // AnyOf // AnyOf::AnyOf(StringPiece sp) : delimiters_(sp.ToString()) { } StringPiece AnyOf::Find(StringPiece text) const { return GenericFind(text, delimiters_, AnyOfPolicy()); } } // namespace delimiter } // namespace strings // // ==================== LEGACY SPLIT FUNCTIONS ==================== // using ::strings::SkipEmpty; using ::strings::delimiter::AnyOf; using ::strings::delimiter::Limit; namespace { // Appends the results of a split to the specified container. This function has // the following overloads: // - vector<string> - for better performance // - map<string, string> - to change append semantics // - unordered_map<string, string> - to change append semantics template <typename Container, typename Splitter> void AppendToImpl(Container* container, Splitter splitter) { Container c = splitter; // Calls implicit conversion operator. std::copy(c.begin(), c.end(), std::inserter(*container, container->end())); } // Overload of AppendToImpl() that is optimized for appending to vector<string>. // This version eliminates a couple string copies by using a vector<StringPiece> // as the intermediate container. template <typename Splitter> void AppendToImpl(vector<string>* container, Splitter splitter) { vector<StringPiece> vsp = splitter; // Calls implicit conversion operator. size_t container_size = container->size(); container->resize(container_size + vsp.size()); for (const auto& sp : vsp) { sp.CopyToString(&(*container)[container_size++]); } } // Here we define two AppendToImpl() overloads for map<> and unordered_map<>. Both of // these overloads call through to this AppendToMap() function. This is needed // because inserting a duplicate key into a map does NOT overwrite the previous // value, which was not the behavior of the split1 Split*() functions. Consider // this example: // // map<string, string> m; // m.insert(std::make_pair("a", "1")); // m.insert(std::make_pair("a", "2")); // <-- doesn't actually insert. // ASSERT_EQ(m["a"], "1"); // <-- "a" has value "1" not "2". // // Due to this behavior of map::insert, we can't rely on a normal std::inserter // for a maps. Instead, maps and unordered_maps need to be special cased to implement // the desired append semantic of inserting an existing value overwrites the // previous value. // // This same issue is true with sets as well. However, since sets don't have a // separate key and value, failing to overwrite an existing value in a set is // fine because the value already exists in the set. // template <typename Map, typename Splitter> void AppendToMap(Map* m, Splitter splitter) { Map tmp = splitter; // Calls implicit conversion operator. for (typename Map::const_iterator it = tmp.begin(); it != tmp.end(); ++it) { (*m)[it->first] = it->second; } } template <typename Splitter> void AppendToImpl(map<string, string>* map_container, Splitter splitter) { AppendToMap(map_container, splitter); } template <typename Splitter> void AppendToImpl(unordered_map<string, string>* map_container, Splitter splitter) { AppendToMap(map_container, splitter); } // Appends the results of a call to strings::Split() to the specified container. // This function is used with the new strings::Split() API to implement the // append semantics of the legacy Split*() functions. // // The "Splitter" template parameter is intended to be a // ::strings::internal::Splitter<>, which is the return value of a call to // strings::Split(). Sample usage: // // vector<string> v; // ... add stuff to "v" ... // AppendTo(&v, strings::Split("a,b,c", ",")); // template <typename Container, typename Splitter> void AppendTo(Container* container, Splitter splitter) { if (container->empty()) { // "Appending" to an empty container is by far the common case. For this we // assign directly to the output container, which is more efficient than // explicitly appending. *container = splitter; // Calls implicit conversion operator. } else { AppendToImpl(container, splitter); } } } // anonymous namespace // Constants for ClipString() static const int kMaxOverCut = 12; // The ellipsis to add to strings that are too long static const char kCutStr[] = "..."; static const int kCutStrSize = sizeof(kCutStr) - 1; // ---------------------------------------------------------------------- // Return the place to clip the string at, or -1 // if the string doesn't need to be clipped. // ---------------------------------------------------------------------- static int ClipStringHelper(const char* str, int max_len, bool use_ellipsis) { if (strlen(str) <= max_len) return -1; int max_substr_len = max_len; if (use_ellipsis && max_len > kCutStrSize) { max_substr_len -= kCutStrSize; } const char* cut_by = (max_substr_len < kMaxOverCut ? str : str + max_len - kMaxOverCut); const char* cut_at = str + max_substr_len; while (!ascii_isspace(*cut_at) && cut_at > cut_by) cut_at--; if (cut_at == cut_by) { // No space was found return max_substr_len; } else { return cut_at-str; } } // ---------------------------------------------------------------------- // ClipString // Clip a string to a max length. We try to clip on a word boundary // if this is possible. If the string is clipped, we append an // ellipsis. // ---------------------------------------------------------------------- void ClipString(char* str, int max_len) { int cut_at = ClipStringHelper(str, max_len, true); if (cut_at != -1) { if (max_len > kCutStrSize) { strcpy(str+cut_at, kCutStr); } else { strcpy(str+cut_at, ""); } } } // ---------------------------------------------------------------------- // ClipString // Version of ClipString() that uses string instead of char*. // ---------------------------------------------------------------------- void ClipString(string* full_str, int max_len) { int cut_at = ClipStringHelper(full_str->c_str(), max_len, true); if (cut_at != -1) { full_str->erase(cut_at); if (max_len > kCutStrSize) { full_str->append(kCutStr); } } } // ---------------------------------------------------------------------- // SplitStringToIteratorAllowEmpty() // Split a string using a character delimiter. Append the components // to 'result'. If there are consecutive delimiters, this function // will return corresponding empty strings. The string is split into // at most the specified number of pieces greedily. This means that the // last piece may possibly be split further. To split into as many pieces // as possible, specify 0 as the number of pieces. // // If "full" is the empty string, yields an empty string as the only value. // // If "pieces" is negative for some reason, it returns the whole string // ---------------------------------------------------------------------- template <typename StringType, typename ITR> static inline void SplitStringToIteratorAllowEmpty(const StringType& full, const char* delim, int pieces, ITR& result) { string::size_type begin_index, end_index; begin_index = 0; for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { end_index = full.find_first_of(delim, begin_index); if (end_index == string::npos) { *result++ = full.substr(begin_index); return; } *result++ = full.substr(begin_index, (end_index - begin_index)); begin_index = end_index + 1; } *result++ = full.substr(begin_index); } void SplitStringIntoNPiecesAllowEmpty(const string& full, const char* delim, int pieces, vector<string>* result) { if (pieces == 0) { // No limit when pieces is 0. AppendTo(result, strings::Split(full, AnyOf(delim))); } else { // The input argument "pieces" specifies the max size that *result should // be. However, the argument to the Limit() delimiter is the max number of // delimiters, which should be one less than "pieces". Example: "a,b,c" has // 3 pieces and two comma delimiters. int limit = std::max(pieces - 1, 0); AppendTo(result, strings::Split(full, Limit(AnyOf(delim), limit))); } } // ---------------------------------------------------------------------- // SplitStringAllowEmpty // Split a string using a character delimiter. Append the components // to 'result'. If there are consecutive delimiters, this function // will return corresponding empty strings. // ---------------------------------------------------------------------- void SplitStringAllowEmpty(const string& full, const char* delim, vector<string>* result) { AppendTo(result, strings::Split(full, AnyOf(delim))); } // If we know how much to allocate for a vector of strings, we can // allocate the vector<string> only once and directly to the right size. // This saves in between 33-66 % of memory space needed for the result, // and runs faster in the microbenchmarks. // // The reserve is only implemented for the single character delim. // // The implementation for counting is cut-and-pasted from // SplitStringToIteratorUsing. I could have written my own counting iterator, // and use the existing template function, but probably this is more clear // and more sure to get optimized to reasonable code. static int CalculateReserveForVector(const string& full, const char* delim) { int count = 0; if (delim[0] != '\0' && delim[1] == '\0') { // Optimize the common case where delim is a single character. char c = delim[0]; const char* p = full.data(); const char* end = p + full.size(); while (p != end) { if (*p == c) { // This could be optimized with hasless(v,1) trick. ++p; } else { while (++p != end && *p != c) { // Skip to the next occurence of the delimiter. } ++count; } } } return count; } // ---------------------------------------------------------------------- // SplitStringUsing() // SplitStringToHashsetUsing() // SplitStringToSetUsing() // SplitStringToMapUsing() // SplitStringToHashmapUsing() // Split a string using a character delimiter. Append the components // to 'result'. // // Note: For multi-character delimiters, this routine will split on *ANY* of // the characters in the string, not the entire string as a single delimiter. // ---------------------------------------------------------------------- template <typename StringType, typename ITR> static inline void SplitStringToIteratorUsing(const StringType& full, const char* delim, ITR& result) { // Optimize the common case where delim is a single character. if (delim[0] != '\0' && delim[1] == '\0') { char c = delim[0]; const char* p = full.data(); const char* end = p + full.size(); while (p != end) { if (*p == c) { ++p; } else { const char* start = p; while (++p != end && *p != c) { // Skip to the next occurence of the delimiter. } *result++ = StringType(start, p - start); } } return; } string::size_type begin_index, end_index; begin_index = full.find_first_not_of(delim); while (begin_index != string::npos) { end_index = full.find_first_of(delim, begin_index); if (end_index == string::npos) { *result++ = full.substr(begin_index); return; } *result++ = full.substr(begin_index, (end_index - begin_index)); begin_index = full.find_first_not_of(delim, end_index); } } void SplitStringUsing(const string& full, const char* delim, vector<string>* result) { result->reserve(result->size() + CalculateReserveForVector(full, delim)); std::back_insert_iterator< vector<string> > it(*result); SplitStringToIteratorUsing(full, delim, it); } void SplitStringToHashsetUsing(const string& full, const char* delim, unordered_set<string>* result) { AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); } void SplitStringToSetUsing(const string& full, const char* delim, set<string>* result) { AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); } void SplitStringToMapUsing(const string& full, const char* delim, map<string, string>* result) { AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); } void SplitStringToHashmapUsing(const string& full, const char* delim, unordered_map<string, string>* result) { AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); } // ---------------------------------------------------------------------- // SplitStringPieceToVector() // Split a StringPiece into sub-StringPieces based on delim // and appends the pieces to 'vec'. // If omit empty strings is true, empty strings are omitted // from the resulting vector. // ---------------------------------------------------------------------- void SplitStringPieceToVector(const StringPiece& full, const char* delim, vector<StringPiece>* vec, bool omit_empty_strings) { if (omit_empty_strings) { AppendTo(vec, strings::Split(full, AnyOf(delim), SkipEmpty())); } else { AppendTo(vec, strings::Split(full, AnyOf(delim))); } } // ---------------------------------------------------------------------- // SplitUsing() // Split a string using a string of delimiters, returning vector // of strings. The original string is modified to insert nulls. // ---------------------------------------------------------------------- vector<char*>* SplitUsing(char* full, const char* delim) { auto vec = new vector<char*>; SplitToVector(full, delim, vec, true); // Omit empty strings return vec; } void SplitToVector(char* full, const char* delim, vector<char*>* vec, bool omit_empty_strings) { char* next = full; while ((next = gstrsep(&full, delim)) != nullptr) { if (omit_empty_strings && next[0] == '\0') continue; vec->push_back(next); } // Add last element (or full string if no delimeter found): if (full != nullptr) { vec->push_back(full); } } void SplitToVector(char* full, const char* delim, vector<const char*>* vec, bool omit_empty_strings) { char* next = full; while ((next = gstrsep(&full, delim)) != nullptr) { if (omit_empty_strings && next[0] == '\0') continue; vec->push_back(next); } // Add last element (or full string if no delimeter found): if (full != nullptr) { vec->push_back(full); } } // ---------------------------------------------------------------------- // SplitOneStringToken() // Mainly a stringified wrapper around strpbrk() // ---------------------------------------------------------------------- string SplitOneStringToken(const char ** source, const char * delim) { assert(source); assert(delim); if (!*source) { return string(); } const char * begin = *source; // Optimize the common case where delim is a single character. if (delim[0] != '\0' && delim[1] == '\0') { *source = strchr(*source, delim[0]); } else { *source = strpbrk(*source, delim); } if (*source) { return string(begin, (*source)++); } else { return string(begin); } } // ---------------------------------------------------------------------- // SplitStringWithEscaping() // SplitStringWithEscapingAllowEmpty() // SplitStringWithEscapingToSet() // SplitStringWithWithEscapingToHashset() // Split the string using the specified delimiters, taking escaping into // account. '\' is not allowed as a delimiter. // ---------------------------------------------------------------------- template <typename ITR> static inline void SplitStringWithEscapingToIterator(const string& src, const strings::CharSet& delimiters, const bool allow_empty, ITR* result) { CHECK(!delimiters.Test('\\')) << "\\ is not allowed as a delimiter."; CHECK(result); string part; for (uint32 i = 0; i < src.size(); ++i) { char current_char = src[i]; if (delimiters.Test(current_char)) { // Push substrings when we encounter delimiters. if (allow_empty || !part.empty()) { *(*result)++ = part; part.clear(); } } else if (current_char == '\\' && ++i < src.size()) { // If we see a backslash, the next delimiter or backslash is literal. current_char = src[i]; if (current_char != '\\' && !delimiters.Test(current_char)) { // Don't honour unknown escape sequences: emit \f for \f. part.push_back('\\'); } part.push_back(current_char); } else { // Otherwise, we have a normal character or trailing backslash. part.push_back(current_char); } } // Push the trailing part. if (allow_empty || !part.empty()) { *(*result)++ = part; } } void SplitStringWithEscaping(const string &full, const strings::CharSet& delimiters, vector<string> *result) { std::back_insert_iterator< vector<string> > it(*result); SplitStringWithEscapingToIterator(full, delimiters, false, &it); } void SplitStringWithEscapingAllowEmpty(const string &full, const strings::CharSet& delimiters, vector<string> *result) { std::back_insert_iterator< vector<string> > it(*result); SplitStringWithEscapingToIterator(full, delimiters, true, &it); } void SplitStringWithEscapingToSet(const string &full, const strings::CharSet& delimiters, set<string> *result) { std::insert_iterator< set<string> > it(*result, result->end()); SplitStringWithEscapingToIterator(full, delimiters, false, &it); } void SplitStringWithEscapingToHashset(const string &full, const strings::CharSet& delimiters, unordered_set<string> *result) { std::insert_iterator< unordered_set<string> > it(*result, result->end()); SplitStringWithEscapingToIterator(full, delimiters, false, &it); } // ---------------------------------------------------------------------- // SplitOneIntToken() // SplitOneInt32Token() // SplitOneUint32Token() // SplitOneInt64Token() // SplitOneUint64Token() // SplitOneDoubleToken() // SplitOneFloatToken() // SplitOneDecimalIntToken() // SplitOneDecimalInt32Token() // SplitOneDecimalUint32Token() // SplitOneDecimalInt64Token() // SplitOneDecimalUint64Token() // SplitOneHexUint32Token() // SplitOneHexUint64Token() // Mainly a stringified wrapper around strtol/strtoul/strtod // ---------------------------------------------------------------------- // Curried functions for the macro below static inline long strto32_0(const char * source, char ** end) { return strto32(source, end, 0); } static inline unsigned long strtou32_0(const char * source, char ** end) { return strtou32(source, end, 0); } static inline int64 strto64_0(const char * source, char ** end) { return strto64(source, end, 0); } static inline uint64 strtou64_0(const char * source, char ** end) { return strtou64(source, end, 0); } static inline long strto32_10(const char * source, char ** end) { return strto32(source, end, 10); } static inline unsigned long strtou32_10(const char * source, char ** end) { return strtou32(source, end, 10); } static inline int64 strto64_10(const char * source, char ** end) { return strto64(source, end, 10); } static inline uint64 strtou64_10(const char * source, char ** end) { return strtou64(source, end, 10); } static inline uint32 strtou32_16(const char * source, char ** end) { return strtou32(source, end, 16); } static inline uint64 strtou64_16(const char * source, char ** end) { return strtou64(source, end, 16); } #define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \ bool SplitOne##name##Token(const char ** source, const char * delim, \ type * value) { \ assert(source); \ assert(delim); \ assert(value); \ if (!*source) \ return false; \ /* Parse int */ \ char * end; \ *value = function(*source, &end); \ if (end == *source) \ return false; /* number not present at start of string */ \ if (end[0] && !strchr(delim, end[0])) \ return false; /* Garbage characters after int */ \ /* Advance past token */ \ if (*end != '\0') \ *source = const_cast<const char *>(end+1); \ else \ *source = NULL; \ return true; \ } DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) #ifdef _MSC_VER // has no strtof() // Note: does an implicit cast to float. DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) #else DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) #endif DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) // ---------------------------------------------------------------------- // SplitRange() // Splits a string of the form "<from>-<to>". Either or both can be // missing. A raw number (<to>) is interpreted as "<to>-". Modifies // parameters insofar as they're specified by the string. RETURNS // true iff the input is a well-formed range. If it RETURNS false, // from and to remain unchanged. The range in rangestr should be // terminated either by "\0" or by whitespace. // ---------------------------------------------------------------------- #define EOS(ch) ( (ch) == '\0' || ascii_isspace(ch) ) bool SplitRange(const char* rangestr, int* from, int* to) { // We need to do the const-cast because strol takes a char**, not const char** char* val = const_cast<char*>(rangestr); if (val == nullptr || EOS(*val)) return true; // we'll say nothingness is ok if ( val[0] == '-' && EOS(val[1]) ) // CASE 1: - return true; // nothing changes if ( val[0] == '-' ) { // CASE 2: -<i2> const int int2 = strto32(val+1, &val, 10); if ( !EOS(*val) ) return false; // not a valid integer *to = int2; // only "to" changes return true; } else { const int int1 = strto32(val, &val, 10); if ( EOS(*val) || (*val == '-' && EOS(*(val+1))) ) { *from = int1; // CASE 3: <i1>, same as <i1>- return true; // only "from" changes } else if (*val != '-') { // not a valid range return false; } const int int2 = strto32(val+1, &val, 10); if ( !EOS(*val) ) return false; // not a valid integer *from = int1; // CASE 4: <i1>-<i2> *to = int2; return true; } } void SplitCSVLineWithDelimiter(char* line, char delimiter, vector<char*>* cols) { char* end_of_line = line + strlen(line); char* end; char* start; for (; line < end_of_line; line++) { // Skip leading whitespace, unless said whitespace is the delimiter. while (ascii_isspace(*line) && *line != delimiter) ++line; if (*line == '"' && delimiter == ',') { // Quoted value... start = ++line; end = start; for (; *line; line++) { if (*line == '"') { line++; if (*line != '"') // [""] is an escaped ["] break; // but just ["] is end of value } *end++ = *line; } // All characters after the closing quote and before the comma // are ignored. line = strchr(line, delimiter); if (!line) line = end_of_line; } else { start = line; line = strchr(line, delimiter); if (!line) line = end_of_line; // Skip all trailing whitespace, unless said whitespace is the delimiter. for (end = line; end > start; --end) { if (!ascii_isspace(end[-1]) || end[-1] == delimiter) break; } } const bool need_another_column = (*line == delimiter) && (line == end_of_line - 1); *end = '\0'; cols->push_back(start); // If line was something like [paul,] (comma is the last character // and is not proceeded by whitespace or quote) then we are about // to eliminate the last column (which is empty). This would be // incorrect. if (need_another_column) cols->push_back(end); assert(*line == '\0' || *line == delimiter); } } void SplitCSVLine(char* line, vector<char*>* cols) { SplitCSVLineWithDelimiter(line, ',', cols); } void SplitCSVLineWithDelimiterForStrings(const string &line, char delimiter, vector<string> *cols) { // Unfortunately, the interface requires char* instead of const char* // which requires copying the string. char *cline = strndup_with_new(line.c_str(), line.size()); vector<char *> v; SplitCSVLineWithDelimiter(cline, delimiter, &v); for (vector<char*>::const_iterator ci = v.begin(); ci != v.end(); ++ci) { cols->push_back(*ci); } delete[] cline; } // ---------------------------------------------------------------------- namespace { // Helper class used by SplitStructuredLineInternal. class ClosingSymbolLookup { public: explicit ClosingSymbolLookup(const char* symbol_pairs) : closing_(), valid_closing_() { // Initialize the opening/closing arrays. for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) { unsigned char opening = *symbol; ++symbol; // If the string ends before the closing character has been found, // use the opening character as the closing character. unsigned char closing = *symbol != 0 ? *symbol : opening; closing_[opening] = closing; valid_closing_[closing] = true; if (*symbol == 0) break; } } // Returns the closing character corresponding to an opening one, // or 0 if the argument is not an opening character. char GetClosingChar(char opening) const { return closing_[static_cast<unsigned char>(opening)]; } // Returns true if the argument is a closing character. bool IsClosing(char c) const { return valid_closing_[static_cast<unsigned char>(c)]; } private: // Maps an opening character to its closing. If the entry contains 0, // the character is not in the opening set. char closing_[256]; // Valid closing characters. bool valid_closing_[256]; DISALLOW_COPY_AND_ASSIGN(ClosingSymbolLookup); }; char* SplitStructuredLineInternal(char* line, char delimiter, const char* symbol_pairs, vector<char*>* cols, bool with_escapes) { ClosingSymbolLookup lookup(symbol_pairs); // Stack of symbols expected to close the current opened expressions. vector<char> expected_to_close; bool in_escape = false; CHECK(cols); cols->push_back(line); char* current; for (current = line; *current; ++current) { char c = *current; if (in_escape) { in_escape = false; } else if (with_escapes && c == '\\') { // We are escaping the next character. Note the escape still appears // in the output. in_escape = true; } else if (expected_to_close.empty() && c == delimiter) { // We don't have any open expression, this is a valid separator. *current = 0; cols->push_back(current + 1); } else if (!expected_to_close.empty() && c == expected_to_close.back()) { // Can we close the currently open expression? expected_to_close.pop_back(); } else if (lookup.GetClosingChar(c)) { // If this is an opening symbol, we open a new expression and push // the expected closing symbol on the stack. expected_to_close.push_back(lookup.GetClosingChar(c)); } else if (lookup.IsClosing(c)) { // Error: mismatched closing symbol. return current; } } if (!expected_to_close.empty()) { return current; // Missing closing symbol(s) } return nullptr; // Success } bool SplitStructuredLineInternal(StringPiece line, char delimiter, const char* symbol_pairs, vector<StringPiece>* cols, bool with_escapes) { ClosingSymbolLookup lookup(symbol_pairs); // Stack of symbols expected to close the current opened expressions. vector<char> expected_to_close; bool in_escape = false; CHECK_NOTNULL(cols); cols->push_back(line); for (int i = 0; i < line.size(); ++i) { char c = line[i]; if (in_escape) { in_escape = false; } else if (with_escapes && c == '\\') { // We are escaping the next character. Note the escape still appears // in the output. in_escape = true; } else if (expected_to_close.empty() && c == delimiter) { // We don't have any open expression, this is a valid separator. cols->back().remove_suffix(line.size() - i); cols->push_back(StringPiece(line, i + 1)); } else if (!expected_to_close.empty() && c == expected_to_close.back()) { // Can we close the currently open expression? expected_to_close.pop_back(); } else if (lookup.GetClosingChar(c)) { // If this is an opening symbol, we open a new expression and push // the expected closing symbol on the stack. expected_to_close.push_back(lookup.GetClosingChar(c)); } else if (lookup.IsClosing(c)) { // Error: mismatched closing symbol. return false; } } if (!expected_to_close.empty()) { return false; // Missing closing symbol(s) } return true; // Success } } // anonymous namespace char* SplitStructuredLine(char* line, char delimiter, const char *symbol_pairs, vector<char*>* cols) { return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, false); } bool SplitStructuredLine(StringPiece line, char delimiter, const char* symbol_pairs, vector<StringPiece>* cols) { return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, false); } char* SplitStructuredLineWithEscapes(char* line, char delimiter, const char *symbol_pairs, vector<char*>* cols) { return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, true); } bool SplitStructuredLineWithEscapes(StringPiece line, char delimiter, const char* symbol_pairs, vector<StringPiece>* cols) { return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, true); } // ---------------------------------------------------------------------- // SplitStringIntoKeyValues() // ---------------------------------------------------------------------- bool SplitStringIntoKeyValues(const string& line, const string& key_value_delimiters, const string& value_value_delimiters, string *key, vector<string> *values) { key->clear(); values->clear(); // find the key string size_t end_key_pos = line.find_first_of(key_value_delimiters); if (end_key_pos == string::npos) { VLOG(1) << "cannot parse key from line: " << line; return false; // no key } key->assign(line, 0, end_key_pos); // find the values string string remains(line, end_key_pos, line.size() - end_key_pos); size_t begin_values_pos = remains.find_first_not_of(key_value_delimiters); if (begin_values_pos == string::npos) { VLOG(1) << "cannot parse value from line: " << line; return false; // no value } string values_string(remains, begin_values_pos, remains.size() - begin_values_pos); // construct the values vector if (value_value_delimiters.empty()) { // one value values->push_back(values_string); } else { // multiple values SplitStringUsing(values_string, value_value_delimiters.c_str(), values); if (values->size() < 1) { VLOG(1) << "cannot parse value from line: " << line; return false; // no value } } return true; } bool SplitStringIntoKeyValuePairs(const string& line, const string& key_value_delimiters, const string& key_value_pair_delimiters, vector<pair<string, string> >* kv_pairs) { kv_pairs->clear(); vector<string> pairs; SplitStringUsing(line, key_value_pair_delimiters.c_str(), &pairs); bool success = true; for (const auto& pair : pairs) { string key; vector<string> value; if (!SplitStringIntoKeyValues(pair, key_value_delimiters, "", &key, &value)) { // Don't return here, to allow for keys without associated // values; just record that our split failed. success = false; } // we expect atmost one value because we passed in an empty vsep to // SplitStringIntoKeyValues DCHECK_LE(value.size(), 1); kv_pairs->push_back(make_pair(key, value.empty()? "" : value[0])); } return success; } // ---------------------------------------------------------------------- // SplitLeadingDec32Values() // SplitLeadingDec64Values() // A simple parser for space-separated decimal int32/int64 values. // Appends parsed integers to the end of the result vector, stopping // at the first unparsable spot. Skips past leading and repeated // whitespace (does not consume trailing whitespace), and returns // a pointer beyond the last character parsed. // -------------------------------------------------------------------- const char* SplitLeadingDec32Values(const char *str, vector<int32> *result) { for (;;) { char *end = nullptr; long value = strtol(str, &end, 10); if (end == str) break; // Limit long values to int32 min/max. Needed for lp64. if (value > numeric_limits<int32>::max()) { value = numeric_limits<int32>::max(); } else if (value < numeric_limits<int32>::min()) { value = numeric_limits<int32>::min(); } result->push_back(value); str = end; if (!ascii_isspace(*end)) break; } return str; } const char* SplitLeadingDec64Values(const char *str, vector<int64> *result) { for (;;) { char *end = nullptr; const int64 value = strtoll(str, &end, 10); if (end == str) break; result->push_back(value); str = end; if (!ascii_isspace(*end)) break; } return str; } void SplitStringToLines(const char* full, int max_len, int num_lines, vector<string>* result) { if (max_len <= 0) { return; } int pos = 0; for (int i = 0; (i < num_lines || num_lines <= 0); i++) { int cut_at = ClipStringHelper(full+pos, max_len, (i == num_lines - 1)); if (cut_at == -1) { result->push_back(string(full+pos)); return; } result->push_back(string(full+pos, cut_at)); if (i == num_lines - 1 && max_len > kCutStrSize) { result->at(i).append(kCutStr); } pos += cut_at; } }

be/src/kudu/gutil/strings/split.cc (730 lines of code) (raw):