src/kudu/gutil/strings/numbers.cc

// Copyright 2010 Google Inc. All Rights Reserved. // Refactored from contributions of various authors in strings/strutil.cc // // This file contains string processing functions related to // numeric values. #include "kudu/gutil/strings/numbers.h" #include <sys/types.h> #include <cassert> #include <cctype> #include <cerrno> #include <cfloat> // for DBL_DIG and FLT_DIG #include <cmath> // for HUGE_VAL #include <cstdio> #include <cstdlib> #include <cstring> #include <limits> #include <ostream> #include <string> #include <glog/logging.h> #include "kudu/gutil/int128.h" #include "kudu/gutil/integral_types.h" #include "kudu/gutil/gscoped_ptr.h" #include "kudu/gutil/stringprintf.h" #include "kudu/gutil/strtoint.h" #include "kudu/gutil/strings/ascii_ctype.h" using std::numeric_limits; using std::string; // Reads a <double> in *text, which may not be whitespace-initiated. // *len is the length, or -1 if text is '\0'-terminated, which is more // efficient. Sets *text to the end of the double, and val to the // converted value, and the length of the double is subtracted from // *len. <double> may also be a '?', in which case val will be // unchanged. Returns true upon success. If initial_minus is // non-NULL, then *initial_minus will indicate whether the first // symbol seen was a '-', which will be ignored. Similarly, if // final_period is non-NULL, then *final_period will indicate whether // the last symbol seen was a '.', which will be ignored. This is // useful in case that an initial '-' or final '.' would have another // meaning (as a separator, e.g.). static inline bool EatADouble(const char** text, int* len, bool allow_question, double* val, bool* initial_minus, bool* final_period) { const char* pos = *text; int rem = *len; // remaining length, or -1 if null-terminated if (pos == nullptr || rem == 0) return false; if (allow_question && (*pos == '?')) { *text = pos + 1; if (rem != -1) *len = rem - 1; return true; } if (initial_minus) { if ((*initial_minus = (*pos == '-'))) { // Yes, we want assignment. if (rem == 1) return false; ++pos; if (rem != -1) --rem; } } // a double has to begin one of these (we don't allow 'inf' or whitespace) // this also serves as an optimization. if (!strchr("-+.0123456789", *pos)) return false; // strtod is evil in that the second param is a non-const char** char* end_nonconst; double retval; if (rem == -1) { retval = strtod(pos, &end_nonconst); } else { // not '\0'-terminated & no obvious terminator found. must copy. gscoped_array<char> buf(new char[rem + 1]); memcpy(buf.get(), pos, rem); buf[rem] = '\0'; retval = strtod(buf.get(), &end_nonconst); end_nonconst = const_cast<char*>(pos) + (end_nonconst - buf.get()); } if (pos == end_nonconst) return false; if (final_period) { *final_period = (end_nonconst[-1] == '.'); if (*final_period) { --end_nonconst; } } *text = end_nonconst; *val = retval; if (rem != -1) *len = rem - (end_nonconst - pos); return true; } // If update, consume one of acceptable_chars from string *text of // length len and return that char, or '\0' otherwise. If len is -1, // *text is null-terminated. If update is false, don't alter *text and // *len. If null_ok, then update must be false, and, if text has no // more chars, then return '\1' (arbitrary nonzero). static inline char EatAChar(const char** text, int* len, const char* acceptable_chars, bool update, bool null_ok) { assert(!(update && null_ok)); if ((*len == 0) || (**text == '\0')) return (null_ok ? '\1' : '\0'); // if null_ok, we're in predicate mode. if (strchr(acceptable_chars, **text)) { char result = **text; if (update) { ++(*text); if (*len != -1) --(*len); } return result; } return '\0'; // no match; no update } // Parse an expression in 'text' of the form: <comparator><double> or // <double><sep><double> See full comments in header file. bool ParseDoubleRange(const char* text, int len, const char** end, double* from, double* to, bool* is_currency, const DoubleRangeOptions& opts) { const double from_default = opts.dont_modify_unbounded ? *from : -HUGE_VAL; if (!opts.dont_modify_unbounded) { *from = -HUGE_VAL; *to = HUGE_VAL; } if (opts.allow_currency && (is_currency != nullptr)) *is_currency = false; assert(len >= -1); assert(opts.separators && (*opts.separators != '\0')); // these aren't valid separators assert(strlen(opts.separators) == strcspn(opts.separators, "+0123456789eE$")); assert(opts.num_required_bounds <= 2); // Handle easier cases of comparators (<, >) first if (opts.allow_comparators) { char comparator = EatAChar(&text, &len, "<>", true, false); if (comparator) { double* dest = (comparator == '>') ? from : to; EatAChar(&text, &len, "=", true, false); if (opts.allow_currency && EatAChar(&text, &len, "$", true, false)) if (is_currency != nullptr) *is_currency = true; if (!EatADouble(&text, &len, opts.allow_unbounded_markers, dest, nullptr, nullptr)) return false; *end = text; return EatAChar(&text, &len, opts.acceptable_terminators, false, opts.null_terminator_ok); } } bool seen_dollar = (opts.allow_currency && EatAChar(&text, &len, "$", true, false)); // If we see a '-', two things could be happening: -<to> or // <from>... where <from> is negative. Treat initial minus sign as a // separator if '-' is a valid separator. // Similarly, we prepare for the possibility of seeing a '.' at the // end of the number, in case '.' (which really means '..') is a // separator. bool initial_minus_sign = false; bool final_period = false; bool* check_initial_minus = (strchr(opts.separators, '-') && !seen_dollar && (opts.num_required_bounds < 2)) ? (&initial_minus_sign) : nullptr; bool* check_final_period = strchr(opts.separators, '.') ? (&final_period) : nullptr; bool double_seen = EatADouble(&text, &len, opts.allow_unbounded_markers, from, check_initial_minus, check_final_period); // if 2 bounds required, must see a double (or '?' if allowed) if ((opts.num_required_bounds == 2) && !double_seen) return false; if (seen_dollar && !double_seen) { --text; if (len != -1) ++len; seen_dollar = false; } // If we're here, we've read the first double and now expect a // separator and another <double>. char separator = EatAChar(&text, &len, opts.separators, true, false); if (separator == '.') { // seen one '.' as separator; must check for another; perhaps set seplen=2 if (EatAChar(&text, &len, ".", true, false)) { if (final_period) { // We may have three periods in a row. The first is part of the // first number, the others are a separator. Policy: 234...567 // is "234." to "567", not "234" to ".567". EatAChar(&text, &len, ".", true, false); } } else if (!EatAChar(&text, &len, opts.separators, true, false)) { // just one '.' and no other separator; uneat the first '.' we saw --text; if (len != -1) ++len; separator = '\0'; } } // By now, we've consumed whatever separator there may have been, // and separator is true iff there was one. if (!separator) { if (final_period) // final period now considered part of first double EatAChar(&text, &len, ".", true, false); if (initial_minus_sign && double_seen) { *to = *from; *from = from_default; } else if (opts.require_separator || (opts.num_required_bounds > 0 && !double_seen) || (opts.num_required_bounds > 1) ) { return false; } } else { if (initial_minus_sign && double_seen) *from = -(*from); // read second <double> bool second_dollar_seen = (seen_dollar || (opts.allow_currency && !double_seen)) && EatAChar(&text, &len, "$", true, false); bool second_double_seen = EatADouble( &text, &len, opts.allow_unbounded_markers, to, nullptr, nullptr); if (opts.num_required_bounds > double_seen + second_double_seen) return false; if (second_dollar_seen && !second_double_seen) { --text; if (len != -1) ++len; second_dollar_seen = false; } seen_dollar = seen_dollar || second_dollar_seen; } if (seen_dollar && (is_currency != nullptr)) *is_currency = true; // We're done. But we have to check that the next char is a proper // terminator. *end = text; char terminator = EatAChar(&text, &len, opts.acceptable_terminators, false, opts.null_terminator_ok); if (terminator == '.') --(*end); return terminator; } // ---------------------------------------------------------------------- // ConsumeStrayLeadingZeroes // Eliminates all leading zeroes (unless the string itself is composed // of nothing but zeroes, in which case one is kept: 0...0 becomes 0). // -------------------------------------------------------------------- void ConsumeStrayLeadingZeroes(string *const str) { const string::size_type len(str->size()); if (len > 1 && (*str)[0] == '0') { const char *const begin(str->c_str()), *const end(begin + len), *ptr(begin + 1); while (ptr != end && *ptr == '0') { ++ptr; } string::size_type remove(ptr - begin); DCHECK_GT(ptr, begin); if (remove == len) { --remove; // if they are all zero, leave one... } str->erase(0, remove); } } // ---------------------------------------------------------------------- // ParseLeadingInt32Value() // ParseLeadingUInt32Value() // A simple parser for [u]int32 values. Returns the parsed value // if a valid value is found; else returns deflt // This cannot handle decimal numbers with leading 0s. // -------------------------------------------------------------------- int32 ParseLeadingInt32Value(const char *str, int32 deflt) { char *error = nullptr; long value = strtol(str, &error, 0); // Limit long values to int32 min/max. Needed for lp64; no-op on 32 bits. if (value > numeric_limits<int32>::max()) { value = numeric_limits<int32>::max(); } else if (value < numeric_limits<int32>::min()) { value = numeric_limits<int32>::min(); } return (error == str) ? deflt : value; } uint32 ParseLeadingUInt32Value(const char *str, uint32 deflt) { if (numeric_limits<unsigned long>::max() == numeric_limits<uint32>::max()) { // When long is 32 bits, we can use strtoul. char *error = nullptr; const uint32 value = strtoul(str, &error, 0); return (error == str) ? deflt : value; } else { // When long is 64 bits, we must use strto64 and handle limits // by hand. The reason we cannot use a 64-bit strtoul is that // it would be impossible to differentiate "-2" (that should wrap // around to the value UINT_MAX-1) from a string with ULONG_MAX-1 // (that should be pegged to UINT_MAX due to overflow). char *error = nullptr; int64 value = strto64(str, &error, 0); if (value > numeric_limits<uint32>::max() || value < -static_cast<int64>(numeric_limits<uint32>::max())) { value = numeric_limits<uint32>::max(); } // Within these limits, truncation to 32 bits handles negatives correctly. return (error == str) ? deflt : value; } } // ---------------------------------------------------------------------- // ParseLeadingDec32Value // ParseLeadingUDec32Value // A simple parser for [u]int32 values. Returns the parsed value // if a valid value is found; else returns deflt // The string passed in is treated as *10 based*. // This can handle strings with leading 0s. // -------------------------------------------------------------------- int32 ParseLeadingDec32Value(const char *str, int32 deflt) { char *error = nullptr; long value = strtol(str, &error, 10); // Limit long values to int32 min/max. Needed for lp64; no-op on 32 bits. if (value > numeric_limits<int32>::max()) { value = numeric_limits<int32>::max(); } else if (value < numeric_limits<int32>::min()) { value = numeric_limits<int32>::min(); } return (error == str) ? deflt : value; } uint32 ParseLeadingUDec32Value(const char *str, uint32 deflt) { if (numeric_limits<unsigned long>::max() == numeric_limits<uint32>::max()) { // When long is 32 bits, we can use strtoul. char *error = nullptr; const uint32 value = strtoul(str, &error, 10); return (error == str) ? deflt : value; } else { // When long is 64 bits, we must use strto64 and handle limits // by hand. The reason we cannot use a 64-bit strtoul is that // it would be impossible to differentiate "-2" (that should wrap // around to the value UINT_MAX-1) from a string with ULONG_MAX-1 // (that should be pegged to UINT_MAX due to overflow). char *error = nullptr; int64 value = strto64(str, &error, 10); if (value > numeric_limits<uint32>::max() || value < -static_cast<int64>(numeric_limits<uint32>::max())) { value = numeric_limits<uint32>::max(); } // Within these limits, truncation to 32 bits handles negatives correctly. return (error == str) ? deflt : value; } } // ---------------------------------------------------------------------- // ParseLeadingUInt64Value // ParseLeadingInt64Value // ParseLeadingHex64Value // A simple parser for 64-bit values. Returns the parsed value if a // valid integer is found; else returns deflt // UInt64 and Int64 cannot handle decimal numbers with leading 0s. // -------------------------------------------------------------------- uint64 ParseLeadingUInt64Value(const char *str, uint64 deflt) { char *error = nullptr; const uint64 value = strtou64(str, &error, 0); return (error == str) ? deflt : value; } int64 ParseLeadingInt64Value(const char *str, int64 deflt) { char *error = nullptr; const int64 value = strto64(str, &error, 0); return (error == str) ? deflt : value; } uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) { char *error = nullptr; const uint64 value = strtou64(str, &error, 16); return (error == str) ? deflt : value; } // ---------------------------------------------------------------------- // ParseLeadingDec64Value // ParseLeadingUDec64Value // A simple parser for [u]int64 values. Returns the parsed value // if a valid value is found; else returns deflt // The string passed in is treated as *10 based*. // This can handle strings with leading 0s. // -------------------------------------------------------------------- int64 ParseLeadingDec64Value(const char *str, int64 deflt) { char *error = nullptr; const int64 value = strto64(str, &error, 10); return (error == str) ? deflt : value; } uint64 ParseLeadingUDec64Value(const char *str, uint64 deflt) { char *error = nullptr; const uint64 value = strtou64(str, &error, 10); return (error == str) ? deflt : value; } // ---------------------------------------------------------------------- // ParseLeadingDoubleValue() // A simple parser for double values. Returns the parsed value // if a valid value is found; else returns deflt // -------------------------------------------------------------------- double ParseLeadingDoubleValue(const char *str, double deflt) { char *error = nullptr; errno = 0; const double value = strtod(str, &error); if (errno != 0 || // overflow/underflow happened error == str) { // no valid parse return deflt; } else { return value; } } // ---------------------------------------------------------------------- // ParseLeadingBoolValue() // A recognizer of boolean string values. Returns the parsed value // if a valid value is found; else returns deflt. This skips leading // whitespace, is case insensitive, and recognizes these forms: // 0/1, false/true, no/yes, n/y // -------------------------------------------------------------------- bool ParseLeadingBoolValue(const char *str, bool deflt) { static const int kMaxLen = 5; char value[kMaxLen + 1]; // Skip whitespace while (ascii_isspace(*str)) { ++str; } int len = 0; for (; len <= kMaxLen && ascii_isalnum(*str); ++str) value[len++] = ascii_tolower(*str); if (len == 0 || len > kMaxLen) return deflt; value[len] = '\0'; switch (len) { case 1: if (value[0] == '0' || value[0] == 'n') return false; if (value[0] == '1' || value[0] == 'y') return true; break; case 2: if (!strcmp(value, "no")) return false; break; case 3: if (!strcmp(value, "yes")) return true; break; case 4: if (!strcmp(value, "true")) return true; break; case 5: if (!strcmp(value, "false")) return false; break; } return deflt; } // ---------------------------------------------------------------------- // FpToString() // FloatToString() // IntToString() // Convert various types to their string representation, possibly padded // with spaces, using snprintf format specifiers. // ---------------------------------------------------------------------- string FpToString(Fprint fp) { char buf[17]; snprintf(buf, sizeof(buf), "%016" PRIx64, fp); return string(buf); } // Default arguments string Uint128ToHexString(uint128 ui128) { char buf[33]; snprintf(buf, sizeof(buf), "%016" PRIx64, Uint128High64(ui128)); snprintf(buf + 16, sizeof(buf) - 16, "%016" PRIx64, Uint128Low64(ui128)); return string(buf); } namespace { // Represents integer values of digits. // Uses 36 to indicate an invalid character since we support // bases up to 36. static const int8 kAsciiToInt[256] = { 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 16 36s. 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 36, 36, 36, 36, 36, 36, 36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36, 36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36 }; // Input format based on POSIX.1-2008 strtol // http://pubs.opengroup.org/onlinepubs/9699919799/functions/strtol.html template<typename IntType> bool safe_int_internal(const char* start, const char* end, int base, IntType* value_p) { // Consume whitespace. while (start < end && ascii_isspace(start[0])) { ++start; } while (start < end && ascii_isspace(end[-1])) { --end; } if (start >= end) { return false; } // Consume sign. const bool negative = (start[0] == '-'); if (negative || start[0] == '+') { ++start; if (start >= end) { return false; } } // Consume base-dependent prefix. // base 0: "0x" -> base 16, "0" -> base 8, default -> base 10 // base 16: "0x" -> base 16 // Also validate the base. if (base == 0) { if (end - start >= 2 && start[0] == '0' && (start[1] == 'x' || start[1] == 'X')) { base = 16; start += 2; } else if (end - start >= 1 && start[0] == '0') { base = 8; start += 1; } else { base = 10; } } else if (base == 16) { if (end - start >= 2 && start[0] == '0' && (start[1] == 'x' || start[1] == 'X')) { start += 2; } } else if (base >= 2 && base <= 36) { // okay } else { return false; } // Consume digits. // // The classic loop: // // for each digit // value = value * base + digit // value *= sign // // The classic loop needs overflow checking. It also fails on the most // negative integer, -2147483648 in 32-bit two's complement representation. // // My improved loop: // // if (!negative) // for each digit // value = value * base // value = value + digit // else // for each digit // value = value * base // value = value - digit // // Overflow checking becomes simple. // // I present the positive code first for easier reading. IntType value = 0; if (!negative) { const IntType vmax = std::numeric_limits<IntType>::max(); assert(vmax > 0); assert(vmax >= base); const IntType vmax_over_base = vmax / base; // loop over digits // loop body is interleaved for perf, not readability for (; start < end; ++start) { unsigned char c = static_cast<unsigned char>(start[0]); int digit = kAsciiToInt[c]; if (value > vmax_over_base) return false; value *= base; if (digit >= base) return false; if (value > vmax - digit) return false; value += digit; } } else { const IntType vmin = std::numeric_limits<IntType>::min(); assert(vmin < 0); assert(vmin <= 0 - base); IntType vmin_over_base = vmin / base; // 2003 c++ standard [expr.mul] // "... the sign of the remainder is implementation-defined." // Although (vmin/base)*base + vmin%base is always vmin. // 2011 c++ standard tightens the spec but we cannot rely on it. if (vmin % base > 0) { vmin_over_base += 1; } // loop over digits // loop body is interleaved for perf, not readability for (; start < end; ++start) { unsigned char c = static_cast<unsigned char>(start[0]); int digit = kAsciiToInt[c]; if (value < vmin_over_base) return false; value *= base; if (digit >= base) return false; if (value < vmin + digit) return false; value -= digit; } } // Store output. *value_p = value; return true; } } // anonymous namespace bool safe_strto32_base(const char* startptr, const int buffer_size, int32* v, int base) { return safe_int_internal<int32>(startptr, startptr + buffer_size, base, v); } bool safe_strto64_base(const char* startptr, const int buffer_size, int64* v, int base) { return safe_int_internal<int64>(startptr, startptr + buffer_size, base, v); } bool safe_strto32(const char* startptr, const int buffer_size, int32* value) { return safe_int_internal<int32>(startptr, startptr + buffer_size, 10, value); } bool safe_strto64(const char* startptr, const int buffer_size, int64* value) { return safe_int_internal<int64>(startptr, startptr + buffer_size, 10, value); } bool safe_strto32_base(const char* str, int32* value, int base) { char* endptr; errno = 0; // errno only gets set on errors *value = strto32(str, &endptr, base); if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } return *str != '\0' && *endptr == '\0' && errno == 0; } bool safe_strto64_base(const char* str, int64* value, int base) { char* endptr; errno = 0; // errno only gets set on errors *value = strto64(str, &endptr, base); if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } return *str != '\0' && *endptr == '\0' && errno == 0; } bool safe_strtou32_base(const char* str, uint32* value, int base) { // strtoul does not give any errors on negative numbers, so we have to // search the string for '-' manually. while (ascii_isspace(*str)) ++str; if (*str == '-') return false; char* endptr; errno = 0; // errno only gets set on errors *value = strtou32(str, &endptr, base); if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } return *str != '\0' && *endptr == '\0' && errno == 0; } bool safe_strtou64_base(const char* str, uint64* value, int base) { // strtou64 does not give any errors on negative numbers, so we have to // search the string for '-' manually. while (ascii_isspace(*str)) ++str; if (*str == '-') return false; char* endptr; errno = 0; // errno only gets set on errors *value = strtou64(str, &endptr, base); if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } return *str != '\0' && *endptr == '\0' && errno == 0; } // ---------------------------------------------------------------------- // u64tostr_base36() // Converts unsigned number to string representation in base-36. // -------------------------------------------------------------------- size_t u64tostr_base36(uint64 number, size_t buf_size, char* buffer) { CHECK_GT(buf_size, 0); CHECK(buffer); static const char kAlphabet[] = "0123456789abcdefghijklmnopqrstuvwxyz"; buffer[buf_size - 1] = '\0'; size_t result_size = 1; do { if (buf_size == result_size) { // Ran out of space. return 0; } int remainder = number % 36; number /= 36; buffer[buf_size - result_size - 1] = kAlphabet[remainder]; result_size++; } while (number); memmove(buffer, buffer + buf_size - result_size, result_size); return result_size - 1; } // Generate functions that wrap safe_strtoXXX_base. #define GEN_SAFE_STRTO(name, type) \ bool name##_base(const string& str, type* value, int base) { \ return name##_base(str.c_str(), value, base); \ } \ bool name(const char* str, type* value) { \ return name##_base(str, value, 10); \ } \ bool name(const string& str, type* value) { \ return name##_base(str.c_str(), value, 10); \ } GEN_SAFE_STRTO(safe_strto32, int32); GEN_SAFE_STRTO(safe_strtou32, uint32); GEN_SAFE_STRTO(safe_strto64, int64); GEN_SAFE_STRTO(safe_strtou64, uint64); #undef GEN_SAFE_STRTO bool safe_strtof(const char* str, float* value) { char* endptr; #ifdef _MSC_VER // has no strtof() *value = strtod(str, &endptr); #else *value = strtof(str, &endptr); #endif if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } // Ignore range errors from strtod/strtof. // The values it returns on underflow and // overflow are the right fallback in a // robust setting. return *str != '\0' && *endptr == '\0'; } bool safe_strtod(const char* str, double* value) { char* endptr; *value = strtod(str, &endptr); if (endptr != str) { while (ascii_isspace(*endptr)) ++endptr; } // Ignore range errors from strtod. The values it // returns on underflow and overflow are the right // fallback in a robust setting. return *str != '\0' && *endptr == '\0'; } bool safe_strtof(const string& str, float* value) { return safe_strtof(str.c_str(), value); } bool safe_strtod(const string& str, double* value) { return safe_strtod(str.c_str(), value); } uint64 atoi_kmgt(const char* s) { char* endptr; uint64 n = strtou64(s, &endptr, 10); uint64 scale = 1; char c = *endptr; if (c != '\0') { c = ascii_toupper(c); switch (c) { case 'K': scale = GG_ULONGLONG(1) << 10; break; case 'M': scale = GG_ULONGLONG(1) << 20; break; case 'G': scale = GG_ULONGLONG(1) << 30; break; case 'T': scale = GG_ULONGLONG(1) << 40; break; default: LOG(FATAL) << "Invalid mnemonic: `" << c << "';" << " should be one of `K', `M', `G', and `T'."; } } return n * scale; } // ---------------------------------------------------------------------- // FastIntToBuffer() // FastInt64ToBuffer() // FastHexToBuffer() // FastHex64ToBuffer() // FastHex32ToBuffer() // FastTimeToBuffer() // These are intended for speed. FastHexToBuffer() assumes the // integer is non-negative. FastHexToBuffer() puts output in // hex rather than decimal. FastTimeToBuffer() puts the output // into RFC822 format. If time is 0, uses the current time. // // FastHex64ToBuffer() puts a 64-bit unsigned value in hex-format, // padded to exactly 16 bytes (plus one byte for '\0') // // FastHex32ToBuffer() puts a 32-bit unsigned value in hex-format, // padded to exactly 8 bytes (plus one byte for '\0') // // All functions take the output buffer as an arg. FastInt() // uses at most 22 bytes, FastTime() uses exactly 30 bytes. // They all return a pointer to the beginning of the output, // which may not be the beginning of the input buffer. (Though // for FastTimeToBuffer(), we guarantee that it is.) // ---------------------------------------------------------------------- char *FastInt64ToBuffer(int64 i, char* buffer) { FastInt64ToBufferLeft(i, buffer); return buffer; } char *FastInt32ToBuffer(int32 i, char* buffer) { FastInt32ToBufferLeft(i, buffer); return buffer; } char *FastHexToBuffer(int i, char* buffer) { CHECK_GE(i, 0) << "FastHexToBuffer() wants non-negative integers, not " << i; static const char *hexdigits = "0123456789abcdef"; char *p = buffer + 21; *p-- = '\0'; do { *p-- = hexdigits[i & 15]; // mod by 16 i >>= 4; // divide by 16 } while (i > 0); return p + 1; } char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) { static const char *hexdigits = "0123456789abcdef"; buffer[num_byte] = '\0'; for (int i = num_byte - 1; i >= 0; i--) { buffer[i] = hexdigits[value & 0xf]; value >>= 4; } return buffer; } char *FastHex64ToBuffer(uint64 value, char* buffer) { return InternalFastHexToBuffer(value, buffer, 16); } char *FastHex32ToBuffer(uint32 value, char* buffer) { return InternalFastHexToBuffer(value, buffer, 8); } // TODO(user): revisit the two_ASCII_digits optimization. // // Several converters use this table to reduce // division and modulo operations. extern const char two_ASCII_digits[100][2]; // from strutil.cc // ---------------------------------------------------------------------- // FastInt32ToBufferLeft() // FastUInt32ToBufferLeft() // FastInt64ToBufferLeft() // FastUInt64ToBufferLeft() // // Like the Fast*ToBuffer() functions above, these are intended for speed. // Unlike the Fast*ToBuffer() functions, however, these functions write // their output to the beginning of the buffer (hence the name, as the // output is left-aligned). The caller is responsible for ensuring that // the buffer has enough space to hold the output. // // Returns a pointer to the end of the string (i.e. the null character // terminating the string). // ---------------------------------------------------------------------- char* FastUInt32ToBufferLeft(uint32 u, char* buffer) { uint digits; const char *ASCII_digits = nullptr; // The idea of this implementation is to trim the number of divides to as few // as possible by using multiplication and subtraction rather than mod (%), // and by outputting two digits at a time rather than one. // The huge-number case is first, in the hopes that the compiler will output // that case in one branch-free block of code, and only output conditional // branches into it from below. if (u >= 1000000000) { // >= 1,000,000,000 digits = u / 100000000; // 100,000,000 ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; sublt100_000_000: u -= digits * 100000000; // 100,000,000 lt100_000_000: digits = u / 1000000; // 1,000,000 ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; sublt1_000_000: u -= digits * 1000000; // 1,000,000 lt1_000_000: digits = u / 10000; // 10,000 ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; sublt10_000: u -= digits * 10000; // 10,000 lt10_000: digits = u / 100; ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; sublt100: u -= digits * 100; lt100: digits = u; ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; done: *buffer = 0; return buffer; } if (u < 100) { digits = u; if (u >= 10) goto lt100; *buffer++ = '0' + digits; goto done; } if (u < 10000) { // 10,000 if (u >= 1000) goto lt10_000; digits = u / 100; *buffer++ = '0' + digits; goto sublt100; } if (u < 1000000) { // 1,000,000 if (u >= 100000) goto lt1_000_000; digits = u / 10000; // 10,000 *buffer++ = '0' + digits; goto sublt10_000; } if (u < 100000000) { // 100,000,000 if (u >= 10000000) goto lt100_000_000; digits = u / 1000000; // 1,000,000 *buffer++ = '0' + digits; goto sublt1_000_000; } // we already know that u < 1,000,000,000 digits = u / 100000000; // 100,000,000 *buffer++ = '0' + digits; goto sublt100_000_000; } char* FastInt32ToBufferLeft(int32 i, char* buffer) { uint32 u = i; if (i < 0) { *buffer++ = '-'; u = ~u + 1; } return FastUInt32ToBufferLeft(u, buffer); } char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) { uint digits; const char *ASCII_digits = nullptr; uint32 u = static_cast<uint32>(u64); if (u == u64) return FastUInt32ToBufferLeft(u, buffer); uint64 top_11_digits = u64 / 1000000000; buffer = FastUInt64ToBufferLeft(top_11_digits, buffer); u = u64 - (top_11_digits * 1000000000); digits = u / 10000000; // 10,000,000 DCHECK_LT(digits, 100); ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; u -= digits * 10000000; // 10,000,000 digits = u / 100000; // 100,000 ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; u -= digits * 100000; // 100,000 digits = u / 1000; // 1,000 ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; u -= digits * 1000; // 1,000 digits = u / 10; ASCII_digits = two_ASCII_digits[digits]; buffer[0] = ASCII_digits[0]; buffer[1] = ASCII_digits[1]; buffer += 2; u -= digits * 10; digits = u; *buffer++ = '0' + digits; *buffer = 0; return buffer; } char* FastInt64ToBufferLeft(int64 i, char* buffer) { uint64 u = i; if (i < 0) { *buffer++ = '-'; u = ~u + 1; } return FastUInt64ToBufferLeft(u, buffer); } char* FastUInt128ToBufferLeft(unsigned __int128 i, char* buffer) { static const unsigned __int128 TWENTY_DIGITS = static_cast<unsigned __int128>(10000000000) * static_cast<unsigned __int128>(10000000000); uint64 u = static_cast<uint64>(i); if (u == i) return FastUInt64ToBufferLeft(u, buffer); unsigned __int128 top_19_digits = i / TWENTY_DIGITS; buffer = FastUInt64ToBufferLeft(top_19_digits, buffer); unsigned __int128 rem128 = i - (top_19_digits * TWENTY_DIGITS); unsigned __int128 middle_19_digits = rem128 / 10; buffer = FastUInt64ToBufferLeft(middle_19_digits, buffer); u = rem128 - (middle_19_digits * 10); buffer = FastUInt32ToBufferLeft(u, buffer); return buffer; } char* FastInt128ToBufferLeft(__int128 i, char* buffer) { unsigned __int128 u = i; if (i < 0) { *buffer++ = '-'; u = ~u + 1; } return FastUInt128ToBufferLeft(u, buffer); } int HexDigitsPrefix(const char* buf, int num_digits) { for (int i = 0; i < num_digits; i++) if (!ascii_isxdigit(buf[i])) return 0; // This also detects end of string as '\0' is not xdigit. return 1; } // ---------------------------------------------------------------------- // AutoDigitStrCmp // AutoDigitLessThan // StrictAutoDigitLessThan // autodigit_less // autodigit_greater // strict_autodigit_less // strict_autodigit_greater // These are like less<string> and greater<string>, except when a // run of digits is encountered at corresponding points in the two // arguments. Such digit strings are compared numerically instead // of lexicographically. Therefore if you sort by // "autodigit_less", some machine names might get sorted as: // exaf1 // exaf2 // exaf10 // When using "strict" comparison (AutoDigitStrCmp with the strict flag // set to true, or the strict version of the other functions), // strings that represent equal numbers will not be considered equal if // the string representations are not identical. That is, "01" < "1" in // strict mode, but "01" == "1" otherwise. // ---------------------------------------------------------------------- int AutoDigitStrCmp(const char* a, int alen, const char* b, int blen, bool strict) { int aindex = 0; int bindex = 0; while ((aindex < alen) && (bindex < blen)) { if (isdigit(a[aindex]) && isdigit(b[bindex])) { // Compare runs of digits. Instead of extracting numbers, we // just skip leading zeroes, and then get the run-lengths. This // allows us to handle arbitrary precision numbers. We remember // how many zeroes we found so that we can differentiate between // "1" and "01" in strict mode. // Skip leading zeroes, but remember how many we found int azeroes = aindex; int bzeroes = bindex; while ((aindex < alen) && (a[aindex] == '0')) aindex++; while ((bindex < blen) && (b[bindex] == '0')) bindex++; azeroes = aindex - azeroes; bzeroes = bindex - bzeroes; // Count digit lengths int astart = aindex; int bstart = bindex; while ((aindex < alen) && isdigit(a[aindex])) aindex++; while ((bindex < blen) && isdigit(b[bindex])) bindex++; if (aindex - astart < bindex - bstart) { // a has shorter run of digits: so smaller return -1; } else if (aindex - astart > bindex - bstart) { // a has longer run of digits: so larger return 1; } else { // Same lengths, so compare digit by digit for (int i = 0; i < aindex-astart; i++) { if (a[astart+i] < b[bstart+i]) { return -1; } else if (a[astart+i] > b[bstart+i]) { return 1; } } // Equal: did one have more leading zeroes? if (strict && azeroes != bzeroes) { if (azeroes > bzeroes) { // a has more leading zeroes: a < b return -1; } else { // b has more leading zeroes: a > b return 1; } } // Equal: so continue scanning } } else if (a[aindex] < b[bindex]) { return -1; } else if (a[aindex] > b[bindex]) { return 1; } else { aindex++; bindex++; } } if (aindex < alen) { // b is prefix of a return 1; } else if (bindex < blen) { // a is prefix of b return -1; } else { // a is equal to b return 0; } } bool AutoDigitLessThan(const char* a, int alen, const char* b, int blen) { return AutoDigitStrCmp(a, alen, b, blen, false) < 0; } bool StrictAutoDigitLessThan(const char* a, int alen, const char* b, int blen) { return AutoDigitStrCmp(a, alen, b, blen, true) < 0; } // ---------------------------------------------------------------------- // SimpleDtoa() // SimpleFtoa() // DoubleToBuffer() // FloatToBuffer() // We want to print the value without losing precision, but we also do // not want to print more digits than necessary. This turns out to be // trickier than it sounds. Numbers like 0.2 cannot be represented // exactly in binary. If we print 0.2 with a very large precision, // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". // On the other hand, if we set the precision too low, we lose // significant digits when printing numbers that actually need them. // It turns out there is no precision value that does the right thing // for all numbers. // // Our strategy is to first try printing with a precision that is never // over-precise, then parse the result with strtod() to see if it // matches. If not, we print again with a precision that will always // give a precise result, but may use more digits than necessary. // // An arguably better strategy would be to use the algorithm described // in "How to Print Floating-Point Numbers Accurately" by Steele & // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, // however, that the following implementation is about as fast as // DMG's code. Furthermore, DMG's code locks mutexes, which means it // will not scale well on multi-core machines. DMG's code is slightly // more accurate (in that it will never use more digits than // necessary), but this is probably irrelevant for most users. // // Rob Pike and Ken Thompson also have an implementation of dtoa() in // third_party/fmt/fltfmt.cc. Their implementation is similar to this // one in that it makes guesses and then uses strtod() to check them. // Their implementation is faster because they use their own code to // generate the digits in the first place rather than use snprintf(), // thus avoiding format string parsing overhead. However, this makes // it considerably more complicated than the following implementation, // and it is embedded in a larger library. If speed turns out to be // an issue, we could re-implement this in terms of their // implementation. // ---------------------------------------------------------------------- string SimpleDtoa(double value) { char buffer[kDoubleToBufferSize]; return DoubleToBuffer(value, buffer); } string SimpleFtoa(float value) { char buffer[kFloatToBufferSize]; return FloatToBuffer(value, buffer); } char* DoubleToBuffer(double value, char* buffer) { // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all // platforms these days. Just in case some system exists where DBL_DIG // is significantly larger -- and risks overflowing our buffer -- we have // this assert. COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big); int snprintf_result = snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value); // The snprintf should never overflow because the buffer is significantly // larger than the precision we asked for. DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); if (strtod(buffer, nullptr) != value) { snprintf_result = snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value); // Should never overflow; see above. DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); } return buffer; } char* FloatToBuffer(float value, char* buffer) { // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all // platforms these days. Just in case some system exists where FLT_DIG // is significantly larger -- and risks overflowing our buffer -- we have // this assert. COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big); int snprintf_result = snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value); // The snprintf should never overflow because the buffer is significantly // larger than the precision we asked for. DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); float parsed_value; if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { snprintf_result = snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value); // Should never overflow; see above. DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); } return buffer; } // ---------------------------------------------------------------------- // SimpleItoaWithCommas() // Description: converts an integer to a string. // Puts commas every 3 spaces. // Faster than printf("%d")? // // Return value: string // ---------------------------------------------------------------------- string SimpleItoaWithCommas(int32 i) { // 10 digits, 3 commas, and sign are good for 32-bit or smaller ints. // Longest is -2,147,483,648. char local[14]; char *p = local + sizeof(local); // Need to use uint32 instead of int32 to correctly handle // -2,147,483,648. uint32 n = i; if (i < 0) n = 0 - n; // negate the unsigned value to avoid overflow *--p = '0' + n % 10; // this case deals with the number "0" n /= 10; while (n) { *--p = '0' + n % 10; n /= 10; if (n == 0) break; *--p = '0' + n % 10; n /= 10; if (n == 0) break; *--p = ','; *--p = '0' + n % 10; n /= 10; // For this unrolling, we check if n == 0 in the main while loop } if (i < 0) *--p = '-'; return string(p, local + sizeof(local)); } // We need this overload because otherwise SimpleItoaWithCommas(5U) wouldn't // compile. string SimpleItoaWithCommas(uint32 i) { // 10 digits and 3 commas are good for 32-bit or smaller ints. // Longest is 4,294,967,295. char local[13]; char *p = local + sizeof(local); *--p = '0' + i % 10; // this case deals with the number "0" i /= 10; while (i) { *--p = '0' + i % 10; i /= 10; if (i == 0) break; *--p = '0' + i % 10; i /= 10; if (i == 0) break; *--p = ','; *--p = '0' + i % 10; i /= 10; // For this unrolling, we check if i == 0 in the main while loop } return string(p, local + sizeof(local)); } string SimpleItoaWithCommas(int64 i) { // 19 digits, 6 commas, and sign are good for 64-bit or smaller ints. char local[26]; char *p = local + sizeof(local); // Need to use uint64 instead of int64 to correctly handle // -9,223,372,036,854,775,808. uint64 n = i; if (i < 0) n = 0 - n; *--p = '0' + n % 10; // this case deals with the number "0" n /= 10; while (n) { *--p = '0' + n % 10; n /= 10; if (n == 0) break; *--p = '0' + n % 10; n /= 10; if (n == 0) break; *--p = ','; *--p = '0' + n % 10; n /= 10; // For this unrolling, we check if n == 0 in the main while loop } if (i < 0) *--p = '-'; return string(p, local + sizeof(local)); } // We need this overload because otherwise SimpleItoaWithCommas(5ULL) wouldn't // compile. string SimpleItoaWithCommas(uint64 i) { // 20 digits and 6 commas are good for 64-bit or smaller ints. // Longest is 18,446,744,073,709,551,615. char local[26]; char *p = local + sizeof(local); *--p = '0' + i % 10; // this case deals with the number "0" i /= 10; while (i) { *--p = '0' + i % 10; i /= 10; if (i == 0) break; *--p = '0' + i % 10; i /= 10; if (i == 0) break; *--p = ','; *--p = '0' + i % 10; i /= 10; // For this unrolling, we check if i == 0 in the main while loop } return string(p, local + sizeof(local)); } // ---------------------------------------------------------------------- // ItoaKMGT() // Description: converts an integer to a string // Truncates values to a readable unit: K, G, M or T // Opposite of atoi_kmgt() // e.g. 100 -> "100" 1500 -> "1500" 4000 -> "3K" 57185920 -> "45M" // // Return value: string // ---------------------------------------------------------------------- string ItoaKMGT(int64 i) { const char *sign = "", *suffix = ""; if (i < 0) { // We lose some accuracy if the caller passes LONG_LONG_MIN, but // that's OK as this function is only for human readability if (i == numeric_limits<int64>::min()) i++; sign = "-"; i = -i; } int64 val; if ((val = (i >> 40)) > 1) { suffix = "T"; } else if ((val = (i >> 30)) > 1) { suffix = "G"; } else if ((val = (i >> 20)) > 1) { suffix = "M"; } else if ((val = (i >> 10)) > 1) { suffix = "K"; } else { val = i; } return StringPrintf("%s%" PRId64 "%s", sign, val, suffix); } // DEPRECATED(wadetregaskis). // These are non-inline because some BUILD files turn on -Wformat-non-literal. string FloatToString(float f, const char* format) { return StringPrintf(format, f); } string IntToString(int i, const char* format) { return StringPrintf(format, i); } string Int64ToString(int64 i64, const char* format) { return StringPrintf(format, i64); } string UInt64ToString(uint64 ui64, const char* format) { return StringPrintf(format, ui64); }

src/kudu/gutil/strings/numbers.cc (979 lines of code) (raw):