library/base/string_utilities.cpp (1,106 lines of code) (raw):
/*
* Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License, version 2.0,
* as published by the Free Software Foundation.
*
* This program is designed to work with certain software (including
* but not limited to OpenSSL) that is licensed under separate terms, as
* designated in a particular file or component or in included license
* documentation. The authors of MySQL hereby grant you an additional
* permission to link the program and your derivative works with the
* separately licensed software that they have either included with
* the program or referenced in the documentation.
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
* the GNU General Public License, version 2.0, for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "base/string_utilities.h"
#include "base/file_functions.h"
#include "base/log.h"
#include <stdexcept>
#include <functional>
#include <locale>
#include <algorithm>
#include <math.h>
#include <errno.h>
#include <string.h>
#include <fstream>
#include <boost/locale/encoding_utf.hpp>
DEFAULT_LOG_DOMAIN(DOMAIN_BASE);
namespace base {
#ifdef _MSC_VER
// Win uses C++11 with support for wstring_convert. Other platforms use boost for now.
//--------------------------------------------------------------------------------------------------
thread_local static std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf16Converter;
thread_local static std::wstring_convert<std::codecvt_utf8<__int32>, __int32> utf32Converter;
/**
* Converts an UTF-8 encoded string to an UTF-16 string.
*/
std::wstring string_to_wstring(const std::string &s) {
if (sizeof(wchar_t) > 2) {
auto utf32String = utf32Converter.from_bytes(s);
return std::wstring(utf32String.begin(), utf32String.end());
} else
return utf16Converter.from_bytes(s);
}
//--------------------------------------------------------------------------------------------------
/**
* Converts an UTF-16 encoded string to an UTF-8 string.
*/
std::string wstring_to_string(const std::wstring &s) {
if (sizeof(wchar_t) > 2)
return utf32Converter.to_bytes((__int32 *)s.c_str());
else
return utf16Converter.to_bytes(s);
}
//--------------------------------------------------------------------------------------------------
std::wstring path_from_utf8(const std::string &s) {
return string_to_wstring(s);
}
#else
using boost::locale::conv::utf_to_utf;
std::wstring string_to_wstring(const std::string &str) {
return utf_to_utf<wchar_t>(str.c_str(), str.c_str() + str.size());
}
//--------------------------------------------------------------------------------------------------
std::string wstring_to_string(const std::wstring &str) {
if (sizeof(wchar_t) > 2)
return utf_to_utf<char>((int32_t *)str.c_str(), (int32_t *)str.c_str() + str.size());
else
return utf_to_utf<char>(str.c_str(), str.c_str() + str.size());
}
//--------------------------------------------------------------------------------------------------
std::string path_from_utf8(const std::string &s) {
return s;
}
#endif
//--------------------------------------------------------------------------------------------------
std::string string_to_path_for_open(const std::string &s) {
// XXX: convert from utf-8 to wide string and then back to utf-8?
// How can this help in any way here?
#ifdef _MSC_VER
std::wstring ws = string_to_wstring(s);
int buflen = GetShortPathNameW(ws.c_str(), NULL, 0);
if (buflen > 0) {
wchar_t *buffer = g_new(wchar_t, buflen);
if (GetShortPathNameW(ws.c_str(), buffer, buflen) > 0) {
char *buffer2;
buflen = WideCharToMultiByte(CP_UTF8, 0, buffer, buflen, NULL, 0, 0, 0);
buffer2 = g_new(char, buflen);
if (WideCharToMultiByte(CP_UTF8, 0, buffer, buflen, buffer2, buflen, 0, 0) == 0) {
std::string path(buffer2);
g_free(buffer2);
g_free(buffer);
return path;
}
g_free(buffer2);
}
g_free(buffer);
}
return s;
#else
return s;
#endif
}
//--------------------------------------------------------------------------------------------------
inline bool is_invalid_filesystem_char(int ch) {
static const char invalids[] = "/?<>\\:*|\"^";
return memchr(invalids, ch, sizeof(invalids) - 1) != NULL;
}
std::string sanitize_file_name(const std::string &s) {
static const char *invalid_filenames[] = {"com1", "com2", "com3", "com4", "com5", "com6", "com7", "com8",
"com9", "lpt1", "lpt2", "lpt3", "lpt4", "lpt5", "lpt6", "lpt7",
"lpt8", "lpt9", "con", "nul", "prn", ".", "..", NULL};
std::string out;
for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
// utf-8 has the high-bit = 1, so we just copy those verbatim
if ((unsigned char)*c >= 128 || isalnum(*c) || (ispunct(*c) && !is_invalid_filesystem_char(*c)))
out.push_back(*c);
else
out.push_back('_');
}
// not valid under windows
if (!out.empty() && (out[out.size() - 1] == ' ' || out[out.size() - 1] == '.'))
out[out.size() - 1] = '_';
for (const char **fn = invalid_filenames; *fn; ++fn) {
if (strcmp(out.c_str(), *fn) == 0) {
out.append("_");
break;
}
}
return out;
}
//--------------------------------------------------------------------------------------------------
std::string trim_right(const std::string &s, const std::string &t) {
std::string d(s);
std::string::size_type i(d.find_last_not_of(t));
if (i == std::string::npos)
return "";
else
return d.erase(d.find_last_not_of(t) + 1);
}
//--------------------------------------------------------------------------------------------------
std::string trim_left(const std::string &s, const std::string &t) {
std::string d(s);
return d.erase(0, s.find_first_not_of(t));
}
//--------------------------------------------------------------------------------------------------
std::string trim(const std::string &s, const std::string &t) {
std::string d(s);
return trim_left(trim_right(d, t), t);
}
//--------------------------------------------------------------------------------------------------
/**
* Simple case conversion routine, which returns a new string.
* Note: converting to lower can be wrong when the returned string is used for string comparison,
* because in some cultures letter cases are more complicated. Use string_compare instead in such cases.
*/
std::string tolower(const std::string &s) {
char *str_down = g_utf8_strdown(s.c_str(), (gsize)s.length());
std::string result(str_down);
g_free(str_down);
return result;
}
//--------------------------------------------------------------------------------------------------
std::string toupper(const std::string &s) {
char *str_up = g_utf8_strup(s.c_str(), (gsize)s.length());
std::string result(str_up);
g_free(str_up);
return result;
}
//--------------------------------------------------------------------------------------------------
std::string truncate_text(const std::string &s, int max_length) {
if ((int)s.length() > max_length) {
std::string shortened(s.substr(0, max_length));
const char *prev = g_utf8_find_prev_char(shortened.c_str(), shortened.c_str() + (max_length - 1));
if (prev) {
shortened.resize(prev - shortened.c_str(), 0);
shortened.append("...");
}
return shortened;
}
return s;
}
//--------------------------------------------------------------------------------------------------
std::string sanitize_utf8(const std::string &s) {
const char *end = 0;
if (!g_utf8_validate(s.data(), (gsize)s.size(), &end))
return std::string(s.data(), end);
return s;
}
//--------------------------------------------------------------------------------------------------
std::vector<std::string> split(const std::string &s, const std::string &sep, int count) {
std::vector<std::string> parts;
std::string ss = s;
std::string::size_type p;
if (s.empty())
return parts;
if (count == 0)
count = -1;
p = ss.find(sep);
while (!ss.empty() && p != std::string::npos && (count < 0 || count > 0)) {
parts.push_back(ss.substr(0, p));
ss = ss.substr(p + sep.size());
--count;
p = ss.find(sep);
}
parts.push_back(ss);
return parts;
}
//--------------------------------------------------------------------------------------------------
std::vector<std::string> split_by_set(const std::string &s, const std::string &separator_set, int count) {
std::vector<std::string> parts;
std::string ss = s;
std::string::size_type p;
if (s.empty())
return parts;
if (count == 0)
count = -1;
p = ss.find_first_of(separator_set);
while (!ss.empty() && p != std::string::npos && (count < 0 || count > 0)) {
parts.push_back(ss.substr(0, p));
ss = ss.substr(p + 1);
--count;
p = ss.find_first_of(separator_set);
}
parts.push_back(ss);
return parts;
}
//--------------------------------------------------------------------------------------------------
static void findUntil(const char elem, const std::string &str, const int sep, std::string::size_type &p,
std::string::size_type &pe, std::string::size_type &end, std::vector<std::string> &parts) {
// keep going until we find closing '
while (pe < end) {
auto it = str[pe++];
if (it == elem) {
if (pe < end && str[pe] == elem)
pe++;
else
break;
} else if (it == '\\') {
if (pe < end)
pe++;
}
}
parts.push_back(str.substr(p, pe - p));
p = pe;
// skip whitespace
while (p < end && (str[p] == ' ' || str[p] == '\t' || str[p] == '\r' || str[p] == '\n'))
p++;
if (p < end) {
if (str[p] != sep)
logDebug("Error splitting string list\n");
else
p++;
}
}
std::vector<std::string> split_token_list(const std::string &s, int sep) {
std::vector<std::string> parts;
std::string ss = s;
std::string::size_type end = s.size(), pe, p = 0;
{
bool empty_pending = true;
while (p < end) {
empty_pending = false;
switch (s[p]) {
case '\'':
pe = p + 1;
findUntil('\'', s, sep, p, pe, end, parts);
break;
case '"':
pe = p + 1;
findUntil('"', s, sep, p, pe, end, parts);
break;
case ' ':
case '\t':
p++;
break;
default:
// skip until separator
pe = p;
while (pe < end) {
if (s[pe] == sep) {
empty_pending = true;
break;
}
pe++;
}
parts.push_back(trim_right(s.substr(p, pe - p)));
p = pe + 1;
// skip whitespace
while (p < end && (s[p] == ' ' || s[p] == '\t' || s[p] == '\r' || s[p] == '\n'))
p++;
break;
}
}
if (empty_pending)
parts.push_back("");
}
return parts;
}
//--------------------------------------------------------------------------------------------------
bool partition(const std::string &s, const std::string &sep, std::string &left, std::string &right) {
std::string::size_type p = s.find(sep);
if (p != std::string::npos) {
left = s.substr(0, p);
right = s.substr(p + sep.size());
return true;
}
left = s;
right = "";
return false;
}
//--------------------------------------------------------------------------------------------------
/**
* Returns the index of the given string in the given vector or -1 if not found.
*/
int index_of(const std::vector<std::string> &list, const std::string &s) {
std::vector<std::string>::const_iterator location = std::find(list.begin(), list.end(), s);
if (location == list.end())
return -1;
return (int)(location - list.begin());
}
//--------------------------------------------------------------------------------------------------
/**
* Returns a string containing all characters beginning at "start" from the given string "id", which form
* a valid, unqualified identifier. The returned identifier does not contain any quoting anymore.
* Note: this function is UTF-8 safe as it skips over all characters except some which are guaranteed
* not to be part of any valid UTF-8 sequence.
*
* @param id The string to examine.
* @param start The start position to search from.
*
* @result Returns the first found identifier starting at "start" or an empty string if nothing was
* found. Parameter "start" points to the first character after the found identifier.
*/
std::string get_identifier(const std::string &id, std::string::const_iterator &start) {
std::string::const_iterator token_end = id.end();
bool is_symbol_quoted = false;
for (std::string::const_iterator i = start, i_end = token_end; i != i_end; ++i) {
if (i_end != token_end)
break;
switch (*i) {
case '.':
if (!is_symbol_quoted)
token_end = i;
break;
case ' ':
if (!is_symbol_quoted)
token_end = i;
break;
case '\'':
case '"':
case '`':
if (*i == *start) {
if (i != start)
token_end = i + 1;
else
is_symbol_quoted = true;
}
break;
}
}
if (token_end - start < 2)
is_symbol_quoted = false;
std::string result(start, token_end);
start = token_end;
if (is_symbol_quoted)
return result.substr(1, result.size() - 2);
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Splits the given string into identifier parts assuming a format as allowed by the MySQL syntax for
* qualified identifiers, e.g. part1.part2.part3 (any of the parts might be quoted).
* In addition to the traditional syntax also these enhancements are supported:
* - Unlimited level of nesting.
* - Quoting might be done using single quotes, double quotes and back ticks.
*
* If an identifier is not separated by a dot from the rest of the input then this is considered
* invalid input and ignored. Only identifiers found until that syntax violation are returned.
*/
std::vector<std::string> split_qualified_identifier(const std::string &id) {
std::vector<std::string> result;
std::string::const_iterator iterator = id.begin();
std::string token;
do {
token = get_identifier(id, iterator);
if (token == "")
break;
result.push_back(token);
} while ((iterator != id.end()) && (*iterator++ == '.'));
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Removes the first path part from @path and returns this part as well as the shortend path.
*/
std::string pop_path_front(std::string &path) {
std::string::size_type p = path.find('/');
std::string res;
if (p == std::string::npos || p == path.length() - 1) {
res = path;
path.clear();
return res;
}
res = path.substr(0, p);
path = path.substr(p + 1);
return res;
}
//--------------------------------------------------------------------------------------------------
/**
* Removes the last path part from @path and returns this part as well as the shortend path.
*/
std::string pop_path_back(std::string &path) {
std::string::size_type p = path.rfind('/');
std::string res;
if (p == std::string::npos || p == path.length() - 1) {
res = path;
path.clear();
return res;
}
res = path.substr(p + 1);
path = path.substr(0, p);
return res;
}
//--------------------------------------------------------------------------------------------------
/**
* Helper routine to format a string into an STL string using the printf parameter syntax.
*/
std::string strfmt(const char *fmt, ...) {
va_list args;
char *tmp;
std::string ret;
va_start(args, fmt);
tmp = g_strdup_vprintf(fmt, args);
va_end(args);
ret = tmp;
g_free(tmp);
return ret;
}
//--------------------------------------------------------------------------------------------------
BASELIBRARY_PUBLIC_FUNC std::string sizefmt(int64_t s, bool metric) {
float one_kb;
const char *unit;
if (metric) {
one_kb = 1000;
unit = "B";
} else {
one_kb = 1024;
unit = "iB"; // http://en.wikipedia.org/wiki/Binary_prefix
}
if (s < one_kb)
return strfmt("%iB", (int)s);
else {
float value = s / one_kb;
if (value < one_kb)
return strfmt("%.02fK%s", value, unit);
else {
value /= one_kb;
if (value < one_kb)
return strfmt("%.02fM%s", value, unit);
else {
value /= one_kb;
if (value < one_kb)
return strfmt("%.02fG%s", value, unit);
else {
value /= one_kb;
if (value < one_kb)
return strfmt("%.02fT%s", value, unit);
else
return strfmt("%.02fP%s", value / one_kb, unit);
}
}
}
}
}
//--------------------------------------------------------------------------------------------------
/**
* Helper routine to strip a string into an STL string using the printf parameter syntax.
*/
std::string strip_text(const std::string &text, bool left, bool right) { // TODO sigc rewrite it in std/boost way
std::locale loc;
std::function<bool(std::string::value_type)> is_space =
std::bind(&std::isspace<std::string::value_type>, std::placeholders::_1, loc);
std::string::const_iterator l_edge =
!left ? text.begin()
: std::find_if(text.begin(), text.end(),
std::bind(std::logical_not<bool>(), std::bind(is_space, std::placeholders::_1)));
std::string::const_reverse_iterator r_edge =
!right ? text.rbegin()
: std::find_if(text.rbegin(), text.rend(),
std::bind(std::logical_not<bool>(), std::bind(is_space, std::placeholders::_1)));
return std::string(l_edge, r_edge.base());
}
//--------------------------------------------------------------------------------------------------
/**
* replaces a variable from a string in format %variable%
* a filter can be passed to the variable as in %variable|filter%
* supported filters are upper, lower and capitalize
*/
std::string replaceVariable(const std::string &format, const std::string &variable, const std::string &value) {
std::string result = format;
std::string::size_type pos;
for (;;) {
std::string s;
std::string::size_type end;
pos = result.find(variable.substr(0, variable.size() - 1));
if (pos == std::string::npos)
break;
end = result.find('%', pos + 1);
if (end == std::string::npos) // bad format
break;
s = result.substr(pos + 1, end - pos - 1);
std::string::size_type filter_pos = s.find("|");
std::string filtered_value = value;
if (filter_pos == std::string::npos) {
if (s.length() != variable.length() - 2)
break;
} else if (filter_pos != variable.length() - 2)
break;
else {
std::string filter = s.substr(filter_pos + 1, s.size() - filter_pos);
if (filter.compare("capitalize") == 0) {
gunichar ch = g_utf8_get_char(value.data());
ch = g_unichar_toupper(ch);
gchar *rest = g_utf8_find_next_char(value.data(), value.data() + value.size());
char utf8[10];
utf8[g_unichar_to_utf8(ch, utf8)] = 0;
filtered_value = std::string(utf8).append(rest);
} else if (filter.compare("uncapitalize") == 0) {
gunichar ch = g_utf8_get_char(value.data());
ch = g_unichar_tolower(ch);
gchar *rest = g_utf8_find_next_char(value.data(), value.data() + value.size());
char utf8[10];
utf8[g_unichar_to_utf8(ch, utf8)] = 0;
filtered_value = std::string(utf8).append(rest);
} else if (filter.compare("lower") == 0) {
gchar *l = g_utf8_strdown(value.data(), (gssize)value.size());
if (l)
filtered_value = l;
g_free(l);
} else if (filter.compare("upper") == 0) {
gchar *l = g_utf8_strup(value.data(), (gssize)value.size());
if (l)
filtered_value = l;
g_free(l);
}
}
result = result.substr(0, pos).append(filtered_value).append(result.substr(end + 1));
}
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Add the given extension to the filename, if necessary.
*
*/
std::string normalize_path_extension(std::string filename, std::string extension) {
if (!extension.empty() && !filename.empty()) {
std::string::size_type p = filename.rfind('.');
std::string old_extension = p != std::string::npos ? filename.substr(p) : "";
if (old_extension.find('/') != std::string::npos || old_extension.find('\\') != std::string::npos)
old_extension.clear();
if (!extension.empty() && extension[0] != '.')
extension = "." + extension;
if (old_extension.empty())
filename.append(extension);
else {
if (old_extension != extension)
filename = filename.substr(0, p).append(extension);
}
}
return filename;
}
/**
* Removes all unnecessary path separators as well as "./" combinations.
* If there is a parent-dir entry (../) then this as well as the directly prefacing
* dir entry is removed.
*/
std::string normalize_path(const std::string path) {
// First convert all separators to the one that is used on the platform (no mix)
// and ease so at the same time further processing here.
std::string result;
std::string separator(1, G_DIR_SEPARATOR);
result = path;
replaceStringInplace(result, "\\", separator);
replaceStringInplace(result, "/", separator);
std::string double_separator = separator + separator;
while (result.find(double_separator) != std::string::npos)
replaceStringInplace(result, double_separator, separator);
// Sanity check. Return *after* we have converted the slashs. This is part of the normalization.
if (result.size() < 2)
return result;
std::vector<std::string> parts = split(result, separator);
// Construct result backwards while examining the path parts.
result = "";
int pending_count = 0;
for (ssize_t i = parts.size() - 1; i >= 0; i--) {
if (parts[i].compare(".") == 0)
// References to the current directory can be removed without further change.
continue;
if (parts[i].compare("..") == 0) {
// An entry that points back to the parent dir.
// Ignore this and keep track for later removal of the parent dir.
pending_count++;
} else if (pending_count > 0) {
// If this is a normal dir entry and we have pending parent-dir redirections
// then go one step up by removing (ignoring) this entry.
pending_count--;
} else
result = separator + parts[i] + result;
}
// Don't return the leading separator.
return result.substr(1);
}
std::string expand_tilde(const std::string &path) {
if (!path.empty() && path[0] == '~' && (path.size() == 1 || path[1] == G_DIR_SEPARATOR)) {
const char *homedir = g_getenv("HOME");
if (!homedir)
homedir = g_get_home_dir();
return std::string(homedir).append(path.substr(1));
}
return path;
}
//--------------------------------------------------------------------------------------------------
/**
* Checks the input for characters not allowed in the file system and converts them to underscore.
*/
std::string make_valid_filename(const std::string &name) {
std::string result;
std::string illegal_chars = "\\/:?\"<>|*";
for (std::string::const_iterator iterator = name.begin(); iterator != name.end(); ++iterator) {
if (illegal_chars.find(*iterator) != std::string::npos)
result += '_';
else
result += *iterator;
}
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Get a string containing the 'len' left most characters.
*/
std::string left(const std::string &s, size_t len) {
return s.substr(0, len);
}
//--------------------------------------------------------------------------------------------------
/**
* Get a string containing the 'len' right most characters.
*/
std::string right(const std::string &s, size_t len) {
if (len > s.size())
len = s.size();
if (len < 1)
return "";
return s.substr(std::max(s.length() - len, (size_t)0));
}
//--------------------------------------------------------------------------------------------------
/**
* Tests if s begins with part.
*/
bool hasPrefix(const std::string &s, const std::string &part) {
return s.compare(0, part.length(), part) == 0;
}
//--------------------------------------------------------------------------------------------------
/**
* Tests if s ends with part.
*/
bool hasSuffix(const std::string &s, const std::string &part) {
int start_at = (int)s.length() - (int)part.length();
// If start_at < 0 then the search string is bigger then the source, so the results is false.
// On the other hand, if it starts after the end, something went wrong...
if (start_at < 0 || start_at > (int)s.length())
return false;
return s.compare(start_at, std::string::npos, part) == 0;
}
//--------------------------------------------------------------------------------------------------
void replaceStringInplace(std::string &value, const std::string &search, const std::string &replacement) {
std::string::size_type next;
for (next = value.find(search); next != std::string::npos; next = value.find(search, next)) {
value.replace(next, search.length(), replacement);
next += replacement.length();
}
}
//--------------------------------------------------------------------------------------------------
std::string replaceString(const std::string &s, const std::string &from, const std::string &to) {
std::string::size_type p;
std::string ss, res;
ss = s;
p = ss.find(from);
while (p != std::string::npos) {
if (p > 0)
res.append(ss.substr(0, p)).append(to);
else
res.append(to);
ss = ss.substr(p + from.size());
p = ss.find(from);
}
res.append(ss);
return res;
}
//--------------------------------------------------------------------------------------------------
/**
* Write text data to file, converting to \r\n if in Windows.
*/
void setTextFileContent(const std::string &filename, const std::string &data) {
#ifdef _MSC_VER
// Opening a file in text mode will automatically convert \n to \r\n.
FILE *f = base_fopen(filename.c_str(), "w+t");
if (!f)
throw std::runtime_error(g_strerror(errno));
size_t bytes_written = fwrite(data.data(), 1, data.size(), f);
fclose(f);
if (bytes_written != data.size())
throw std::runtime_error(g_strerror(errno));
#else
GError *error = NULL;
g_file_set_contents(filename.c_str(), data.data(), data.size(), &error);
if (error) {
std::string msg = error->message;
g_error_free(error);
throw std::runtime_error(msg);
}
#endif
}
//--------------------------------------------------------------------------------------------------
/**
* Reads text data from the given file (file name encoded as utf-8) and returns the content as utf-8.
* It can read ASCII/ANSI, utf-8 and utf-16 files (LE only) with and without BOM (BOM not included in result).
*/
std::string getTextFileContent(const std::string &filename) {
enum Encoding { ANSI, UTF8, UTF16LE } encoding = ANSI;
std::string result;
#ifdef _MSC_VER
std::ifstream stream(string_to_wstring(filename).c_str(), std::ios::binary);
#else
std::ifstream stream(filename.c_str(), std::ifstream::binary);
#endif
if (!stream.is_open() || stream.eof())
return "";
int ch1 = stream.get();
int ch2 = stream.get();
if (ch1 == 0xff && ch2 == 0xfe)
encoding = UTF16LE;
else if (ch1 == 0xfe && ch2 == 0xff)
return "UTF-16BE not supported";
else {
int ch3 = stream.get();
if (ch1 == 0xef && ch2 == 0xbb && ch3 == 0xbf)
encoding = UTF8;
else
stream.seekg(0);
}
std::string tmp;
stream.seekg(0, std::ios::end);
tmp.reserve(stream.tellg());
stream.seekg(0, std::ios::beg);
tmp.assign((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
switch (encoding) {
case UTF16LE:
return wstring_to_string(std::wstring((const wchar_t *)tmp.data()));
default:
return tmp;
}
}
//--------------------------------------------------------------------------------------------------
/**
* Escape a string to be used in a SQL query
* Same code as used by mysql. Handles null bytes in the middle of the string.
* If wildcards is true then _ and % are masked as well.
*/
std::string escape_sql_string(const std::string &s, bool wildcards) {
std::string result;
result.reserve(s.size());
for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch) {
char escape = 0;
switch (*ch) {
case 0: /* Must be escaped for 'mysql' */
escape = '0';
break;
case '\n': /* Must be escaped for logs */
escape = 'n';
break;
case '\r':
escape = 'r';
break;
case '\\':
escape = '\\';
break;
case '\'':
escape = '\'';
break;
case '"': /* Better safe than sorry */
escape = '"';
break;
case '\032': /* This gives problems on Win32 */
escape = 'Z';
break;
case '_':
if (wildcards)
escape = '_';
break;
case '%':
if (wildcards)
escape = '%';
break;
}
if (escape) {
result.push_back('\\');
result.push_back(escape);
} else
result.push_back(*ch);
}
return result;
}
/**
* Escape a string to be used in a JSON
*/
std::string escape_json_string(const std::string &s) {
std::string result;
result.reserve(s.size());
for (auto ch : s) {
char escape = 0;
switch (ch) {
case '"':
escape = '"';
break;
case '\\':
escape = '\\';
break;
case '\b':
escape = 'b';
break;
case '\f':
escape = 'f';
break;
case '\n':
escape = 'n';
break;
case '\r':
escape = 'r';
break;
case '\t':
escape = 't';
break;
default:
break;
}
if (escape) {
result.push_back('\\');
result.push_back(escape);
} else
result.push_back(ch);
}
return result;
}
/**
* Removes repeated quote chars and supported escape sequences from the given string.
* Invalid escape sequences are handled like in the server, by dropping the backslash and
* using the wrong char as normal char.
* The outer quoting stays intact and is not removed.
*/
std::string unescape_sql_string(const std::string &s, char quote_char) {
// Early out if the string is simply empty but quoted.
if (s.size() == 2 && s[0] == quote_char && s[1] == quote_char)
return s;
std::string result;
result.reserve(s.size());
bool pendingQuote = false;
bool pendingEscape = false;
for (auto c : s) {
if (!pendingEscape && c == quote_char) {
if (pendingQuote)
pendingQuote = false;
else {
pendingQuote = true;
continue;
}
} else {
if (pendingQuote) {
pendingQuote = false;
result.push_back(quote_char);
}
if (pendingEscape) {
pendingEscape = false;
switch (c) {
case 'n':
c = '\n';
break;
case 't':
c = '\t';
break;
case 'r':
c = '\r';
break;
case 'b':
c = '\b';
break;
case '0':
c = 0;
break; // ASCII null
case 'Z':
c = '\032';
break; // Win32 end of file
}
} else if (c == '\\') {
pendingEscape = true;
continue;
}
}
result.push_back(c);
}
if (pendingQuote)
result.push_back(quote_char);
if (pendingEscape)
result.push_back('\\');
return result;
}
//--------------------------------------------------------------------------------------------------
// NOTE: This is not the same as escape_sql_string, as embedded ` must be escaped as ``, not \`
// and \ ' and " must not be escaped
std::string escape_backticks(const std::string &s) {
std::string result;
result.reserve(s.size());
for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch) {
char escape = 0;
switch (*ch) {
case 0: /* Must be escaped for 'mysql' */
escape = '0';
break;
case '\n': /* Must be escaped for logs */
escape = 'n';
break;
case '\r':
escape = 'r';
break;
case '\032': /* This gives problems on Win32 */
escape = 'Z';
break;
case '`':
// special case
result.push_back('`');
break;
}
if (escape) {
result.push_back('\\');
result.push_back(escape);
} else
result.push_back(*ch);
}
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Parses the given command line (which must be a usual mysql start command) and extracts the
* value for the given parameter. The function can only return options of the form "option-name = option-value"
* (both quoted and unquoted).
*/
std::string extract_option_from_command_line(const std::string &option, const std::string &command_line) {
std::string result;
size_t position = command_line.find(option);
if (position != std::string::npos) {
position += option.size(); // Skip option name and find equal sign.
while (position < command_line.size() && command_line[position] != '=')
position++;
if (command_line[position] == '=') {
position++;
// Skip any white space.
while (position < command_line.size() && command_line[position] == ' ')
position++;
char terminator;
if (command_line[position] == '"' || command_line[position] == '\'')
terminator = command_line[position++];
else
terminator = ' ';
size_t end_position = command_line.find(terminator, position);
if (end_position == std::string::npos) {
// Terminator not found means the string was either not properly terminated (if quoted)
// or contains no space char. In this case take everything we can get.
if (terminator != ' ')
position++;
result = command_line.substr(position);
} else
result = command_line.substr(position, end_position - position);
}
}
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Splits the given font description and returns its details in the provided fields.
*
* @return True if successful, otherwise false.
*/
bool parse_font_description(const std::string &fontspec, std::string &font, float &size, bool &bold, bool &italic) {
std::vector<std::string> parts = split(fontspec, " ");
font = fontspec;
size = 12;
bold = false;
italic = false;
if (parts.empty())
return false;
for (std::vector<std::string>::iterator iter = parts.begin(); iter != parts.end(); ++iter) {
float size_check = 0;
if (sscanf(iter->c_str(), "%f", &size_check) == 1) {
size = size_check;
parts.erase(iter);
break;
}
}
/*
if (!parts.empty() && sscanf(parts.back().c_str(), "%f", &size) == 1)
parts.pop_back();*/
for (int i = 0; i < 2 && !parts.empty(); i++) {
if (g_ascii_strcasecmp(parts.back().c_str(), "bold") == 0) {
bold = true;
parts.pop_back();
}
if (g_ascii_strcasecmp(parts.back().c_str(), "italic") == 0) {
italic = true;
parts.pop_back();
}
}
if (!parts.empty()) {
font = parts[0];
for (unsigned int i = 1; i < parts.size(); i++)
font += " " + parts[i];
}
return true;
}
//--------------------------------------------------------------------------------------------------
std::string unquote_identifier(const std::string &identifier) {
int start = 0;
int size = (int)identifier.size();
if (size == 0)
return "";
if (identifier[0] == '"' || identifier[0] == '`')
start++;
if (identifier[size - 1] == '"' || identifier[size - 1] == '`')
size--;
size -= start;
return identifier.substr(start, size);
}
//--------------------------------------------------------------------------------------------------
/**
* @brief Remove outer quotes from any text.
*
* @param text Text to unquote
* @return Return unqoted text.
*/
std::string unquote(const std::string &text) {
if (text.size() < 2)
return text;
if ((text[0] == '"' || text[0] == '`' || text[0] == '\'') && text[0] == text[text.size() - 1])
return text.substr(1, text.size() - 2);
return text;
}
//--------------------------------------------------------------------------------------------------
std::string quote_identifier(const std::string &identifier, const char quote_char) {
return quote_char + identifier + quote_char;
}
//--------------------------------------------------------------------------------------------------
/**
* Quotes the given identifier, but only if it needs to be quoted.
*/
std::string quoteIdentifierIfNeeded(const std::string &ident, const char quote_char, MySQLVersion version) {
bool needs_quotation = MySQLSymbolInfo::isReservedKeyword(ident, version);
size_t digits = 0;
if (!needs_quotation) {
for (std::string::const_iterator i = ident.begin(); i != ident.end(); ++i) {
if ((*i >= 'a' && *i <= 'z') || (*i >= 'A' && *i <= 'Z') || (*i >= '0' && *i <= '9') || (*i == '_') ||
(*i == '$') || ((unsigned char)(*i) > 0x7F)) {
if (*i >= '0' && *i <= '9')
digits++;
continue;
}
needs_quotation = true;
break;
}
}
if (needs_quotation || digits == ident.length())
return quote_char + ident + quote_char;
else
return ident;
}
bool is_number(const std::string &word) {
if (word.empty())
return false;
size_t i = 0;
if (word[0] == '-')
i++;
for (; i < word.size(); i++)
if (!isdigit(word[i]))
return false;
return true;
}
//--------------------------------------------------------------------------------------------------
/**
* @brief Determine if a string is a boolean.
*
* @param text Text to check
* @return Return true if given string is a boolean.
**/
bool isBool(const std::string &text) {
std::string transformed;
std::transform(text.begin(), text.end(), std::back_inserter(transformed), ::tolower);
if (transformed.compare("true") != 0 && transformed.compare("false") != 0)
return false;
return true;
}
//--------------------------------------------------------------------------------------------------
/**
* Function : stl_string_compare
* Description : comparison function to be used on the sorting process
* Return Value : following the STL requirements should return true if the
* first string is lower than the second
*/
bool stl_string_compare(const std::string &first, const std::string &second, bool case_sensitive) {
return string_compare(first, second, case_sensitive) < 0;
}
//--------------------------------------------------------------------------------------------------
/**
* Culturally correct string comparison. Also properly compares different normalization forms.
* For a large amount of strings this function is not very effective as it generates the sort keys
* repeatedly (not to mention normalization).
* So if we ever need sorting of 10000 strings we have to add a separate implementation.
*
* @param first, the left string to compare.
* @param second, the right string to compare.
* @result 0 - If the strings are equal.
* < 0 - If first sorts before second.
* > 0 - If second sorts before first.
*/
int string_compare(const std::string &first, const std::string &second, bool case_sensitive) {
int result = 0;
gchar *left = g_utf8_normalize(first.c_str(), -1, G_NORMALIZE_DEFAULT);
gchar *right = g_utf8_normalize(second.c_str(), -1, G_NORMALIZE_DEFAULT);
if (!case_sensitive) {
gchar *s1 = g_utf8_casefold(left, -1);
gchar *s2 = g_utf8_casefold(right, -1);
result = g_utf8_collate(s1, s2);
g_free(s1);
g_free(s2);
} else
result = g_utf8_collate(left, right);
g_free(left);
g_free(right);
return result;
}
//--------------------------------------------------------------------------------------------------
/**
* Convenience function to determine if 2 strings are the same. This works also for culturally
* equal letters (e.g. german ß and ss) and any normalization form.
*/
bool same_string(const std::string &first, const std::string &second, bool case_sensitive) {
return string_compare(first, second, case_sensitive) == 0;
}
//--------------------------------------------------------------------------------------------------
/**
* Determines if the given candidate is part of the given text. As with the string_compare matches
* are culturally correct.
*/
bool contains_string(const std::string &text, const std::string &candidate, bool case_sensitive) {
if (text.size() == 0 || candidate.size() == 0)
return false;
gchar *hay_stack = g_utf8_normalize(text.c_str(), -1, G_NORMALIZE_DEFAULT);
gchar *needle = g_utf8_normalize(candidate.c_str(), -1, G_NORMALIZE_DEFAULT);
if (!case_sensitive) {
gchar *temp = g_utf8_casefold(hay_stack, -1);
g_free(hay_stack);
hay_stack = temp;
temp = g_utf8_casefold(needle, -1);
g_free(needle);
needle = temp;
}
gunichar start_char = g_utf8_get_char(needle);
bool result = false;
gchar *run = hay_stack;
while (!result) {
gchar *p = g_utf8_strchr(run, -1, start_char);
if (p == NULL)
break;
// Found the start char in the remaining text. See if that part matches the needle.
gchar *needle_run = needle;
bool mismatch = false;
for (size_t i = 0; i < candidate.size(); ++i, ++p, ++needle_run) {
if (g_utf8_get_char(needle_run) != g_utf8_get_char(p)) {
mismatch = true;
break;
}
}
if (mismatch)
++run;
else
result = true;
}
g_free(hay_stack);
g_free(needle);
return result;
}
//--------------------------------------------------------------------------------------------------
EolHelpers::Eol_format EolHelpers::detect(const std::string &text) {
std::string::size_type pos = text.find_first_of("\r\n");
if (std::string::npos == pos)
return default_eol_format();
if ('\r' == text[pos])
return ('\n' == text[pos + 1]) ? eol_crlf : eol_cr;
else
return eol_lf;
}
int EolHelpers::count_lines(const std::string &text) {
Eol_format eol_format = detect(text);
char eol_sym = (eol_cr == eol_format) ? '\r' : '\n';
return (int)std::count(text.begin(), text.end(), eol_sym);
}
bool EolHelpers::check(const std::string &text) {
std::string::size_type pos = text.find_first_of("\n\r");
if (std::string::npos == pos)
return true;
Eol_format eol_format = detect(text);
if (eol_lf == eol_format) {
if (text.find("\r") != std::string::npos)
return false;
} else if (eol_cr == eol_format) {
if (text.find("\n") != std::string::npos)
return false;
} else if (eol_crlf == eol_format) {
do {
if (('\n' == text[pos]) || ('\n' != text[pos + 1]))
return false;
++pos;
++pos;
pos = text.find_first_of("\n\r", pos);
} while (std::string::npos != pos);
}
return true;
}
void EolHelpers::conv(const std::string &src_text, Eol_format src_eol_format, std::string &dest_text,
Eol_format dest_eol_format) {
if (src_eol_format == dest_eol_format)
throw std::logic_error("source and target line ending formats coincide, no need to convert");
const std::string &src_eol = eol(src_eol_format);
const std::string &dest_eol = eol(dest_eol_format);
std::string::size_type src_eol_length = src_eol.size();
if (dest_eol.size() != src_eol.size()) {
dest_text.clear();
int line_count = count_lines(src_text);
size_t dest_size = src_text.size() + line_count * (dest_eol.size() - src_eol.size());
dest_text.reserve(dest_size);
std::string::size_type prev_pos = 0;
std::string::size_type pos = 0;
while ((pos = src_text.find(src_eol, pos)) != std::string::npos) {
dest_text.append(src_text, prev_pos, pos - prev_pos).append(dest_eol);
pos += src_eol_length;
prev_pos = pos;
}
dest_text.append(src_text, prev_pos, std::string::npos);
} else {
dest_text = src_text;
std::string::size_type pos = 0;
while ((pos = dest_text.find(src_eol, pos)) != std::string::npos) {
dest_text.replace(pos, src_eol_length, dest_eol);
pos += src_eol_length;
}
}
}
void EolHelpers::fix(const std::string &src_text, std::string &dest_text, Eol_format eol_format) {
const std::string &dest_eol = eol(eol_format);
std::string::size_type dest_eol_length = dest_eol.size();
dest_text.clear();
if (eol_crlf == eol_format) {
int cr_count = (int)std::count(src_text.begin(), src_text.end(), '\r');
int lf_count = (int)std::count(src_text.begin(), src_text.end(), '\n');
int crlf_count = 0;
{
std::string::size_type pos = 0;
while ((pos = src_text.find(dest_eol, pos)) != std::string::npos) {
++crlf_count;
pos += dest_eol_length;
}
}
size_t dest_size = src_text.size() + (cr_count - crlf_count) + (lf_count - crlf_count);
dest_text.reserve(dest_size);
}
std::string::size_type prev_pos = 0;
std::string::size_type pos = 0;
std::string crlf = "\r\n";
while ((pos = src_text.find_first_of(crlf, pos)) != std::string::npos) {
dest_text.append(src_text, prev_pos, pos - prev_pos).append(dest_eol);
if (('\r' == src_text[pos]) && ('\n' == src_text[pos + 1]))
++pos;
++pos;
prev_pos = pos;
}
dest_text.append(src_text, prev_pos, std::string::npos);
}
//--------------------------------------------------------------------------------------------------
std::string reflow_text(const std::string &text, unsigned int line_length, const std::string &left_fill,
bool indent_first, unsigned int max_lines) {
bool use_fill = true;
const unsigned int minimum_text_length = 5;
// Check if the line length complies to the minimum required
if (line_length < minimum_text_length)
return "";
// Only use left_fill when it's small enough to fit in the line and make the function able
// to do what it has to do
const unsigned int left_fill_length = (unsigned)left_fill.size();
if (left_fill_length + minimum_text_length >= line_length)
use_fill = false;
// Check for empty string...if we let it go, a left_fill will be inserted
if (text.size() == 0)
return "";
// Check if it's a valid utf8 string
const char *invalid_data_ptr = NULL;
if (g_utf8_validate(text.c_str(), (gsize)text.size(), &invalid_data_ptr) != TRUE)
throw std::invalid_argument(std::string("base::reflow_text received an invalid string: ") + text);
const std::string initial = (indent_first && use_fill) ? left_fill : "";
const std::string new_line = use_fill ? std::string("\n") + left_fill : std::string("\n");
std::string result = initial;
const char *char_string = text.c_str();
const char *iter = char_string;
unsigned int space_position_source = 0;
unsigned int line_char_counter = 0;
unsigned int line_counter = 0;
unsigned int char_count_after_space = 0;
unsigned int text_real_length = use_fill ? line_length - left_fill_length : line_length;
while (*iter) {
// Get the full utf8 char into the result string
result += std::string(iter, g_utf8_skip[*(const guchar *)(iter)]);
line_char_counter++;
char_count_after_space++;
if (g_unichar_isspace(*iter) && line_char_counter > left_fill_length) {
space_position_source = (unsigned)(iter - char_string + 1);
char_count_after_space = 0;
}
if (line_char_counter == text_real_length) {
// Check for special case when we have a word as big as a line
if (char_count_after_space == text_real_length) {
result += new_line;
space_position_source += char_count_after_space;
line_char_counter = char_count_after_space = 0;
} else {
// Find last space character position in the result string
unsigned int break_position =
space_position_source + line_counter * (unsigned)new_line.size() + (unsigned)initial.size();
// Insert a \n in the right position, right after the space char(or at the end of the string)
result.size() == break_position ? result += new_line : result.insert(break_position, new_line);
// Mark the characters that were already inserted after the new line
line_char_counter = char_count_after_space;
}
if (++line_counter == max_lines) {
result.resize(result.size() - char_count_after_space - new_line.size());
result += "\n(...)";
break;
}
}
iter = g_utf8_next_char((gchar *)iter); // Get the next char from the sequence
}
#ifdef DEBUG
if (g_utf8_validate(result.c_str(), result.size(), &invalid_data_ptr) != TRUE)
throw std::logic_error(
strfmt("base::reflow_text produced an invalid string:\nInput:\n%s\nOutput:\n%s", text.c_str(), result.c_str()));
#endif
return result;
}
} // namespace base