lib/core/CStringUtils.cc (811 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CStringUtils.h>
#include <core/CLogger.h>
#include <core/CRegex.h>
#include <core/CStrCaseCmp.h>
#include <core/Constants.h>
#include <boost/multi_array.hpp>
#include <algorithm>
#include <cmath>
#include <limits>
#include <locale>
#include <ctype.h>
#include <errno.h>
#include <fstream>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
namespace {
//! In order to avoid a failure on read we need to account for the rounding
//! performed by strtod which can result in the rounded value being outside
//! the representable range for values near minimum and maximum double.
double clampToReadable(double x) {
static const double SMALLEST = -(1.0 - 5e-16) * std::numeric_limits<double>::max();
static const double LARGEST = (1.0 - 5e-16) * std::numeric_limits<double>::max();
return (x < SMALLEST ? SMALLEST : x > LARGEST ? LARGEST : x);
}
//! Get a locale object for character transformations.
//!
//! TODO - remove when we switch to a character conversion library (e.g. ICU).
const std::locale& locale() {
static std::locale loc;
return loc;
}
// To ensure the singleton locale is constructed before multiple threads may
// require it, call locale() during the static initialisation phase of the
// program. Of course, the locale may already be constructed before this if
// another static object has used it.
const std::locale& DO_NOT_USE_THIS_VARIABLE = locale();
// Constants for parsing & converting memory size strings in standard ES format
const std::string MEMORY_SIZE_FORMAT{"([\\d]+)(b|k|kb|m|mb|g|gb|t|tb|p|pb)"};
const char BYTES{'b'};
const char KILOBYTES{'k'};
const char MEGABYTES{'m'};
const char GIGABYTES{'g'};
const char TERABYTES{'t'};
const char PETABYTES{'p'};
}
namespace ml {
namespace core {
// Initialise static class members
const std::string CStringUtils::WHITESPACE_CHARS(" \t\r\n\v\f");
int CStringUtils::utf8ByteType(char c) {
unsigned char u = static_cast<unsigned char>(c);
if ((u & 0x80) == 0) {
// Single byte character
return 1;
}
if ((u & 0xC0) == 0x80) {
// Continuation character
return -1;
}
if ((u & 0xE0) == 0xC0) {
// Start of two byte character
return 2;
}
if ((u & 0xF0) == 0xE0) {
// Start of three byte character
return 3;
}
if ((u & 0xF8) == 0xF0) {
// Start of four byte character
return 4;
}
if ((u & 0xFC) == 0xF8) {
// Start of five byte character
return 5;
}
return 6;
}
std::size_t CStringUtils::utf16LengthOfUtf8String(const std::string& str) {
std::size_t result{0};
for (const auto& c : str) {
int byteType{utf8ByteType(c)};
if (byteType >= 1 && byteType <= 3) {
result += 1;
} else if (byteType >= 4 && byteType <= 6) {
result += 2;
} else if (byteType == -1) {
result += 0; // I guess this will optimized out
} else {
LOG_ERROR(<< "Input error: unexpected utf8ByteType " << byteType);
}
}
return result;
}
std::string CStringUtils::toLower(std::string str) {
std::transform(str.begin(), str.end(), str.begin(), &::tolower);
return str;
}
std::string CStringUtils::toUpper(std::string str) {
std::transform(str.begin(), str.end(), str.begin(), &::toupper);
return str;
}
size_t CStringUtils::numMatches(const std::string& str, const std::string& word) {
size_t count(0);
std::string::size_type pos(0);
while (pos != std::string::npos) {
pos = str.find(word, pos);
if (pos != std::string::npos) {
++count;
// start next search after this word
pos += word.length();
}
}
return count;
}
void CStringUtils::trimWhitespace(std::string& str) {
CStringUtils::trim(WHITESPACE_CHARS, str);
}
void CStringUtils::trim(const std::string& toTrim, std::string& str) {
if (toTrim.empty() || str.empty()) {
return;
}
std::string::size_type pos = str.find_last_not_of(toTrim);
if (pos == std::string::npos) {
// Special case - entire string is being trimmed
str.clear();
return;
}
str.erase(pos + 1);
pos = str.find_first_not_of(toTrim);
if (pos != std::string::npos && pos > 0) {
str.erase(0, pos);
}
}
std::string CStringUtils::normaliseWhitespace(const std::string& str) {
std::string result;
result.reserve(str.length());
bool outputSpace(true);
for (std::string::const_iterator iter = str.begin(); iter != str.end(); ++iter) {
char current(*iter);
if (::isspace(static_cast<unsigned char>(current))) {
if (outputSpace) {
outputSpace = false;
result += ' ';
}
} else {
outputSpace = true;
result += current;
}
}
return result;
}
size_t CStringUtils::replace(const std::string& from, const std::string& to, std::string& str) {
if (from == to) {
return 0;
}
size_t count(0);
std::string::size_type pos(0);
while (pos != std::string::npos) {
pos = str.find(from, pos);
if (pos == std::string::npos) {
return count;
}
str.replace(pos, from.size(), to);
++count;
pos += to.size();
}
return count;
}
size_t CStringUtils::replaceFirst(const std::string& from,
const std::string& to,
std::string& str) {
if (from == to) {
return 0;
}
std::string::size_type pos = str.find(from);
if (pos == std::string::npos) {
return 0;
}
str.replace(pos, from.size(), to);
return 1;
}
void CStringUtils::escape(char escape, const std::string& toEscape, std::string& str) {
if (escape == '\0' || toEscape.empty()) {
return;
}
std::string::size_type pos(0);
while (pos < str.length()) {
pos = str.find_first_of(toEscape, pos);
if (pos == std::string::npos) {
break;
}
str.insert(pos, 1, escape);
// Skip the next 2 characters as we don't want to process the same one
// twice and we've just inserted one
pos += 2;
}
}
void CStringUtils::unEscape(char escape, std::string& str) {
if (escape == '\0') {
return;
}
std::string::size_type pos(0);
while (pos < str.length()) {
pos = str.find(escape, pos);
if (pos == std::string::npos) {
break;
} else if (pos + 1 == str.length()) {
LOG_WARN(<< "String to be unescaped ends with escape character: " << str);
}
str.erase(pos, 1);
// Skip the next character so that an escaped escape character
// is converted to a single escape character
++pos;
}
}
std::string CStringUtils::_typeToString(const unsigned long long& i) {
char buf[4 * sizeof(unsigned long long)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%llu", i);
return buf;
}
std::string CStringUtils::_typeToString(const unsigned long& i) {
char buf[4 * sizeof(unsigned long)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%lu", i);
return buf;
}
std::string CStringUtils::_typeToString(const unsigned int& i) {
char buf[4 * sizeof(unsigned int)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%u", i);
return buf;
}
std::string CStringUtils::_typeToString(const unsigned short& i) {
char buf[4 * sizeof(unsigned short)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%hu", i);
return buf;
}
std::string CStringUtils::_typeToString(const long long& i) {
char buf[4 * sizeof(long long)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%lld", i);
return buf;
}
std::string CStringUtils::_typeToString(const long& i) {
char buf[4 * sizeof(long)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%ld", i);
return buf;
}
std::string CStringUtils::_typeToString(const int& i) {
char buf[4 * sizeof(int)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%d", i);
return buf;
}
std::string CStringUtils::_typeToString(const short& i) {
char buf[4 * sizeof(short)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%hd", i);
return buf;
}
std::string CStringUtils::_typeToString(const bool& b) {
return (b ? "true" : "false");
}
std::string CStringUtils::_typeToString(const double& i) {
// Note the extra large buffer here, which is because the format string is
// "%f" rather than "%g", which means we could be printing a 308 digit
// number without resorting to scientific notation
char buf[64 * sizeof(double)];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%f", i);
return buf;
}
std::string CStringUtils::_typeToString(const char* str) {
return str;
}
std::string CStringUtils::_typeToString(const char& c) {
return std::string(1, c);
}
// This may seem silly, but it allows generic code to be written
const std::string& CStringUtils::_typeToString(const std::string& str) {
return str;
}
std::string CStringUtils::typeToStringPretty(double d) {
// Maximum size = 1 (for sign)
// + 7 (for # s.f.)
// + 1 (for decimal point)
// + 5 (for w.c. e+308)
// + 1 (for terminating character)
// = 16
char buf[16];
::memset(buf, 0, sizeof(buf));
::sprintf(buf, "%.7g", d);
return buf;
}
std::string CStringUtils::typeToStringPrecise(double d, CIEEE754::EPrecision precision) {
// Just use a large enough buffer to hold maximum precision.
char buf[4 * sizeof(double)];
::memset(buf, 0, sizeof(buf));
// Floats need higher precision to precisely round trip to decimal than
// considering their effective precision base 10. There's a good discussion
// at https://randomascii.wordpress.com/2012/03/08/float-precisionfrom-zero-to-100-digits-2/.
// We use g format since it is the most efficient. Note also that when
// printing to limited precision we must round the value before printing
// because printing just truncates, i.e. sprintf(buf, "%.6e", 0.49999998)
// gives 4.999999e-1 rather than the correctly rounded value 0.5.
int ret = 0;
switch (precision) {
case CIEEE754::E_HalfPrecision:
ret = ::sprintf(buf, "%.5g",
clampToReadable(CIEEE754::round(d, CIEEE754::E_HalfPrecision)));
break;
case CIEEE754::E_SinglePrecision:
ret = ::sprintf(buf, "%.9g",
clampToReadable(CIEEE754::round(d, CIEEE754::E_SinglePrecision)));
break;
case CIEEE754::E_DoublePrecision:
ret = ::sprintf(buf, "%.17g", clampToReadable(d));
break;
}
// Workaround for Visual C++ 2010 misformatting - replace
// 123.45e010 with 123.45e10 and 123.45e-010 with 123.45e-10.
// Also it is inefficient to output trailing zeros, i.e.
// 1.23456000000000e-11 so we strip these off in the following.
if (ret > 2) {
// Look for an 'e'
char* ptr(static_cast<char*>(::memchr(buf, 'e', ret - 1)));
if (ptr != nullptr) {
bool edit = false;
bool minus = false;
// Strip off any trailing zeros and a trailing point.
char* bwd = ptr;
for (;;) {
--bwd;
if (*bwd == '0' || *bwd == '.') {
edit = true;
} else {
break;
}
}
// Strip off any leading zeros in the exponent.
char* fwd = ptr;
for (;;) {
++fwd;
if (*fwd == '-') {
minus = true;
} else if (*fwd == '+' || *fwd == '0') {
edit = true;
} else {
break;
}
}
if (edit) {
std::string adjResult;
adjResult.reserve(ret - 1);
// mantissa
adjResult.assign(buf, bwd + 1);
if (::isdigit(static_cast<unsigned char>(*fwd))) {
adjResult.append(minus ? "e-" : "e");
// exponent
adjResult.append(fwd);
}
return adjResult;
}
}
}
return buf;
}
CStringUtils::TSizeBoolPr
CStringUtils::memorySizeStringToBytes(const std::string& memorySizeStr, std::size_t defaultValue) {
if (memorySizeStr.empty()) {
LOG_ERROR(<< "Unable to parse empty memory size string");
return {defaultValue, false};
}
CRegex regex;
if (regex.init(MEMORY_SIZE_FORMAT) == false) {
LOG_ERROR(<< "Unable to init regex " << MEMORY_SIZE_FORMAT);
return {defaultValue, false};
}
CRegex::TStrVec tokens;
const std::string lowerSizeStr{toLower(memorySizeStr)};
// permit '0' to be unit-less
if (lowerSizeStr == "0") {
return {std::size_t{0}, true};
}
if (regex.tokenise(lowerSizeStr, tokens) == false) {
LOG_ERROR(<< "Unable to parse a memory limit from " << memorySizeStr);
return {defaultValue, false};
}
if (tokens.size() != 2) {
LOG_INFO(<< "Got wrong number of tokens:: " << tokens.size());
return {defaultValue, false};
}
const std::string& sizeStr = tokens[0];
const std::string& multiplierStr = tokens[1];
std::size_t size{0};
if (stringToType(sizeStr, size) == false) {
LOG_ERROR(<< "Could not convert " << sizeStr << " to an integral size.");
return {defaultValue, false};
}
// The assumption here is that the model memory limit string has already been validated
// i.e. the limit is already rounded down to the closest byte with a minimum value of 1 byte.
if (multiplierStr[0] == BYTES) {
// no-op
} else if (multiplierStr[0] == KILOBYTES) {
size *= constants::BYTES_IN_KILOBYTES;
} else if (multiplierStr[0] == MEGABYTES) {
size *= constants::BYTES_IN_MEGABYTES;
} else if (multiplierStr[0] == GIGABYTES) {
size *= constants::BYTES_IN_GIGABYTES;
} else if (multiplierStr[0] == TERABYTES) {
size *= constants::BYTES_IN_TERABYTES;
} else if (multiplierStr[0] == PETABYTES) {
size *= constants::BYTES_IN_PETABYTES;
}
return {size, true};
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned long long& i) {
if (str.empty()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert empty string to unsigned long long");
}
return false;
}
char* endPtr(nullptr);
errno = 0;
unsigned long long ret(::strtoull(str.c_str(), &endPtr, 0));
if (ret == 0 && errno == EINVAL) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long long: " << ::strerror(errno));
}
return false;
}
if (ret == ULLONG_MAX && errno == ERANGE) // note ULLONG_MAX used for compatability with strtoull
{
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long long: " << ::strerror(errno));
}
return false;
}
if (endPtr != nullptr && *endPtr != '\0') {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long long: first invalid character " << endPtr);
}
return false;
}
i = ret;
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned long& i) {
if (str.empty()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert empty string to unsigned long");
}
return false;
}
char* endPtr(nullptr);
errno = 0;
unsigned long ret(::strtoul(str.c_str(), &endPtr, 0));
if (ret == 0 && errno == EINVAL) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long: " << ::strerror(errno));
}
return false;
}
if (ret == ULONG_MAX && errno == ERANGE) // note ULONG_MAX used for compatability with strtoul
{
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long: " << ::strerror(errno));
}
return false;
}
if (endPtr != nullptr && *endPtr != '\0') {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned long: first invalid character " << endPtr);
}
return false;
}
i = ret;
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned int& i) {
// First try to convert to unsigned long.
// If that works check the range for unsigned int.
unsigned long ret(0);
if (CStringUtils::_stringToType(silent, str, ret) == false) {
return false;
}
// Now check if the result is in range for unsigned int
if (ret > std::numeric_limits<unsigned int>::max()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str << "' to unsigned int - out of range");
}
return false;
}
i = static_cast<unsigned int>(ret);
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, unsigned short& i) {
// First try to convert to unsigned long.
// If that works check the range for unsigned short.
unsigned long ret(0);
if (CStringUtils::_stringToType(silent, str, ret) == false) {
return false;
}
// Now check if the result is in range for unsigned short
if (ret > std::numeric_limits<unsigned short>::max()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to unsigned short - out of range");
}
return false;
}
i = static_cast<unsigned short>(ret);
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, long long& i) {
if (str.empty()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert empty string to long long");
}
return false;
}
char* endPtr(nullptr);
errno = 0;
long long ret(::strtoll(str.c_str(), &endPtr, 0));
if (ret == 0 && errno == EINVAL) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long long: " << ::strerror(errno));
}
return false;
}
if ((ret == LLONG_MIN || ret == LLONG_MAX) && errno == ERANGE) // note LLONG_MAX used for compatability with strtoll
{
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long long: " << ::strerror(errno));
}
return false;
}
if (endPtr != nullptr && *endPtr != '\0') {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long long: first invalid character " << endPtr);
}
return false;
}
i = ret;
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, long& i) {
if (str.empty()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert empty string to long");
}
return false;
}
char* endPtr(nullptr);
errno = 0;
long ret(::strtol(str.c_str(), &endPtr, 0));
if (ret == 0 && errno == EINVAL) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long: " << ::strerror(errno));
}
return false;
}
if ((ret == LONG_MIN || ret == LONG_MAX) && errno == ERANGE) // note LONG_MAX used for compatability with strtol
{
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long: " << ::strerror(errno));
}
return false;
}
if (endPtr != nullptr && *endPtr != '\0') {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to long: first invalid character " << endPtr);
}
return false;
}
i = ret;
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, int& i) {
// First try to convert to long. If that works check the range for int.
long ret(0);
if (CStringUtils::_stringToType(silent, str, ret) == false) {
return false;
}
// Now check if the result is in range for int
if (ret < std::numeric_limits<int>::min() || ret > std::numeric_limits<int>::max()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "'"
" to int - out of range");
}
return false;
}
i = static_cast<int>(ret);
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, short& i) {
// First try to convert to long. If that works check the range for short.
long ret(0);
if (CStringUtils::_stringToType(silent, str, ret) == false) {
return false;
}
// Now check if the result is in range for short
if (ret < std::numeric_limits<short>::min() ||
ret > std::numeric_limits<short>::max()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str << "' to short - out of range");
}
return false;
}
i = static_cast<short>(ret);
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, bool& ret) {
switch (str.length()) {
case 0:
if (!silent) {
LOG_ERROR(<< "Cannot convert empty string to bool");
}
return false;
case 1:
switch (str[0]) {
case 'T':
case 'Y':
case 't':
case 'y':
ret = true;
return true;
case 'F':
case 'N':
case 'f':
case 'n':
ret = false;
return true;
}
break;
case 2:
if (CStrCaseCmp::strCaseCmp(str.c_str(), "no") == 0) {
ret = false;
return true;
}
if (CStrCaseCmp::strCaseCmp(str.c_str(), "on") == 0) {
ret = true;
return true;
}
break;
case 3:
if (CStrCaseCmp::strCaseCmp(str.c_str(), "yes") == 0) {
ret = true;
return true;
}
if (CStrCaseCmp::strCaseCmp(str.c_str(), "off") == 0) {
ret = false;
return true;
}
break;
case 4:
if (CStrCaseCmp::strCaseCmp(str.c_str(), "true") == 0) {
ret = true;
return true;
}
break;
case 5:
if (CStrCaseCmp::strCaseCmp(str.c_str(), "false") == 0) {
ret = false;
return true;
}
break;
}
long l(0);
if (CStringUtils::_stringToType(silent, str, l) == false) {
if (!silent) {
LOG_ERROR(<< "Cannot convert " << str << " to bool");
}
return false;
}
ret = (l != 0);
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, double& d) {
if (str.empty()) {
if (!silent) {
LOG_ERROR(<< "Unable to convert empty string to double");
}
return false;
}
char* endPtr(nullptr);
errno = 0;
double ret(::strtod(str.c_str(), &endPtr));
if (ret == 0 && errno == EINVAL) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to double: " << ::strerror(errno));
}
return false;
}
if ((ret == HUGE_VAL || ret == -HUGE_VAL) && errno == ERANGE) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to double: " << ::strerror(errno));
}
return false;
}
if (endPtr != nullptr && *endPtr != '\0') {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to double: first invalid character " << endPtr);
}
return false;
}
d = ret;
return true;
}
bool CStringUtils::_stringToType(bool silent, const std::string& str, char& c) {
if (str.length() != 1) {
if (!silent) {
LOG_ERROR(<< "Unable to convert string '" << str
<< "' to char: " << (str.empty() ? "too short" : "too long"));
}
return false;
}
c = str[0];
return true;
}
// This may seem silly, but it allows generic code to be written
bool CStringUtils::_stringToType(bool /* silent */, const std::string& str, std::string& outStr) {
outStr = str;
return true;
}
void CStringUtils::tokenise(const std::string& delim,
const std::string& str,
TStrVec& tokens,
std::string& remainder) {
std::string::size_type pos(0);
for (;;) {
std::string::size_type pos2(str.find(delim, pos));
if (pos2 == std::string::npos) {
remainder.assign(str, pos, str.size() - pos);
break;
} else {
tokens.push_back(str.substr(pos, pos2 - pos));
pos = pos2 + delim.size();
}
}
}
std::string CStringUtils::longestCommonSubstr(const std::string& str1,
const std::string& str2) {
std::string common;
if (str1.empty() || str2.empty()) {
return common;
}
size_t firstLen(str1.length());
size_t secondLen(str2.length());
// Set up the matrix
using T2DSizeArray = boost::multi_array<size_t, 2>;
T2DSizeArray matrix(boost::extents[firstLen][secondLen]);
size_t maxLen(0);
size_t lastSubstrBegin(0);
for (size_t i = 0; i < firstLen; ++i) {
for (size_t j = 0; j < secondLen; ++j) {
if (str1[i] != str2[j]) {
matrix[i][j] = 0;
} else {
if (i == 0 || j == 0) {
matrix[i][j] = 1;
} else {
matrix[i][j] = 1 + matrix[i - 1][j - 1];
}
if (matrix[i][j] > maxLen) {
maxLen = matrix[i][j];
size_t thisSubstrBegin(i - maxLen + 1);
if (lastSubstrBegin == thisSubstrBegin) {
// We're continuing the current longest common substring
common += str1[i];
} else {
// We're starting a new longest common substring
common.assign(str1, thisSubstrBegin, maxLen);
lastSubstrBegin = thisSubstrBegin;
}
}
}
}
}
return common;
}
std::string CStringUtils::longestCommonSubsequence(const std::string& str1,
const std::string& str2) {
std::string common;
if (str1.empty() || str2.empty()) {
return common;
}
size_t firstLen(str1.length());
size_t secondLen(str2.length());
// Set up the matrix - dimensions are one bigger than the string lengths
using T2DSizeArray = boost::multi_array<size_t, 2>;
T2DSizeArray matrix(boost::extents[firstLen + 1][secondLen + 1]);
// Initialise the top row and left column of the matrix to zero
for (size_t i = 0; i <= firstLen; ++i) {
matrix[i][0] = 0;
}
for (size_t j = 0; j <= secondLen; ++j) {
matrix[0][j] = 0;
}
// Fill in the rest of the matrix
for (size_t i = 1; i <= firstLen; ++i) {
for (size_t j = 1; j <= secondLen; ++j) {
if (str1[i - 1] == str2[j - 1]) {
matrix[i][j] = matrix[i - 1][j - 1] + 1;
} else {
matrix[i][j] = std::max(matrix[i][j - 1], matrix[i - 1][j]);
}
}
}
// Find the length of the longest common subsequence from the bottom right
// corner of the matrix - if this length is zero, we don't need to backtrack
// to find the actual characters
size_t seqLen(matrix[firstLen][secondLen]);
if (seqLen > 0) {
// Create a string of NULLs to be overwritten (in reverse order) by the
// actual characters
common.resize(seqLen);
size_t resPos(seqLen - 1);
// Now backtrack through the matrix to find the common sequence
size_t i(firstLen);
size_t j(secondLen);
while (i > 0 && j > 0) {
if (str1[i - 1] == str2[j - 1]) {
common[resPos] = str1[i - 1];
// If we've got all the characters we need we can stop early
if (resPos == 0) {
break;
}
--i;
--j;
--resPos;
} else {
if (matrix[i][j - 1] >= matrix[i - 1][j]) {
--j;
} else {
--i;
}
}
}
}
return common;
}
std::string CStringUtils::wideToNarrow(const std::wstring& wideStr) {
// Annoyingly, the STL character transformations only work on
// character arrays, and not std::string objects
std::string narrowStr(wideStr.length(), '\0');
// Note: this won't always work for non-ASCII data, and it can't
// cope with UTF8 either, so we should replace it with a proper
// string conversion library, e.g. ICU
using TWCharTCType = std::ctype<wchar_t>;
std::use_facet<TWCharTCType>(locale()).narrow(
wideStr.data(), wideStr.data() + wideStr.length(), '?', &narrowStr[0]);
return narrowStr;
}
std::wstring CStringUtils::narrowToWide(const std::string& narrowStr) {
// Annoyingly, the STL character transformations only work on
// character arrays, and not std::string objects
std::wstring wideStr(narrowStr.length(), L'\0');
// Note: this won't always work for non-ASCII data, and it can't
// cope with UTF8 either, so we should replace it with a proper
// string conversion library, e.g. ICU
using TWCharTCType = std::ctype<wchar_t>;
std::use_facet<TWCharTCType>(locale()).widen(
narrowStr.data(), narrowStr.data() + narrowStr.length(), &wideStr[0]);
return wideStr;
}
std::pair<std::string, bool> CStringUtils::readFileToString(const std::string& fileName) {
std::ifstream fileStream{fileName};
if (fileStream.is_open() == false) {
LOG_ERROR(<< "Environment error: failed to open file '" << fileName << "'.");
return {std::string{}, false};
}
return {std::string{std::istreambuf_iterator<char>{fileStream},
std::istreambuf_iterator<char>{}},
true};
}
}
}