hphp/runtime/base/zend-string.cpp (2,009 lines of code) (raw):

/* +----------------------------------------------------------------------+ | HipHop for PHP | +----------------------------------------------------------------------+ | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) | | Copyright (c) 1998-2010 Zend Technologies Ltd. (http://www.zend.com) | +----------------------------------------------------------------------+ | This source file is subject to version 2.00 of the Zend license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | http://www.zend.com/license/2_00.txt. | | If you did not receive a copy of the Zend license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@zend.com so we can mail you a copy immediately. | +----------------------------------------------------------------------+ */ // NOTE: See also "hphp/zend/zend-string.*". #include "hphp/runtime/base/zend-string.h" #include "hphp/runtime/base/zend-printf.h" #include "hphp/util/lock.h" #include "hphp/util/overflow.h" #include "hphp/zend/zend-math.h" #include <algorithm> #include <cmath> #ifndef _MSC_VER #include <monetary.h> #endif #include "hphp/util/bstring.h" #include "hphp/runtime/base/builtin-functions.h" #include "hphp/runtime/base/exceptions.h" #include "hphp/runtime/base/memory-manager.h" #include "hphp/runtime/base/request-info.h" #include "hphp/runtime/base/runtime-error.h" #include "hphp/runtime/base/string-buffer.h" #include "hphp/runtime/base/string-util.h" #include <folly/lang/CString.h> #include <folly/portability/String.h> #define PHP_QPRINT_MAXL 75 namespace HPHP { /////////////////////////////////////////////////////////////////////////////// // helpers void string_charmask(const char *sinput, int len, char *mask) { const unsigned char *input = (unsigned char *)sinput; const unsigned char *end; unsigned char c; memset(mask, 0, 256); for (end = input+len; input < end; input++) { c=*input; if 
((input+3 < end) && input[1] == '.' && input[2] == '.' && input[3] >= c) { memset(mask+c, 1, input[3] - c + 1); input+=3; } else if ((input+1 < end) && input[0] == '.' && input[1] == '.') { /* Error, try to be as helpful as possible: (a range ending/starting with '.' won't be captured here) */ if (end-len >= input) { /* there was no 'left' char */ raise_invalid_argument_warning ("charlist: Invalid '..'-range, missing left of '..'"); continue; } if (input+2 >= end) { /* there is no 'right' char */ raise_invalid_argument_warning ("charlist: Invalid '..'-range, missing right of '..'"); continue; } if (input[-1] > input[2]) { /* wrong order */ raise_invalid_argument_warning ("charlist: '..'-range needs to be incrementing"); continue; } /* FIXME: better error (a..b..c is the only left possibility?) */ raise_invalid_argument_warning("charlist: Invalid '..'-range"); continue; } else { mask[c]=1; } } } /////////////////////////////////////////////////////////////////////////////// void string_to_case(String& s, int (*tocase)(int)) { assertx(!s.isNull()); assertx(tocase); auto data = s.mutableData(); auto len = s.size(); for (int i = 0; i < len; i++) { data[i] = tocase(data[i]); } } /////////////////////////////////////////////////////////////////////////////// #define STR_PAD_LEFT 0 #define STR_PAD_RIGHT 1 #define STR_PAD_BOTH 2 String string_pad(const char *input, int len, int pad_length, const char *pad_string, int pad_str_len, int pad_type) { assertx(input); int num_pad_chars = pad_length - len; /* If resulting string turns out to be shorter than input string, we simply copy the input and return. */ if (pad_length < 0 || num_pad_chars < 0) { return String(input, len, CopyString); } /* Setup the padding string values if specified. */ if (pad_str_len == 0) { SystemLib::throwRuntimeExceptionObject( "Invalid argument: pad_string: (empty)"); } String ret(pad_length, ReserveString); char *result = ret.mutableData(); /* We need to figure out the left/right padding lengths. 
*/ int left_pad, right_pad; switch (pad_type) { case STR_PAD_RIGHT: left_pad = 0; right_pad = num_pad_chars; break; case STR_PAD_LEFT: left_pad = num_pad_chars; right_pad = 0; break; case STR_PAD_BOTH: left_pad = num_pad_chars / 2; right_pad = num_pad_chars - left_pad; break; default: SystemLib::throwRuntimeExceptionObject( folly::sformat("Invalid argument: pad_type: {}", pad_type)); } /* First we pad on the left. */ int result_len = 0; for (int i = 0; i < left_pad; i++) { result[result_len++] = pad_string[i % pad_str_len]; } /* Then we copy the input string. */ memcpy(result + result_len, input, len); result_len += len; /* Finally, we pad on the right. */ for (int i = 0; i < right_pad; i++) { result[result_len++] = pad_string[i % pad_str_len]; } ret.setSize(result_len); return ret; } /////////////////////////////////////////////////////////////////////////////// int string_find(const char *input, int len, char ch, int pos, bool case_sensitive) { assertx(input); if (pos < 0 || pos > len) { return -1; } const void *ptr; if (case_sensitive) { ptr = memchr(input + pos, ch, len - pos); } else { ptr = bstrcasechr(input + pos, ch, len - pos); } if (ptr != nullptr) { return (int)((const char *)ptr - input); } return -1; } int string_rfind(const char *input, int len, char ch, int pos, bool case_sensitive) { assertx(input); if (pos < -len || pos > len) { return -1; } const void *ptr; if (case_sensitive) { if (pos >= 0) { ptr = folly::memrchr(input + pos, ch, len - pos); } else { ptr = folly::memrchr(input, ch, len + pos + 1); } } else { if (pos >= 0) { ptr = bstrrcasechr(input + pos, ch, len - pos); } else { ptr = bstrrcasechr(input, ch, len + pos + 1); } } if (ptr != nullptr) { return (int)((const char *)ptr - input); } return -1; } int string_find(const char *input, int len, const char *s, int s_len, int pos, bool case_sensitive) { assertx(input); assertx(s); if (!s_len || pos < 0 || pos > len) { return -1; } void *ptr; if (case_sensitive) { ptr = 
(void*)string_memnstr(input + pos, s, s_len, input + len); } else { ptr = bstrcasestr(input + pos, len - pos, s, s_len); } if (ptr != nullptr) { return (int)((const char *)ptr - input); } return -1; } int string_rfind(const char *input, int len, const char *s, int s_len, int pos, bool case_sensitive) { assertx(input); assertx(s); if (!s_len || pos < -len || pos > len) { return -1; } void *ptr; if (case_sensitive) { if (pos >= 0) { ptr = bstrrstr(input + pos, len - pos, s, s_len); } else { ptr = bstrrstr(input, len + std::min(pos + s_len, 0), s, s_len); } } else { if (pos >= 0) { ptr = bstrrcasestr(input + pos, len - pos, s, s_len); } else { ptr = bstrrcasestr(input, len + std::min(pos + s_len, 0), s, s_len); } } if (ptr != nullptr) { return (int)((const char *)ptr - input); } return -1; } const char *string_memnstr(const char *haystack, const char *needle, int needle_len, const char *end) { const char *p = haystack; char ne = needle[needle_len-1]; end -= needle_len; while (p <= end) { if ((p = (char *)memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) { if (!memcmp(needle, p, needle_len-1)) { return p; } } if (p == nullptr) { return nullptr; } p++; } return nullptr; } String string_replace(const char *s, int len, int start, int length, const char *replacement, int len_repl) { assertx(s); assertx(replacement); assertx(len >= 0); // if "start" position is negative, count start position from the end // of the string if (start < 0) { start = len + start; if (start < 0) { start = 0; } } if (start > len) { start = len; } // if "length" position is negative, set it to the length // needed to stop that many chars from the end of the string if (length < 0) { length = (len - start) + length; if (length < 0) { length = 0; } } // check if length is too large if (length > len) { length = len; } // check if the length is too large adjusting for non-zero start // Write this way instead of start + length > len to avoid overflow if (length > len - start) { length = len - 
start; } String retString(len + len_repl - length, ReserveString); char *ret = retString.mutableData(); int ret_len = 0; if (start) { memcpy(ret, s, start); ret_len += start; } if (len_repl) { memcpy(ret + ret_len, replacement, len_repl); ret_len += len_repl; } len -= (start + length); if (len) { memcpy(ret + ret_len, s + start + length, len); ret_len += len; } retString.setSize(ret_len); return retString; } String string_replace(const char *input, int len, const char *search, int len_search, const char *replacement, int len_replace, int &count, bool case_sensitive) { assertx(input); assertx(search && len_search); assertx(len >= 0); assertx(len_search >= 0); assertx(len_replace >= 0); if (len == 0) { return String(); } req::vector<int> founds; founds.reserve(16); if (len_search == 1) { for (int pos = string_find(input, len, *search, 0, case_sensitive); pos >= 0; pos = string_find(input, len, *search, pos + len_search, case_sensitive)) { founds.push_back(pos); } } else { for (int pos = string_find(input, len, search, len_search, 0, case_sensitive); pos >= 0; pos = string_find(input, len, search, len_search, pos + len_search, case_sensitive)) { founds.push_back(pos); } } count = founds.size(); if (count == 0) { return String(); // not found } int reserve; // Make sure the new size of the string wouldn't overflow int32_t. Don't // bother if the replacement wouldn't make the string longer. 
if (len_replace > len_search) { auto raise = [&] { raise_error("String too large"); }; if (mul_overflow(len_replace - len_search, count)) { raise(); } int diff = (len_replace - len_search) * count; if (add_overflow(len, diff)) { raise(); } reserve = len + diff; } else { reserve = len + (len_replace - len_search) * count; } String retString(reserve, ReserveString); char *ret = retString.mutableData(); char *p = ret; int pos = 0; // last position in input that hasn't been copied over yet int n; for (unsigned int i = 0; i < founds.size(); i++) { n = founds[i]; if (n > pos) { n -= pos; memcpy(p, input, n); p += n; input += n; pos += n; } if (len_replace) { memcpy(p, replacement, len_replace); p += len_replace; } input += len_search; pos += len_search; } n = len; if (n > pos) { n -= pos; memcpy(p, input, n); p += n; } retString.setSize(p - ret); return retString; } /////////////////////////////////////////////////////////////////////////////// String string_chunk_split(const char *src, int srclen, const char *end, int endlen, int chunklen) { int chunks = srclen / chunklen; // complete chunks! 
int restlen = srclen - chunks * chunklen; /* srclen % chunklen */ String ret( safe_address( chunks + 1, endlen, srclen ), ReserveString ); char *dest = ret.mutableData(); const char *p; char *q; const char *pMax = src + srclen - chunklen + 1; for (p = src, q = dest; p < pMax; ) { memcpy(q, p, chunklen); q += chunklen; memcpy(q, end, endlen); q += endlen; p += chunklen; } if (restlen) { memcpy(q, p, restlen); q += restlen; memcpy(q, end, endlen); q += endlen; } ret.setSize(q - dest); return ret; } /////////////////////////////////////////////////////////////////////////////// #define PHP_TAG_BUF_SIZE 1023 /** * Check if tag is in a set of tags * * states: * * 0 start tag * 1 first non-whitespace char seen */ static int string_tag_find(const char *tag, int len, const char *set) { char c, *n; const char *t; int state=0, done=0; char *norm; if (len <= 0) { return 0; } norm = (char *)req::malloc_noptrs(len+1); SCOPE_EXIT { req::free(norm); }; n = norm; t = tag; c = tolower(*t); /* normalize the tag removing leading and trailing whitespace and turn any <a whatever...> into just <a> and any </tag> into <tag> */ while (!done) { switch (c) { case '<': *(n++) = c; break; case '>': done =1; break; default: if (!isspace((int)c)) { if (state == 0) { state=1; } if (c != '/') { *(n++) = c; } } else { if (state == 1) done=1; } break; } c = tolower(*(++t)); } *(n++) = '>'; *n = '\0'; if (strstr(set, norm)) { done=1; } else { done=0; } return done; } /** * A simple little state-machine to strip out html and php tags * * State 0 is the output state, State 1 means we are inside a * normal html tag and state 2 means we are inside a php tag. * * The state variable is passed in to allow a function like fgetss * to maintain state across calls to the function. * * lc holds the last significant character read and br is a bracket * counter. 
* * When an allow string is passed in we keep track of the string * in state 1 and when the tag is closed check it against the * allow string to see if we should allow it. * swm: Added ability to strip <?xml tags without assuming it PHP * code. */ String string_strip_tags(const char *s, const int len, const char *allow, const int allow_len, bool allow_tag_spaces) { const char *abuf, *p; char *rbuf, *tbuf, *tp, *rp, c, lc; int br, i=0, depth=0, in_q = 0; int state = 0, pos; assertx(s); assertx(allow); String retString(s, len, CopyString); rbuf = retString.mutableData(); String allowString; c = *s; lc = '\0'; p = s; rp = rbuf; br = 0; if (allow_len) { assertx(allow); allowString = String(allow_len, ReserveString); char *atmp = allowString.mutableData(); for (const char *tmp = allow; *tmp; tmp++, atmp++) { *atmp = tolower((int)*(const unsigned char *)tmp); } allowString.setSize(allow_len); abuf = allowString.data(); tbuf = (char *)req::malloc_noptrs(PHP_TAG_BUF_SIZE+1); tp = tbuf; } else { abuf = nullptr; tbuf = tp = nullptr; } auto move = [&pos, &tbuf, &tp]() { if (tp - tbuf >= PHP_TAG_BUF_SIZE) { pos = tp - tbuf; tbuf = (char*)req::realloc_noptrs(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1); tp = tbuf + pos; } }; while (i < len) { switch (c) { case '\0': break; case '<': if (isspace(*(p + 1)) && !allow_tag_spaces) { goto reg_char; } if (state == 0) { lc = '<'; state = 1; if (allow_len) { move(); *(tp++) = '<'; } } else if (state == 1) { depth++; } break; case '(': if (state == 2) { if (lc != '"' && lc != '\'') { lc = '('; br++; } } else if (allow_len && state == 1) { move(); *(tp++) = c; } else if (state == 0) { *(rp++) = c; } break; case ')': if (state == 2) { if (lc != '"' && lc != '\'') { lc = ')'; br--; } } else if (allow_len && state == 1) { move(); *(tp++) = c; } else if (state == 0) { *(rp++) = c; } break; case '>': if (depth) { depth--; break; } if (in_q) { break; } switch (state) { case 1: /* HTML/XML */ lc = '>'; in_q = state = 0; if (allow_len) { move(); 
*(tp++) = '>'; *tp='\0'; if (string_tag_find(tbuf, tp-tbuf, abuf)) { memcpy(rp, tbuf, tp-tbuf); rp += tp-tbuf; } tp = tbuf; } break; case 2: /* PHP */ if (!br && lc != '\"' && *(p-1) == '?') { in_q = state = 0; tp = tbuf; } break; case 3: in_q = state = 0; tp = tbuf; break; case 4: /* JavaScript/CSS/etc... */ if (p >= s + 2 && *(p-1) == '-' && *(p-2) == '-') { in_q = state = 0; tp = tbuf; } break; default: *(rp++) = c; break; } break; case '"': case '\'': if (state == 4) { /* Inside <!-- comment --> */ break; } else if (state == 2 && *(p-1) != '\\') { if (lc == c) { lc = '\0'; } else if (lc != '\\') { lc = c; } } else if (state == 0) { *(rp++) = c; } else if (allow_len && state == 1) { move(); *(tp++) = c; } if (state && p != s && *(p-1) != '\\' && (!in_q || *p == in_q)) { if (in_q) { in_q = 0; } else { in_q = *p; } } break; case '!': /* JavaScript & Other HTML scripting languages */ if (state == 1 && *(p-1) == '<') { state = 3; lc = c; } else { if (state == 0) { *(rp++) = c; } else if (allow_len && state == 1) { move(); *(tp++) = c; } } break; case '-': if (state == 3 && p >= s + 2 && *(p-1) == '-' && *(p-2) == '!') { state = 4; } else { goto reg_char; } break; case '?': if (state == 1 && *(p-1) == '<') { br=0; state=2; break; } case 'E': case 'e': /* !DOCTYPE exception */ if (state==3 && p > s+6 && tolower(*(p-1)) == 'p' && tolower(*(p-2)) == 'y' && tolower(*(p-3)) == 't' && tolower(*(p-4)) == 'c' && tolower(*(p-5)) == 'o' && tolower(*(p-6)) == 'd') { state = 1; break; } /* fall-through */ case 'l': /* swm: If we encounter '<?xml' then we shouldn't be in * state == 2 (PHP). Switch back to HTML. 
*/ if (state == 2 && p > s+2 && *(p-1) == 'm' && *(p-2) == 'x') { state = 1; break; } /* fall-through */ default: reg_char: if (state == 0) { *(rp++) = c; } else if (allow_len && state == 1) { move(); *(tp++) = c; } break; } c = *(++p); i++; } if (rp < rbuf + len) { *rp = '\0'; } if (allow_len) { req::free(tbuf); } retString.setSize(rp - rbuf); return retString; } /////////////////////////////////////////////////////////////////////////////// static char string_hex2int(int c) { if (isdigit(c)) { return c - '0'; } if (c >= 'A' && c <= 'F') { return c - 'A' + 10; } if (c >= 'a' && c <= 'f') { return c - 'a' + 10; } return -1; } String string_quoted_printable_encode(const char *input, int len) { size_t length = len; const unsigned char *str = (unsigned char*)input; unsigned long lp = 0; unsigned char c; char *d, *buffer; char *hex = "0123456789ABCDEF"; String ret( safe_address( 3, length + ((safe_address(3, length, 0)/(PHP_QPRINT_MAXL-9)) + 1), 1), ReserveString ); d = buffer = ret.mutableData(); while (length--) { if (((c = *str++) == '\015') && (*str == '\012') && length > 0) { *d++ = '\015'; *d++ = *str++; length--; lp = 0; } else { if (iscntrl (c) || (c == 0x7f) || (c & 0x80) || (c == '=') || ((c == ' ') && (*str == '\015'))) { if ((((lp+= 3) > PHP_QPRINT_MAXL) && (c <= 0x7f)) || ((c > 0x7f) && (c <= 0xdf) && ((lp + 3) > PHP_QPRINT_MAXL)) || ((c > 0xdf) && (c <= 0xef) && ((lp + 6) > PHP_QPRINT_MAXL)) || ((c > 0xef) && (c <= 0xf4) && ((lp + 9) > PHP_QPRINT_MAXL))) { *d++ = '='; *d++ = '\015'; *d++ = '\012'; lp = 3; } *d++ = '='; *d++ = hex[c >> 4]; *d++ = hex[c & 0xf]; } else { if ((++lp) > PHP_QPRINT_MAXL) { *d++ = '='; *d++ = '\015'; *d++ = '\012'; lp = 1; } *d++ = c; } } } len = d - buffer; ret.setSize(len); return ret; } String string_quoted_printable_decode(const char *input, int len, bool is_q) { assertx(input); if (len == 0) { return String(); } int i = 0, j = 0, k; const char *str_in = input; String ret(len, ReserveString); char *str_out = 
ret.mutableData(); while (i < len && str_in[i]) { switch (str_in[i]) { case '=': if (i + 2 < len && str_in[i + 1] && str_in[i + 2] && isxdigit((int) str_in[i + 1]) && isxdigit((int) str_in[i + 2])) { str_out[j++] = (string_hex2int((int) str_in[i + 1]) << 4) + string_hex2int((int) str_in[i + 2]); i += 3; } else /* check for soft line break according to RFC 2045*/ { k = 1; while (str_in[i + k] && ((str_in[i + k] == 32) || (str_in[i + k] == 9))) { /* Possibly, skip spaces/tabs at the end of line */ k++; } if (!str_in[i + k]) { /* End of line reached */ i += k; } else if ((str_in[i + k] == 13) && (str_in[i + k + 1] == 10)) { /* CRLF */ i += k + 2; } else if ((str_in[i + k] == 13) || (str_in[i + k] == 10)) { /* CR or LF */ i += k + 1; } else { str_out[j++] = str_in[i++]; } } break; case '_': if (is_q) { str_out[j++] = ' '; i++; } else { str_out[j++] = str_in[i++]; } break; default: str_out[j++] = str_in[i++]; } } ret.setSize(j); return ret; } Variant string_base_to_numeric(const char *s, int len, int base) { int64_t num = 0; double fnum = 0; int mode = 0; int64_t cutoff; int cutlim; assertx(string_validate_base(base)); cutoff = LONG_MAX / base; cutlim = LONG_MAX % base; for (int i = len; i > 0; i--) { char c = *s++; /* might not work for EBCDIC */ if (c >= '0' && c <= '9') c -= '0'; else if (c >= 'A' && c <= 'Z') c -= 'A' - 10; else if (c >= 'a' && c <= 'z') c -= 'a' - 10; else continue; if (c >= base) continue; switch (mode) { case 0: /* Integer */ if (num < cutoff || (num == cutoff && c <= cutlim)) { num = num * base + c; break; } else { fnum = num; mode = 1; } /* fall-through */ case 1: /* Float */ fnum = fnum * base + c; } } if (mode == 1) { return fnum; } return num; } String string_long_to_base(unsigned long value, int base) { static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; char buf[(sizeof(unsigned long) << 3) + 1]; char *ptr, *end; assertx(string_validate_base(base)); end = ptr = buf + sizeof(buf) - 1; do { *--ptr = digits[value % base]; value /= 
base; } while (ptr > buf && value); return String(ptr, end - ptr, CopyString); } String string_numeric_to_base(const Variant& value, int base) { static char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; assertx(string_validate_base(base)); if ((!value.isInteger() && !value.isDouble())) { return empty_string(); } if (value.isDouble()) { double fvalue = floor(value.toDouble()); /* floor it just in case */ char *ptr, *end; char buf[(sizeof(double) << 3) + 1]; /* Don't try to convert +/- infinity */ if (fvalue == HUGE_VAL || fvalue == -HUGE_VAL) { raise_warning("Number too large"); return empty_string(); } end = ptr = buf + sizeof(buf) - 1; do { *--ptr = digits[(int) fmod(fvalue, base)]; fvalue /= base; } while (ptr > buf && fabs(fvalue) >= 1); return String(ptr, end - ptr, CopyString); } return string_long_to_base(value.toInt64(), base); } /////////////////////////////////////////////////////////////////////////////// // uuencode #define PHP_UU_ENC(c) \ ((c) ? ((c) & 077) + ' ' : '`') #define PHP_UU_ENC_C2(c) \ PHP_UU_ENC(((*(c) * 16) & 060) | ((*((c) + 1) >> 4) & 017)) #define PHP_UU_ENC_C3(c) \ PHP_UU_ENC(((*(c + 1) * 4) & 074) | ((*((c) + 2) >> 6) & 03)) #define PHP_UU_DEC(c) \ (((c) - ' ') & 077) String string_uuencode(const char *src, int src_len) { assertx(src); assertx(src_len); int len = 45; char *p; const char *s, *e, *ee; char *dest; /* encoded length is ~ 38% greater than the original */ String ret((int)ceil(src_len * 1.38) + 45, ReserveString); p = dest = ret.mutableData(); s = src; e = src + src_len; while ((s + 3) < e) { ee = s + len; if (ee > e) { ee = e; len = ee - s; if (len % 3) { ee = s + (int) (floor(len / 3) * 3); } } *p++ = PHP_UU_ENC(len); while (s < ee) { *p++ = PHP_UU_ENC(*s >> 2); *p++ = PHP_UU_ENC_C2(s); *p++ = PHP_UU_ENC_C3(s); *p++ = PHP_UU_ENC(*(s + 2) & 077); s += 3; } if (len == 45) { *p++ = '\n'; } } if (s < e) { if (len == 45) { *p++ = PHP_UU_ENC(e - s); len = 0; } *p++ = PHP_UU_ENC(*s >> 2); *p++ = PHP_UU_ENC_C2(s); *p++ = ((e - 
s) > 1) ? PHP_UU_ENC_C3(s) : PHP_UU_ENC('\0'); *p++ = ((e - s) > 2) ? PHP_UU_ENC(*(s + 2) & 077) : PHP_UU_ENC('\0'); } if (len < 45) { *p++ = '\n'; } *p++ = PHP_UU_ENC('\0'); *p++ = '\n'; *p = '\0'; ret.setSize(p - dest); return ret; } String string_uudecode(const char *src, int src_len) { int total_len = 0; int len; const char *s, *e, *ee; char *p, *dest; String ret(ceil(src_len * 0.75), ReserveString); p = dest = ret.mutableData(); s = src; e = src + src_len; while (s < e) { if ((len = PHP_UU_DEC(*s++)) <= 0) { break; } /* sanity check */ if (len > src_len) { goto err; } total_len += len; ee = s + (len == 45 ? 60 : (int) floor(len * 1.33)); /* sanity check */ if (ee > e) { goto err; } while (s < ee) { if (s + 4 > e) goto err; *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4; *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2; *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3)); s += 4; } if (len < 45) { break; } /* skip \n */ s++; } if ((len = total_len > (p - dest))) { *p++ = PHP_UU_DEC(*s) << 2 | PHP_UU_DEC(*(s + 1)) >> 4; if (len > 1) { *p++ = PHP_UU_DEC(*(s + 1)) << 4 | PHP_UU_DEC(*(s + 2)) >> 2; if (len > 2) { *p++ = PHP_UU_DEC(*(s + 2)) << 6 | PHP_UU_DEC(*(s + 3)); } } } ret.setSize(total_len); return ret; err: return String(); } /////////////////////////////////////////////////////////////////////////////// // base64 namespace { const char base64_table[] = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/', '\0' }; const char base64_pad = '='; const short base64_reverse_table[256] = { -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, -2, -2, -2, 
63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2, -2, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -2, -2, -2, -2, -2, -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2 }; Optional<int> maxEncodedSize(int length) { if ((length + 2) < 0 || ((length + 2) / 3) >= (1 << (sizeof(int) * 8 - 2))) { return std::nullopt; } return ((length + 2) / 3) * 4; } // outstr must be at least maxEncodedSize(length) bytes size_t php_base64_encode(const unsigned char *str, int length, unsigned char* outstr) { const unsigned char *current = str; unsigned char *p = outstr; while (length > 2) { /* keep going until we have less than 24 bits */ *p++ = base64_table[current[0] >> 2]; *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)]; *p++ = base64_table[((current[1] & 0x0f) << 2) + (current[2] >> 6)]; *p++ = base64_table[current[2] & 0x3f]; current += 3; length -= 3; /* we just handle 3 octets of data */ } /* now deal with the tail end of things */ if (length != 0) { *p++ = base64_table[current[0] >> 2]; if (length > 1) { *p++ = base64_table[((current[0] & 0x03) << 4) + (current[1] >> 4)]; *p++ = base64_table[(current[1] & 0x0f) << 2]; *p++ = base64_pad; } else { *p++ = base64_table[(current[0] & 0x03) << 4]; *p++ = base64_pad; *p++ = base64_pad; } } return p - outstr; } // outstr must be at least length bytes ssize_t 
php_base64_decode(const char *str, int length, bool strict, unsigned char* outstr) { const unsigned char *current = (unsigned char*)str; int ch, i = 0, j = 0, k; /* this sucks for threaded environments */ unsigned char* result = outstr; /* run through the whole string, converting as we go */ while ((ch = *current++) != '\0' && length-- > 0) { if (ch == base64_pad) { if (*current != '=' && ((i % 4) == 1 || (strict && length > 0))) { if ((i % 4) != 1) { while (isspace(*(++current))) { continue; } if (*current == '\0') { continue; } } return -1; } continue; } ch = base64_reverse_table[ch]; if ((!strict && ch < 0) || ch == -1) { /* a space or some other separator character, we simply skip over */ continue; } else if (ch == -2) { return -1; } switch(i % 4) { case 0: result[j] = ch << 2; break; case 1: result[j++] |= ch >> 4; result[j] = (ch & 0x0f) << 4; break; case 2: result[j++] |= ch >>2; result[j] = (ch & 0x03) << 6; break; case 3: result[j++] |= ch; break; } i++; } k = j; /* mop things up if we ended on a boundary */ if (ch == base64_pad) { switch(i % 4) { case 1: return -1; case 2: k++; case 3: result[k] = 0; } } return j; } } String string_base64_encode(const char* input, int len) { if (auto const wantedSize = maxEncodedSize(len)) { String ret(*wantedSize, ReserveString); auto actualSize = php_base64_encode((unsigned char*)input, len, (unsigned char*)ret.mutableData()); ret.setSize(actualSize); return ret; } return String(); } String string_base64_decode(const char* input, int len, bool strict) { String ret(len, ReserveString); auto actualSize = php_base64_decode(input, len, strict, (unsigned char*)ret.mutableData()); if (actualSize < 0) return String(); ret.setSize(actualSize); return ret; } std::string base64_encode(const char* input, int len) { if (auto const wantedSize = maxEncodedSize(len)) { std::string ret; ret.resize(*wantedSize); auto actualSize = php_base64_encode((unsigned char*)input, len, (unsigned char*)ret.data()); ret.resize(actualSize); return 
ret; } return std::string(); } std::string base64_decode(const char* input, int len, bool strict) { if (!len) return std::string(); std::string ret; ret.resize(len); auto actualSize = php_base64_decode(input, len, strict, (unsigned char*)ret.data()); if (!actualSize) return std::string(); ret.resize(actualSize); return ret; } /////////////////////////////////////////////////////////////////////////////// String string_escape_shell_arg(const char *str) { int x, y, l; char *cmd; y = 0; l = strlen(str); String ret(safe_address(l, 4, 3), ReserveString); /* worst case */ cmd = ret.mutableData(); #ifdef _MSC_VER cmd[y++] = '"'; #else cmd[y++] = '\''; #endif for (x = 0; x < l; x++) { switch (str[x]) { #ifdef _MSC_VER case '"': case '%': case '!': cmd[y++] = ' '; break; #else case '\'': cmd[y++] = '\''; cmd[y++] = '\\'; cmd[y++] = '\''; #endif /* fall-through */ default: cmd[y++] = str[x]; } } #ifdef _MSC_VER if (y > 0 && '\\' == cmd[y - 1]) { int k = 0, n = y - 1; for (; n >= 0 && '\\' == cmd[n]; n--, k++); if (k % 2) { cmd[y++] = '\\'; } } cmd[y++] = '"'; #else cmd[y++] = '\''; #endif ret.setSize(y); return ret; } String string_escape_shell_cmd(const char *str) { register int x, y, l; char *cmd; char *p = nullptr; l = strlen(str); String ret(safe_address(l, 2, 1), ReserveString); cmd = ret.mutableData(); for (x = 0, y = 0; x < l; x++) { switch (str[x]) { #ifndef _MSC_VER case '"': case '\'': if (!p && (p = (char *)memchr(str + x + 1, str[x], l - x - 1))) { /* noop */ } else if (p && *p == str[x]) { p = nullptr; } else { cmd[y++] = '\\'; } cmd[y++] = str[x]; break; #else /* % is Windows specific for environmental variables, ^%PATH% will output PATH while ^%PATH^% will not. escapeshellcmd->val will escape all % and !. 
*/ case '%': case '!': case '"': case '\'': #endif case '#': /* This is character-set independent */ case '&': case ';': case '`': case '|': case '*': case '?': case '~': case '<': case '>': case '^': case '(': case ')': case '[': case ']': case '{': case '}': case '$': case '\\': case '\x0A': /* excluding these two */ case '\xFF': #ifdef _MSC_VER cmd[y++] = '^'; #else cmd[y++] = '\\'; #endif /* fall-through */ default: cmd[y++] = str[x]; } } ret.setSize(y); return ret; } /////////////////////////////////////////////////////////////////////////////// static void string_similar_str(const char *txt1, int len1, const char *txt2, int len2, int *pos1, int *pos2, int *max) { const char *p, *q; const char *end1 = txt1 + len1; const char *end2 = txt2 + len2; int l; *max = 0; for (p = txt1; p < end1; p++) { for (q = txt2; q < end2; q++) { for (l = 0; (p + l < end1) && (q + l < end2) && (p[l] == q[l]); l++); if (l > *max) { *max = l; *pos1 = p - txt1; *pos2 = q - txt2; } } } } static int string_similar_char(const char *txt1, int len1, const char *txt2, int len2) { int sum; int pos1 = 0, pos2 = 0, max; string_similar_str(txt1, len1, txt2, len2, &pos1, &pos2, &max); if ((sum = max)) { if (pos1 && pos2) { sum += string_similar_char(txt1, pos1, txt2, pos2); } if ((pos1 + max < len1) && (pos2 + max < len2)) { sum += string_similar_char(txt1 + pos1 + max, len1 - pos1 - max, txt2 + pos2 + max, len2 - pos2 - max); } } return sum; } int string_similar_text(const char *t1, int len1, const char *t2, int len2, double *percent) { if (len1 == 0 && len2 == 0) { if (percent) *percent = 0.0; return 0; } int sim = string_similar_char(t1, len1, t2, len2); if (percent) *percent = sim * 200.0 / (len1 + len2); return sim; } /////////////////////////////////////////////////////////////////////////////// #define LEVENSHTEIN_MAX_LENTH 255 // reference implementation, only optimized for memory usage, not speed int string_levenshtein(const char *s1, int l1, const char *s2, int l2, int cost_ins, int 
cost_rep, int cost_del ) { int *p1, *p2, *tmp; int i1, i2, c0, c1, c2; if (l1==0) return l2*cost_ins; if (l2==0) return l1*cost_del; if ((l1>LEVENSHTEIN_MAX_LENTH)||(l2>LEVENSHTEIN_MAX_LENTH)) { raise_warning("levenshtein(): Argument string(s) too long"); return -1; } p1 = (int*)req::malloc_noptrs((l2+1) * sizeof(int)); SCOPE_EXIT { req::free(p1); }; p2 = (int*)req::malloc_noptrs((l2+1) * sizeof(int)); SCOPE_EXIT { req::free(p2); }; for(i2=0;i2<=l2;i2++) { p1[i2] = i2*cost_ins; } for(i1=0;i1<l1;i1++) { p2[0]=p1[0]+cost_del; for(i2=0;i2<l2;i2++) { c0=p1[i2]+((s1[i1]==s2[i2])?0:cost_rep); c1=p1[i2+1]+cost_del; if (c1<c0) c0=c1; c2=p2[i2]+cost_ins; if (c2<c0) c0=c2; p2[i2+1]=c0; } tmp=p1; p1=p2; p2=tmp; } c0=p1[l2]; return c0; } /////////////////////////////////////////////////////////////////////////////// String string_money_format(const char *format, double value) { bool check = false; const char *p = format; while ((p = strchr(p, '%'))) { if (*(p + 1) == '%') { p += 2; } else if (!check) { check = true; p++; } else { raise_invalid_argument_warning ("format: Only a single %%i or %%n token can be used"); return String(); } } int format_len = strlen(format); int str_len = safe_address(format_len, 1, 1024); String ret(str_len, ReserveString); char *str = ret.mutableData(); if ((str_len = strfmon(str, str_len, format, value)) < 0) { return String(); } ret.setSize(str_len); return ret; } /////////////////////////////////////////////////////////////////////////////// String string_number_format(double d, int dec, const String& dec_point, const String& thousand_sep) { char *tmpbuf = nullptr, *resbuf; char *s, *t; /* source, target */ char *dp; int integral; int tmplen, reslen=0; int count=0; int is_negative=0; if (d < 0) { is_negative = 1; d = -d; } if (dec < 0) dec = 0; d = php_math_round(d, dec); if (dec >= StringData::MaxSize) raiseStringLengthExceededError(dec); // snprintf can allocate a large amount memory if you specify a // large value for dec. 
Also, the generated string can be large as // well. Check against the memory limit before allocating. auto const checkAlloc = [&] (size_t extra) { // Empirically, snprintf memory usage seems to be slightly more // than 4 bytes per decimal point. Be conservative and assume 8. auto const size = 8 * (size_t)dec + extra; if (size <= kMaxSmallSize) return; if (tl_heap->preAllocOOM(size)) check_non_safepoint_surprise(); }; // departure from PHP: we got rid of dependencies on spprintf() here. // This actually means 63 bytes for characters + 1 byte for '\0' String tmpstr(63, ReserveString); tmpbuf = tmpstr.mutableData(); checkAlloc(0); tmplen = snprintf(tmpbuf, 64, "%.*F", dec, d); // From the man page of snprintf, the return value is: // The number of characters that would have been written if n had been // sufficiently large, not counting the terminating null character. if (tmplen < 0) return empty_string(); if (tmplen < 64 && (tmpbuf == nullptr || !isdigit((int)tmpbuf[0]))) { tmpstr.setSize(tmplen); return tmpstr; } if (tmplen >= 64) { // Uncommon, asked for more than 64 chars worth of precision checkAlloc(tmplen + kStringOverhead); tmpstr = String(tmplen, ReserveString); tmpbuf = tmpstr.mutableData(); tmplen = snprintf(tmpbuf, tmplen + 1, "%.*F", dec, d); if (tmplen < 0) return empty_string(); if (tmpbuf == nullptr || !isdigit((int)tmpbuf[0])) { tmpstr.setSize(tmplen); return tmpstr; } } /* find decimal point, if expected */ if (dec) { dp = strpbrk(tmpbuf, ".,"); } else { dp = nullptr; } /* calculate the length of the return buffer */ if (dp) { integral = dp - tmpbuf; } else { /* no decimal point was found */ integral = tmplen; } /* allow for thousand separators */ if (!thousand_sep.empty()) { if (integral + thousand_sep.size() * ((integral-1) / 3) < integral) { /* overflow */ raise_error("String overflow"); } integral += ((integral-1) / 3) * thousand_sep.size(); } reslen = integral; if (dec) { reslen += dec; if (!dec_point.empty()) { if (reslen + dec_point.size() < 
dec_point.size()) { /* overflow */ raise_error("String overflow"); } reslen += dec_point.size(); } } /* add a byte for minus sign */ if (is_negative) { reslen++; } String resstr(reslen, ReserveString); resbuf = resstr.mutableData(); s = tmpbuf+tmplen-1; t = resbuf+reslen-1; /* copy the decimal places. * Take care, as the sprintf implementation may return less places than * we requested due to internal buffer limitations */ if (dec) { int declen = dp ? s - dp : 0; int topad = dec > declen ? dec - declen : 0; /* pad with '0's */ while (topad--) { *t-- = '0'; } if (dp) { s -= declen + 1; /* +1 to skip the point */ t -= declen; /* now copy the chars after the point */ memcpy(t + 1, dp + 1, declen); } /* add decimal point */ if (!dec_point.empty()) { memcpy(t + (1 - dec_point.size()), dec_point.data(), dec_point.size()); t -= dec_point.size(); } } /* copy the numbers before the decimal point, adding thousand * separator every three digits */ while(s >= tmpbuf) { *t-- = *s--; if (thousand_sep && (++count%3)==0 && s>=tmpbuf) { memcpy(t + (1 - thousand_sep.size()), thousand_sep.data(), thousand_sep.size()); t -= thousand_sep.size(); } } /* and a minus sign, if needed */ if (is_negative) { *t-- = '-'; } resstr.setSize(reslen); return resstr; } /////////////////////////////////////////////////////////////////////////////// // soundex /* Simple soundex algorithm as described by Knuth in TAOCP, vol 3 */ String string_soundex(const String& str) { assertx(!str.empty()); int _small, code, last; String retString(4, ReserveString); char* soundex = retString.mutableData(); static char soundex_table[26] = { 0, /* A */ '1', /* B */ '2', /* C */ '3', /* D */ 0, /* E */ '1', /* F */ '2', /* G */ 0, /* H */ 0, /* I */ '2', /* J */ '2', /* K */ '4', /* L */ '5', /* M */ '5', /* N */ 0, /* O */ '1', /* P */ '2', /* Q */ '6', /* R */ '2', /* S */ '3', /* T */ 0, /* U */ '1', /* V */ 0, /* W */ '2', /* X */ 0, /* Y */ '2' /* Z */ }; /* build soundex string */ last = -1; auto p = 
str.slice().data(); for (_small = 0; *p && _small < 4; p++) { /* convert chars to upper case and strip non-letter chars */ /* BUG: should also map here accented letters used in non */ /* English words or names (also found in English text!): */ /* esstsett, thorn, n-tilde, c-cedilla, s-caron, ... */ code = toupper((int)(unsigned char)(*p)); if (code >= 'A' && code <= 'Z') { if (_small == 0) { /* remember first valid char */ soundex[_small++] = code; last = soundex_table[code - 'A']; } else { /* ignore sequences of consonants with same soundex */ /* code in trail, and vowels unless they separate */ /* consonant letters */ code = soundex_table[code - 'A']; if (code != last) { if (code != 0) { soundex[_small++] = code; } last = code; } } } } /* pad with '0' and terminate with 0 ;-) */ while (_small < 4) { soundex[_small++] = '0'; } retString.setSize(4); return retString; } /////////////////////////////////////////////////////////////////////////////// // metaphone /** * this is now the original code by Michael G Schwern: * i've changed it just a slightly bit (use emalloc, * get rid of includes etc) * - thies - 13.09.1999 */ /*----------------------------- */ /* this used to be "metaphone.h" */ /*----------------------------- */ /* Special encodings */ #define SH 'X' #define TH '0' /*----------------------------- */ /* end of "metaphone.h" */ /*----------------------------- */ /*----------------------------- */ /* this used to be "metachar.h" */ /*----------------------------- */ /* Metachar.h ... little bits about characters for metaphone */ /*-- Character encoding array & accessing macros --*/ /* Stolen directly out of the book... */ char _codes[26] = { 1,16,4,16,9,2,4,16,9,2,0,2,2,2,1,4,0,2,4,4,1,0,0,0,8,0}; #define ENCODE(c) (isalpha(c) ? 
_codes[((toupper(c)) - 'A')] : 0) #define isvowel(c) (ENCODE(c) & 1) /* AEIOU */ /* These letters are passed through unchanged */ #define NOCHANGE(c) (ENCODE(c) & 2) /* FJMNR */ /* These form dipthongs when preceding H */ #define AFFECTH(c) (ENCODE(c) & 4) /* CGPST */ /* These make C and G soft */ #define MAKESOFT(c) (ENCODE(c) & 8) /* EIY */ /* These prevent GH from becoming F */ #define NOGHTOF(c) (ENCODE(c) & 16) /* BDH */ /*----------------------------- */ /* end of "metachar.h" */ /*----------------------------- */ /* I suppose I could have been using a character pointer instead of * accesssing the array directly... */ /* Look at the next letter in the word */ #define Next_Letter ((char)toupper(word[w_idx+1])) /* Look at the current letter in the word */ #define Curr_Letter ((char)toupper(word[w_idx])) /* Go N letters back. */ #define Look_Back_Letter(n) (w_idx >= n ? (char)toupper(word[w_idx-n]) : '\0') /* Previous letter. I dunno, should this return null on failure? */ #define Prev_Letter (Look_Back_Letter(1)) /* Look two letters down. It makes sure you don't walk off the string. */ #define After_Next_Letter (Next_Letter != '\0' ? (char)toupper(word[w_idx+2]) \ : '\0') #define Look_Ahead_Letter(n) ((char)toupper(Lookahead(word+w_idx, n))) /* Allows us to safely look ahead an arbitrary # of letters */ /* I probably could have just used strlen... */ static char Lookahead(unsigned char *word, int how_far) { char letter_ahead = '\0'; /* null by default */ int idx; for (idx = 0; word[idx] != '\0' && idx < how_far; idx++); /* Edge forward in the string... */ letter_ahead = (char)word[idx]; /* idx will be either == to how_far or * at the end of the string */ return letter_ahead; } /* phonize one letter * We don't know the buffers size in advance. On way to solve this is to just * re-allocate the buffer size. We're using an extra of 2 characters (this * could be one though; or more too). */ #define Phonize(c) { buffer.append(c); } /* How long is the phoned word? 
*/ #define Phone_Len (buffer.size()) /* Note is a letter is a 'break' in the word */ #define Isbreak(c) (!isalpha(c)) String string_metaphone(const char *input, int word_len, long max_phonemes, int traditional) { unsigned char *word = (unsigned char *)input; int w_idx = 0; /* point in the phonization we're at. */ int max_buffer_len = 0; /* maximum length of the destination buffer */ /*-- Parameter checks --*/ /* Negative phoneme length is meaningless */ if (max_phonemes < 0) return String(); /* Empty/null string is meaningless */ /* Overly paranoid */ /* always_assert(word != NULL && word[0] != '\0'); */ if (word == nullptr) return String(); /*-- Allocate memory for our phoned_phrase --*/ if (max_phonemes == 0) { /* Assume largest possible */ max_buffer_len = word_len; } else { max_buffer_len = max_phonemes; } StringBuffer buffer(max_buffer_len); /*-- The first phoneme has to be processed specially. --*/ /* Find our first letter */ for (; !isalpha(Curr_Letter); w_idx++) { /* On the off chance we were given nothing but crap... 
*/ if (Curr_Letter == '\0') { return buffer.detach(); /* For testing */ } } switch (Curr_Letter) { /* AE becomes E */ case 'A': if (Next_Letter == 'E') { Phonize('E'); w_idx += 2; } /* Remember, preserve vowels at the beginning */ else { Phonize('A'); w_idx++; } break; /* [GKP]N becomes N */ case 'G': case 'K': case 'P': if (Next_Letter == 'N') { Phonize('N'); w_idx += 2; } break; /* WH becomes H, WR becomes R W if followed by a vowel */ case 'W': if (Next_Letter == 'H' || Next_Letter == 'R') { Phonize(Next_Letter); w_idx += 2; } else if (isvowel(Next_Letter)) { Phonize('W'); w_idx += 2; } /* else ignore */ break; /* X becomes S */ case 'X': Phonize('S'); w_idx++; break; /* Vowels are kept */ /* We did A already case 'A': case 'a': */ case 'E': case 'I': case 'O': case 'U': Phonize(Curr_Letter); w_idx++; break; default: /* do nothing */ break; } /* On to the metaphoning */ for (; Curr_Letter != '\0' && (max_phonemes == 0 || Phone_Len < max_phonemes); w_idx++) { /* How many letters to skip because an eariler encoding handled * multiple letters */ unsigned short int skip_letter = 0; /* THOUGHT: It would be nice if, rather than having things like... * well, SCI. For SCI you encode the S, then have to remember * to skip the C. So the phonome SCI invades both S and C. It would * be better, IMHO, to skip the C from the S part of the encoding. * Hell, I'm trying it. */ /* Ignore non-alphas */ if (!isalpha(Curr_Letter)) continue; /* Drop duplicates, except CC */ if (Curr_Letter == Prev_Letter && Curr_Letter != 'C') continue; switch (Curr_Letter) { /* B -> B unless in MB */ case 'B': if (Prev_Letter != 'M') Phonize('B'); break; /* 'sh' if -CIA- or -CH, but not SCH, except SCHW. 
* (SCHW is handled in S) * S if -CI-, -CE- or -CY- * dropped if -SCI-, SCE-, -SCY- (handed in S) * else K */ case 'C': if (MAKESOFT(Next_Letter)) { /* C[IEY] */ if (After_Next_Letter == 'A' && Next_Letter == 'I') { /* CIA */ Phonize(SH); } /* SC[IEY] */ else if (Prev_Letter == 'S') { /* Dropped */ } else { Phonize('S'); } } else if (Next_Letter == 'H') { if ((!traditional) && (After_Next_Letter == 'R' || Prev_Letter == 'S')) { /* Christ, School */ Phonize('K'); } else { Phonize(SH); } skip_letter++; } else { Phonize('K'); } break; /* J if in -DGE-, -DGI- or -DGY- * else T */ case 'D': if (Next_Letter == 'G' && MAKESOFT(After_Next_Letter)) { Phonize('J'); skip_letter++; } else Phonize('T'); break; /* F if in -GH and not B--GH, D--GH, -H--GH, -H---GH * else dropped if -GNED, -GN, * else dropped if -DGE-, -DGI- or -DGY- (handled in D) * else J if in -GE-, -GI, -GY and not GG * else K */ case 'G': if (Next_Letter == 'H') { if (!(NOGHTOF(Look_Back_Letter(3)) || Look_Back_Letter(4) == 'H')) { Phonize('F'); skip_letter++; } else { /* silent */ } } else if (Next_Letter == 'N') { if (Isbreak(After_Next_Letter) || (After_Next_Letter == 'E' && Look_Ahead_Letter(3) == 'D')) { /* dropped */ } else Phonize('K'); } else if (MAKESOFT(Next_Letter) && Prev_Letter != 'G') { Phonize('J'); } else { Phonize('K'); } break; /* H if before a vowel and not after C,G,P,S,T */ case 'H': if (isvowel(Next_Letter) && !AFFECTH(Prev_Letter)) Phonize('H'); break; /* dropped if after C * else K */ case 'K': if (Prev_Letter != 'C') Phonize('K'); break; /* F if before H * else P */ case 'P': if (Next_Letter == 'H') { Phonize('F'); } else { Phonize('P'); } break; /* K */ case 'Q': Phonize('K'); break; /* 'sh' in -SH-, -SIO- or -SIA- or -SCHW- * else S */ case 'S': if (Next_Letter == 'I' && (After_Next_Letter == 'O' || After_Next_Letter == 'A')) { Phonize(SH); } else if (Next_Letter == 'H') { Phonize(SH); skip_letter++; } else if ((!traditional) && (Next_Letter == 'C' && Look_Ahead_Letter(2) == 'H' && 
Look_Ahead_Letter(3) == 'W')) { Phonize(SH); skip_letter += 2; } else { Phonize('S'); } break; /* 'sh' in -TIA- or -TIO- * else 'th' before H * else T */ case 'T': if (Next_Letter == 'I' && (After_Next_Letter == 'O' || After_Next_Letter == 'A')) { Phonize(SH); } else if (Next_Letter == 'H') { Phonize(TH); skip_letter++; } else { Phonize('T'); } break; /* F */ case 'V': Phonize('F'); break; /* W before a vowel, else dropped */ case 'W': if (isvowel(Next_Letter)) Phonize('W'); break; /* KS */ case 'X': Phonize('K'); Phonize('S'); break; /* Y if followed by a vowel */ case 'Y': if (isvowel(Next_Letter)) Phonize('Y'); break; /* S */ case 'Z': Phonize('S'); break; /* No transformation */ case 'F': case 'J': case 'L': case 'M': case 'N': case 'R': Phonize(Curr_Letter); break; default: /* nothing */ break; } /* END SWITCH */ w_idx += skip_letter; } /* END FOR */ return buffer.detach(); } /////////////////////////////////////////////////////////////////////////////// // Cyrillic /** * This is codetables for different Cyrillic charsets (relative to koi8-r). * Each table contains data for 128-255 symbols from ASCII table. * First 256 symbols are for conversion from koi8-r to corresponding charset, * second 256 symbols are for reverse conversion, from charset to koi8-r. 
*
 * Here we have the following tables:
 * _cyr_win1251   - for windows-1251 charset
 * _cyr_iso88595  - for iso8859-5 charset
 * _cyr_cp866     - for x-cp866 charset
 * _cyr_mac       - for x-mac-cyrillic charset
 *
 * Layout of every table, as used by string_convert_cyrillic_string:
 *   entries   0..255 : byte in the named charset -> koi8-r byte
 *                      (read as from_table[c])
 *   entries 256..511 : koi8-r byte -> byte in the named charset
 *                      (read as to_table[c + 256])
 */
typedef unsigned char _cyr_charset_table[512];

/* windows-1251 <-> koi8-r */
static const _cyr_charset_table _cyr_win1251 = {
  /* windows-1251 -> koi8-r */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,
  154,174,190,46,159,189,46,46,179,191,180,157,46,46,156,183,
  46,46,182,166,173,46,46,158,163,152,164,155,46,46,46,167,
  225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  /* koi8-r -> windows-1251 */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,184,186,32,179,191,32,32,32,32,32,180,162,32,
  32,32,32,168,170,32,178,175,32,32,32,32,32,165,161,169,
  254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
  239,255,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
  222,192,193,214,196,197,212,195,213,200,201,202,203,204,205,206,
  207,223,208,209,210,211,198,194,220,219,199,216,221,217,215,218,
};

/* x-cp866 <-> koi8-r */
static const _cyr_charset_table _cyr_cp866 = {
  /* x-cp866 -> koi8-r */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  35,35,35,124,124,124,124,43,43,124,124,43,43,43,43,43,
  43,45,45,124,45,43,124,124,43,43,45,45,124,45,43,45,
  45,45,45,43,43,43,43,43,43,43,43,35,35,124,124,35,
  210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  179,163,180,164,183,167,190,174,32,149,158,32,152,159,148,154,
  /* koi8-r -> x-cp866 */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  205,186,213,241,243,201,32,245,187,212,211,200,190,32,247,198,
  199,204,181,240,242,185,32,244,203,207,208,202,216,32,246,32,
  238,160,161,230,164,165,228,163,229,168,169,170,171,172,173,174,
  175,239,224,225,226,227,166,162,236,235,167,232,237,233,231,234,
  158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
  143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
};

/* iso8859-5 <-> koi8-r */
static const _cyr_charset_table _cyr_iso88595 = {
  /* iso8859-5 -> koi8-r */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,179,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,209,
  32,163,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  /* koi8-r -> iso8859-5 */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,241,32,32,32,32,32,32,32,32,32,32,32,32,
  32,32,32,161,32,32,32,32,32,32,32,32,32,32,32,32,
  238,208,209,230,212,213,228,211,229,216,217,218,219,220,221,222,
  223,239,224,225,226,227,214,210,236,235,215,232,237,233,231,234,
  206,176,177,198,180,181,196,179,197,184,185,186,187,188,189,190,
  191,207,192,193,194,195,182,178,204,203,183,200,205,201,199,202,
};

/* x-mac-cyrillic <-> koi8-r */
static const _cyr_charset_table _cyr_mac = {
  /* x-mac-cyrillic -> koi8-r */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,
  242,243,244,245,230,232,227,254,251,253,255,249,248,252,224,241,
  160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,
  128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
  144,145,146,147,148,149,150,151,152,153,154,155,156,179,163,209,
  193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,
  210,211,212,213,198,200,195,222,219,221,223,217,216,220,192,255,
  /* koi8-r -> x-mac-cyrillic */
  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
  16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,
  32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,
  48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,
  64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,
  80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,
  96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,
  112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
  192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,
  208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,
  160,161,162,222,164,165,166,167,168,169,170,171,172,173,174,175,
  176,177,178,221,180,181,182,183,184,185,186,187,188,189,190,191,
  254,224,225,246,228,229,244,227,245,232,233,234,235,236,237,238,
  239,223,240,241,242,243,230,226,252,251,231,248,253,249,247,250,
  158,128,129,150,132,133,148,131,149,136,137,138,139,140,141,142,
  143,159,144,145,146,147,134,130,156,155,135,152,157,153,151,154,
};

/**
 * This is the function that performs the actual conversion of a string
 * between charsets. The input is left untouched; the converted bytes are
 * written to a newly reserved string of the same length, which is returned.
* Parameters: * str - string to be converted * from,to - one-symbol label of source and destination charset * The following symbols are used as labels: * k - koi8-r * w - windows-1251 * i - iso8859-5 * a - x-cp866 * d - x-cp866 * m - x-mac-cyrillic */ String string_convert_cyrillic_string(const String& input, char from, char to) { const unsigned char *from_table, *to_table; unsigned char tmp; auto uinput = (unsigned char*)input.slice().data(); String retString(input.size(), ReserveString); unsigned char *str = (unsigned char *)retString.mutableData(); from_table = nullptr; to_table = nullptr; switch (toupper((int)(unsigned char)from)) { case 'W': from_table = _cyr_win1251; break; case 'A': case 'D': from_table = _cyr_cp866; break; case 'I': from_table = _cyr_iso88595; break; case 'M': from_table = _cyr_mac; break; case 'K': break; default: raise_invalid_argument_warning("Unknown source charset: %c", from); break; } switch (toupper((int)(unsigned char)to)) { case 'W': to_table = _cyr_win1251; break; case 'A': case 'D': to_table = _cyr_cp866; break; case 'I': to_table = _cyr_iso88595; break; case 'M': to_table = _cyr_mac; break; case 'K': break; default: raise_invalid_argument_warning("Unknown destination charset: %c", to); break; } for (int i = 0; i < input.size(); i++) { tmp = from_table == nullptr ? uinput[i] : from_table[uinput[i]]; str[i] = to_table == nullptr ? tmp : to_table[tmp + 256]; } retString.setSize(input.size()); return retString; } /////////////////////////////////////////////////////////////////////////////// // Hebrew #define HEB_BLOCK_TYPE_ENG 1 #define HEB_BLOCK_TYPE_HEB 2 #define isheb(c) \ (((((unsigned char) c) >= 224) && (((unsigned char) c) <= 250)) ? 1 : 0) #define _isblank(c) \ (((((unsigned char) c) == ' ' || ((unsigned char) c) == '\t')) ? 1 : 0) #define _isnewline(c) \ (((((unsigned char) c) == '\n' || ((unsigned char) c) == '\r')) ? 
1 : 0) /** * Converts Logical Hebrew text (Hebrew Windows style) to Visual text * Cheers/complaints/flames - Zeev Suraski <zeev@php.net> */ String string_convert_hebrew_string(const String& inStr, int /*max_chars_per_line*/, int convert_newlines) { assertx(!inStr.empty()); auto str = inStr.data(); auto str_len = inStr.size(); const char *tmp; char *heb_str, *broken_str; char *target; int block_start, block_end, block_type, block_length, i; long max_chars=0; int begin, end, char_count, orig_begin; tmp = str; block_start=block_end=0; heb_str = (char *) req::malloc_noptrs(str_len + 1); SCOPE_EXIT { req::free(heb_str); }; target = heb_str+str_len; *target = 0; target--; block_length=0; if (isheb(*tmp)) { block_type = HEB_BLOCK_TYPE_HEB; } else { block_type = HEB_BLOCK_TYPE_ENG; } do { if (block_type == HEB_BLOCK_TYPE_HEB) { while ((isheb((int)*(tmp+1)) || _isblank((int)*(tmp+1)) || ispunct((int)*(tmp+1)) || (int)*(tmp+1)=='\n' ) && block_end<str_len-1) { tmp++; block_end++; block_length++; } for (i = block_start; i<= block_end; i++) { *target = str[i]; switch (*target) { case '(': *target = ')'; break; case ')': *target = '('; break; case '[': *target = ']'; break; case ']': *target = '['; break; case '{': *target = '}'; break; case '}': *target = '{'; break; case '<': *target = '>'; break; case '>': *target = '<'; break; case '\\': *target = '/'; break; case '/': *target = '\\'; break; default: break; } target--; } block_type = HEB_BLOCK_TYPE_ENG; } else { while (!isheb(*(tmp+1)) && (int)*(tmp+1)!='\n' && block_end < str_len-1) { tmp++; block_end++; block_length++; } while ((_isblank((int)*tmp) || ispunct((int)*tmp)) && *tmp!='/' && *tmp!='-' && block_end > block_start) { tmp--; block_end--; } for (i = block_end; i >= block_start; i--) { *target = str[i]; target--; } block_type = HEB_BLOCK_TYPE_HEB; } block_start=block_end+1; } while (block_end < str_len-1); String brokenStr(str_len, ReserveString); broken_str = brokenStr.mutableData(); begin=end=str_len-1; target = 
broken_str; while (1) { char_count=0; while ((!max_chars || char_count < max_chars) && begin > 0) { char_count++; begin--; if (begin <= 0 || _isnewline(heb_str[begin])) { while (begin > 0 && _isnewline(heb_str[begin-1])) { begin--; char_count++; } break; } } if (char_count == max_chars) { /* try to avoid breaking words */ int new_char_count=char_count, new_begin=begin; while (new_char_count > 0) { if (_isblank(heb_str[new_begin]) || _isnewline(heb_str[new_begin])) { break; } new_begin++; new_char_count--; } if (new_char_count > 0) { char_count=new_char_count; begin=new_begin; } } orig_begin=begin; if (_isblank(heb_str[begin])) { heb_str[begin]='\n'; } while (begin <= end && _isnewline(heb_str[begin])) { /* skip leading newlines */ begin++; } for (i = begin; i <= end; i++) { /* copy content */ *target = heb_str[i]; target++; } for (i = orig_begin; i <= end && _isnewline(heb_str[i]); i++) { *target = heb_str[i]; target++; } begin=orig_begin; if (begin <= 0) { *target = 0; break; } begin--; end=begin; } if (convert_newlines) { int count; auto ret = string_replace(broken_str, str_len, "\n", strlen("\n"), "<br />\n", strlen("<br />\n"), count, true); if (!ret.isNull()) { return ret; } } brokenStr.setSize(str_len); return brokenStr; } /////////////////////////////////////////////////////////////////////////////// }