in src/utils/utf8.php [120:249]
function phutil_is_utf8_slowly($string, $only_bmp = false) {
  // Validate that $string is well-formed UTF-8 by walking it byte-by-byte in
  // pure PHP.
  //
  // Parameters:
  //   $string    Bytes to test.
  //   $only_bmp  When true, also reject every 4-byte sequence (lead bytes
  //              0xF0-0xF4), so only 1- to 3-byte characters are accepted.
  //
  // Returns true if every byte belongs to a well-formed sequence, false
  // otherwise. The empty string is accepted. NUL bytes (0x00) are rejected.

  // First, check the common case of normal ASCII strings. We're fine if
  // the string contains no bytes larger than 127.
  if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
    return true;
  }

  // NOTE: In the past, we used a large regular expression in the form of
  // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
  // like this at relatively small input sizes, at least on some systems
  // (observed on OSX and Windows). This is apparently because the internal
  // implementation is recursive and it blows the stack.
  // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
  // input limit is extremely low (less than 50KB on my system), do this check
  // very very slowly in PHP instead. See also T5316.

  $len = strlen($string);
  for ($ii = 0; $ii < $len; $ii++) {
    $chr = ord($string[$ii]);

    // Single-byte (ASCII) character. Note that 0x00 is deliberately excluded.
    if ($chr >= 0x01 && $chr <= 0x7F) {
      continue;
    }

    // Classify the lead byte: how many continuation bytes must follow, and
    // which range the FIRST continuation byte is allowed to fall in. Every
    // later continuation byte must be in the generic range 0x80-0xBF.
    $min = 0x80;
    $max = 0xBF;
    if ($chr >= 0xC2 && $chr <= 0xDF) {
      // Two-byte sequence. 0xC0 and 0xC1 could only begin non-minimal
      // encodings of ASCII, so they are not valid lead bytes.
      $trail = 1;
    } else if ($chr >= 0xE0 && $chr <= 0xEF) {
      // Three-byte sequence.
      $trail = 2;
      if ($chr == 0xE0) {
        // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0x9F
        // are "valid", but not minimal representations, and MySQL rejects
        // them. We're special casing this part of the range.
        $min = 0xA0;
      } else if ($chr == 0xED) {
        // 0xED followed by 0xA0-0xBF would encode U+D800-U+DFFF. These are
        // UTF-16 surrogates, which are not legal in UTF-8.
        $max = 0x9F;
      }
    } else if (!$only_bmp && $chr >= 0xF0 && $chr <= 0xF4) {
      // Four-byte sequence (outside the BMP).
      $trail = 3;
      if ($chr == 0xF0) {
        // NOTE: As above, this range starts at 0x90, not 0x80. The values
        // 0x80-0x8F are not minimal representations.
        $min = 0x90;
      } else if ($chr == 0xF4) {
        // 0xF4 followed by 0x90-0xBF would encode a value above U+10FFFF,
        // which is beyond the end of the Unicode range.
        $max = 0x8F;
      }
    } else {
      // 0x00, a stray continuation byte (0x80-0xBF), an always-invalid lead
      // byte (0xC0, 0xC1, 0xF5-0xFF), or a four-byte lead byte while
      // $only_bmp is set.
      return false;
    }

    // The sequence is truncated if the string ends before the last
    // continuation byte.
    if ($ii + $trail >= $len) {
      return false;
    }

    for ($jj = 0; $jj < $trail; $jj++) {
      $chr = ord($string[++$ii]);
      if ($chr < $min || $chr > $max) {
        return false;
      }

      // Only the first continuation byte has a restricted range.
      $min = 0x80;
      $max = 0xBF;
    }
  }

  return true;
}