function phutil_is_utf8_slowly()

in src/utils/utf8.php [120:249]


/**
 * Determine whether a string is well-formed UTF-8 by walking it byte by byte
 * in PHP, without relying on a large PCRE pattern.
 *
 * The following inputs are rejected:
 *
 *   - NUL bytes (0x00) and stray continuation bytes (0x80-0xBF);
 *   - lead bytes which can never appear in UTF-8 (0xC0, 0xC1, 0xF5-0xFF);
 *   - truncated sequences;
 *   - overlong (non-minimal) encodings;
 *   - UTF-16 surrogates (U+D800-U+DFFF);
 *   - code points above U+10FFFF;
 *   - any four-byte sequence, when `$only_bmp` is set.
 *
 * The empty string is considered valid.
 *
 * @param string  String to examine.
 * @param bool    If true, accept only characters in the Basic Multilingual
 *                Plane (sequences of at most three bytes).
 * @return bool   True if the string is valid under the rules above.
 */
function phutil_is_utf8_slowly($string, $only_bmp = false) {
  // First, check the common case of normal ASCII strings. We're fine if
  // the string contains no bytes larger than 127.
  if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
    return true;
  }

  // NOTE: In the past, we used a large regular expression in the form of
  // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
  // like this at relatively small input sizes, at least on some systems
  // (observed on OSX and Windows). This is apparently because the internal
  // implementation is recursive and it blows the stack.

  // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
  // input limit is extremely low (less than 50KB on my system), do this check
  // very very slowly in PHP instead. See also T5316.

  $len = strlen($string);
  for ($ii = 0; $ii < $len; $ii++) {
    $chr = ord($string[$ii]);

    // Plain ASCII, excluding NUL.
    if ($chr >= 0x01 && $chr <= 0x7F) {
      continue;
    }

    // Work out how many continuation bytes must follow this lead byte, and
    // which values the FIRST continuation byte may take. Every later
    // continuation byte is always in the range 0x80-0xBF.
    $min = 0x80;
    $max = 0xBF;

    if ($chr >= 0xC2 && $chr <= 0xDF) {
      $tail = 1;
    } else if ($chr >= 0xE0 && $chr <= 0xEF) {
      $tail = 2;
      if ($chr === 0xE0) {
        // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0x9F
        // are not minimal representations, and MySQL rejects them.
        $min = 0xA0;
      } else if ($chr === 0xED) {
        // The values 0xA0-0xBF here would encode the UTF-16 surrogates
        // U+D800-U+DFFF, which are not valid in UTF-8.
        $max = 0x9F;
      }
    } else if (!$only_bmp && $chr >= 0xF0 && $chr <= 0xF4) {
      $tail = 3;
      if ($chr === 0xF0) {
        // NOTE: As above, this range starts at 0x90, not 0x80. The values
        // 0x80-0x8F are not minimal representations.
        $min = 0x90;
      } else if ($chr === 0xF4) {
        // The values 0x90-0xBF here would encode code points above
        // U+10FFFF, the largest valid Unicode code point.
        $max = 0x8F;
      }
    } else {
      // NUL, a stray continuation byte, an always-invalid lead byte, or a
      // four-byte lead byte when only BMP characters are allowed.
      return false;
    }

    // The sequence is truncated if the string ends before its last byte.
    if ($ii + $tail >= $len) {
      return false;
    }

    $chr = ord($string[++$ii]);
    if ($chr < $min || $chr > $max) {
      return false;
    }

    for ($jj = 1; $jj < $tail; $jj++) {
      $chr = ord($string[++$ii]);
      if ($chr < 0x80 || $chr > 0xBF) {
        return false;
      }
    }
  }

  return true;
}