bool url_parse()

in hphp/runtime/base/zend-url.cpp [47:264]


/*
 * Split the `length` bytes starting at `str` into URL components and store
 * the pieces that are present in `output`.
 *
 * Text components (scheme, user, pass, host, path, query, fragment) are
 * handed to replace_controlchars() together with the destination field; that
 * helper is defined elsewhere (NOTE(review): by its name it presumably
 * sanitizes control characters while copying -- confirm). The port is parsed
 * with atoi() and stored as a number in output.port. Components that do not
 * occur in the input are not written by this function.
 *
 * Returns false when the input is rejected:
 *   - a port is present but is not in the range 1..65535,
 *   - the text in the port position is longer than 5 characters,
 *   - the input starts with ':' and nothing follows it,
 *   - an authority section is present but the host is empty.
 * Returns true otherwise.
 */
bool url_parse(Url &output, const char *str, size_t length) {
  char port_buf[6];  // up to 5 port digits plus the terminating NUL
  // s: full string
  // ue: end of full string
  // p: start of string slice we're looking at
  // e: index of something we searched for, e.g. ':'. usually end of string
  //    slice, but not always
  // pp: start of string sub-slice
  const char *s, *e, *p, *pp, *ue;

  s = str;
  ue = s + length;

  /* parse scheme */
  if ((e = (const char *)memchr((const void *)s, ':', length)) && e != s) {
    /* validate scheme */
    p = s;
    while (p < e) {
      /* scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] */
      /* <ctype.h> functions require a value representable as unsigned char;
         passing a negative plain char (bytes >= 0x80) is undefined. */
      if (!isalpha(static_cast<unsigned char>(*p)) &&
          !isdigit(static_cast<unsigned char>(*p)) &&
          *p != '+' && *p != '.' && *p != '-') {
        /* Binary-safe strcspn(s, "?#"): locate the first '?' or '#' within
           [s, ue). Plain strcspn() ignores `length`, so it would stop at an
           embedded NUL and could read past the end of the buffer. */
        const char *stop = ue;
        const char *hit;
        if ((hit = (const char *)memchr(s, '?', stop - s))) {
          stop = hit;
        }
        if ((hit = (const char *)memchr(s, '#', stop - s))) {
          stop = hit;
        }

        /* A ':' that precedes any query/fragment may introduce a port. */
        if (e + 1 < ue && e < stop) {
          goto parse_port;
        } else if (s + 1 < ue && *s == '/' && *(s + 1) == '/') {
          /* relative-scheme URL */
          s += 2;
          e = nullptr;
          goto parse_host;
        } else {
          goto just_path;
        }
      }
      p++;
    }

    if ((e + 1) == ue) { /* only scheme is available */
      replace_controlchars(output.scheme, s, (e - s));
      return true;
    }

    /*
     * certain schemas like mailto: and zlib: may not have any / after them
     * this check ensures we support those.
     */
    if (*(e+1) != '/') {
      /* check if the data we get is a port this allows us to
       * correctly parse things like a.com:80
       */
      p = e + 1;
      while (p < ue && isdigit(static_cast<unsigned char>(*p))) {
        p++;
      }

      if ((p == ue || *p == '/') && (p - e) < 7) {
        goto parse_port;
      }

      replace_controlchars(output.scheme, s, (e - s));

      s = e + 1;
      goto just_path;
    } else {
      replace_controlchars(output.scheme, s, (e - s));

      if (e + 2 < ue && *(e+2) == '/') {
        /* "scheme://": the authority starts right after the two slashes */
        s = e + 3;
        if (output.scheme.get()->isame(s_file.get())) {
          if (e + 3 < ue && *(e + 3) == '/') {
            /* support windows drive letters as in:
               file:///c:/somedir/file.txt
            */
            if (e + 5 < ue && e[4] != '\0' && e[5] == ':') {
              s = e + 4;
            }
            goto just_path;
          }
        }
      } else {
        /* "scheme:/path": no authority, everything after ':' is the path */
        s = e + 1;
        goto just_path;
      }
    }
  } else if (e) { /* no scheme; starts with colon: look for port */
    parse_port:
    p = e + 1;
    pp = p;

    /* Scan at most 6 digits so that an over-long port is detectable below. */
    while (pp < ue && pp - p < 6 &&
           isdigit(static_cast<unsigned char>(*pp))) {
      pp++;
    }

    if (pp - p > 0 && pp - p < 6 && (pp == ue || *pp == '/')) {
      memcpy(port_buf, p, (pp-p));
      port_buf[pp-p] = '\0';
      auto port = atoi(port_buf);
      if (port > 0 && port <= 65535) {
        output.port = port;
        if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
          /* relative-scheme URL */
          s += 2;
        }
      } else {
        return false;
      }
    } else if (p == pp && pp == ue) {
      /* the ':' is the last byte of the input */
      return false;
    } else if (s + 1 < ue && *s == '/' && *(s+1) == '/') {
      /* relative-scheme URL */
      s += 2;
    } else {
      goto just_path;
    }
  } else if (s + 1 < ue && *s == '/' && *(s +1 ) == '/') {
    /* relative-scheme URL */
    s += 2;
  } else {
    goto just_path;
  }

  parse_host:
  /* Binary-safe strcspn(s, "/?#") */
  e = ue;
  if ((p = (const char*)memchr(s, '/', e - s))) {
    e = p;
  }
  if ((p = (const char*)memchr(s, '?', e - s))) {
    e = p;
  }
  if ((p = (const char*)memchr(s, '#', e - s))) {
    e = p;
  }

  /* check for login and password */
  /* The last '@' before e is used, so earlier '@' bytes stay in user/pass. */
  if ((p = (const char*)folly::memrchr(s, '@', (e-s)))) {
    if ((pp = (const char*)memchr(s, ':', (p-s)))) {
      replace_controlchars(output.user, s, (pp - s));

      pp++;
      replace_controlchars(output.pass, pp, (p-pp));
    } else {
      replace_controlchars(output.user, s, (p-s));
    }

    s = p + 1;
  }

  /* check for port */
  if (s < ue && *s == '[' && *(e-1) == ']') {
    /* Short circuit portscan,
       we're dealing with an
       IPv6 embedded address */
    p = nullptr;
  } else {
    p = (const char*)folly::memrchr(s, ':', e - s);
  }

  if (p) {
    /* Skip re-parsing when the parse_port section already stored a port;
       p then still points at the ':' that ends the host. */
    if (!output.port) {
      p++;
      if (e-p > 5) { /* port cannot be longer than 5 characters */
        return false;
      } else if (e - p > 0) {
        memcpy(port_buf, p, (e-p));
        port_buf[e-p] = '\0';
        auto port = atoi(port_buf);
        if (port > 0 && port <= 65535) {
          output.port = port;
        } else {
          return false;
        }
      }
      /* step back onto the ':' so that [s, p) is exactly the host */
      p--;
    }
  } else {
    p = e;
  }

  /* check if we have a valid host, if we don't reject the string as url */
  if ((p-s) < 1) {
    return false;
  }

  replace_controlchars(output.host, s, (p - s));

  if (e == ue) {
    return true;
  }

  s = e;

  just_path:

  /* From here on [s, ue) is path[?query][#fragment]. The fragment is split
     off first so that a '?' inside the fragment is not taken as a query. */
  e = ue;
  p = (const char*)memchr(s, '#', (e - s));

  if (p) {
    p++;
    if (p < e) {
      replace_controlchars(output.fragment, p, e - p);
    }
    e = p - 1;
  }

  p = (const char*)memchr(s, '?', (e - s));
  if (p) {
    p++;
    if (p < e) {
      replace_controlchars(output.query, p, e - p);
    }
    e = p - 1;
  }

  /* A path is stored when it is non-empty, or when nothing at all was left
     to parse (s == ue), in which case it is stored with length zero. */
  if (s < e || s == ue) {
    replace_controlchars(output.path, s, e - s);
  }

  return true;
}