int string_sscanf()

in hphp/runtime/base/zend-scanf.cpp [567:1071]


/*
 * Scan `string` according to the sscanf-style `format` and collect the
 * converted fields into an array.
 *
 * Parameters:
 *   string       - NUL-terminated input text to scan.
 *   format       - NUL-terminated format string. It is checked up front with
 *                  ValidateFormat(); conversions handled here are
 *                  %n, %d/%D, %i, %o, %x/%X, %u, %f/%e/%E/%g, %s, %c and %[...].
 *   numVars      - forwarded unchanged to ValidateFormat() and
 *                  scan_set_error_return().
 *   return_value - on SCAN_SUCCESS receives the array of converted values,
 *                  appended in the order they were scanned. On the error
 *                  paths it is only touched by scan_set_error_return().
 *
 * Returns:
 *   SCAN_ERROR_INVALID_FORMAT - ValidateFormat() did not return SCAN_SUCCESS.
 *   SCAN_ERROR_EOF            - the input ran out before any conversion
 *                               was performed.
 *   SCAN_SUCCESS              - otherwise (including partial matches).
 */
int string_sscanf(const char *string, const char *format, int numVars,
                  Variant &return_value) {
  int  nconversions;
  int  totalVars = -1;
  int64_t value;
  char *end;
  const char *baseString;
  char op   = 0;
  int  base = 0;
  int  underflow = 0;
  size_t width;
  const char *ch;
  char sch;
  int  flags;
  char buf[64];  /* Temporary buffer to hold scanned number
                  * strings before they are passed to strtol(),
                  * strtoul() or strtod() */

  Array returnArray;

  /*
   * Check for errors in the format string.
   */
  if (ValidateFormat(format, numVars, &totalVars) != SCAN_SUCCESS) {
    scan_set_error_return(numVars, return_value);
    return SCAN_ERROR_INVALID_FORMAT;
  }

  /* Remember the start of the input so %n can report an offset. */
  baseString = string;

  /*
   * Iterate over the format string filling in the result objects until
   * we reach the end of input, the end of the format string, or there
   * is a mismatch.
   */
  nconversions = 0;

  while (*format != '\0') {
    ch    = format++;
    flags = 0;

    /*
     * If we see whitespace in the format, skip whitespace in the string.
     * Characters are routed through UCHAR() before reaching isspace():
     * passing a negative char value to a <cctype> classifier is undefined.
     * The NUL terminator is never whitespace, so the loop stops on it.
     */
    if ( isspace( UCHAR(*ch) ) ) {
      while ( isspace( UCHAR(*string) ) ) {
        string++;
      }
      continue;
    }

    if (*ch != '%') {
    literal:
      /*
       * Literal format character (also reached for "%%"): it must match
       * the next input character exactly.
       */
      if (*string == '\0') {
        underflow = 1;
        goto done;
      }
      sch = *string;
      string++;
      if (*ch != sch) {
        goto done;
      }
      continue;
    }

    ch = format++;
    if (*ch == '%') {
      goto literal;
    }

    /*
     * Check for assignment suppression ('*') or an XPG3-style
     * assignment ('%n$').
     */
    if (*ch == '*') {
      flags |= SCAN_SUPPRESS;
      ch = format++;
    } else if ( isdigit(UCHAR(*ch))) {
      /*
       * The number is only consumed as a position when it is followed by
       * '$'; otherwise format is left alone and the digits are re-read
       * as a width below.  The parsed index is not used further in this
       * function: results are appended in scan order.
       */
      value = strtoul(format-1, &end, 10);
      if (*end == '$') {
        format = end+1;
        ch = format++;
      }
    }

    /*
     * Parse any width specifier.
     */
    if ( isdigit(UCHAR(*ch))) {
      char *endptr;
      width = strtoul(format-1, &endptr, 10);
      format = endptr;
      ch = format++;
    } else {
      width = 0;
    }

    /*
     * Ignore size specifier.
     */
    if ((*ch == 'l') || (*ch == 'L') || (*ch == 'h')) {
      ch = format++;
    }

    /*
     * Handle the various field types.  For integer conversions only the
     * base and signedness are recorded here; the actual strtol()/strtoul()
     * call is chosen from SCAN_UNSIGNED once the digits are collected.
     */
    switch (*ch) {
    case 'n':
      /* Store the number of input characters consumed so far. */
      if (!(flags & SCAN_SUPPRESS)) {
        auto const key = safe_cast<int64_t>(returnArray.size());
        returnArray.set(key, (int)(string - baseString));
      }
      nconversions++;
      continue;

    case 'd':
    case 'D':
      op = 'i';
      base = 10;
      break;
    case 'i':
      op = 'i';
      base = 0;   /* base is detected from the digits themselves */
      break;
    case 'o':
      op = 'i';
      base = 8;
      break;
    case 'x':
    case 'X':
      op = 'i';
      base = 16;
      break;
    case 'u':
      op = 'i';
      base = 10;
      flags |= SCAN_UNSIGNED;
      break;

    case 'f':
    case 'e':
    case 'E':
    case 'g':
      op = 'f';
      break;

    case 's':
      op = 's';
      break;

    case 'c':
      /* %c shares the string scanner but does not skip leading blanks. */
      op = 's';
      flags |= SCAN_NOSKIP;
      /*-cc-*/
      if (0 == width) {
        width = 1;
      }
      /*-cc-*/
      break;
    case '[':
      op = '[';
      flags |= SCAN_NOSKIP;
      break;
    } /* switch */

    /*
     * At this point, we will need additional characters from the
     * string to proceed.
     */
    if (*string == '\0') {
      underflow = 1;
      goto done;
    }

    /*
     * Skip any leading whitespace at the beginning of a field unless
     * the format suppresses this behavior.
     */
    if (!(flags & SCAN_NOSKIP)) {
      while ( isspace( UCHAR(*string) ) ) {
        string++;
      }
      if (*string == '\0') {
        underflow = 1;
        goto done;
      }
    }

    /*
     * Perform the requested scanning operation.
     */
    switch (op) {
    case 'c':   /* not reached: %c is mapped to op 's' above */
    case 's':
      /*
       * Scan a string up to width characters or whitespace.
       */
      if (width == 0) {
        width = (size_t) ~0;
      }
      end = (char*)string;
      while (*end != '\0') {
        sch = *end;
        if ( isspace( UCHAR(sch) ) ) {
          break;
        }
        end++;
        if (--width == 0) {
          break;
        }
      }
      if (!(flags & SCAN_SUPPRESS)) {
        auto const key = safe_cast<int64_t>(returnArray.size());
        returnArray.set(key, String(string, end-string, CopyString));
      }
      string = end;
      break;

    case '[': {
      CharSet cset;

      if (width == 0) {
        width = (size_t) ~0;
      }
      end = (char*)string;

      /* BuildCharSet() also advances format past the set description. */
      format = BuildCharSet(&cset, format);
      while (*end != '\0') {
        sch = *end;
        if (!CharInSet(&cset, (int)sch)) {
          break;
        }
        end++;
        if (--width == 0) {
          break;
        }
      }
      ReleaseCharSet(&cset);

      if (string == end) {
        /*
         * Nothing matched the range, stop processing
         */
        goto done;
      }
      if (!(flags & SCAN_SUPPRESS)) {
        auto const key = safe_cast<int64_t>(returnArray.size());
        returnArray.set(key, String(string, end-string, CopyString));
      }
      string = end;
      break;
    }
    case 'i':
      /*
       * Scan an unsigned or signed integer.
       */
      /*-cc-*/
      buf[0] = '\0';
      /*-cc-*/
      /* Clamp the width so the digits plus the terminator fit in buf. */
      if ((width == 0) || (width > sizeof(buf) - 1)) {
        width = sizeof(buf) - 1;
      }

      flags |= SCAN_SIGNOK | SCAN_NODIGITS | SCAN_NOZERO;
      for (end = buf; width > 0; width--) {
        switch (*string) {
          /*
           * The 0 digit has special meaning at the beginning of
           * a number.  If we are unsure of the base, it
           * indicates that we are in base 8 or base 16 (if it is
           * followed by an 'x').
           */
        case '0':
          /*-cc-*/
          if (base == 16) {
            flags |= SCAN_XOK;
          }
          /*-cc-*/
          if (base == 0) {
            base = 8;
            flags |= SCAN_XOK;
          }
          if (flags & SCAN_NOZERO) {
            flags &= ~(SCAN_SIGNOK | SCAN_NODIGITS | SCAN_NOZERO);
          } else {
            flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS);
          }
          goto addToInt;

        case '1': case '2': case '3': case '4':
        case '5': case '6': case '7':
          if (base == 0) {
            base = 10;
          }
          flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS);
          goto addToInt;

        case '8': case '9':
          if (base == 0) {
            base = 10;
          }
          if (base <= 8) {
            break;
          }
          flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS);
          goto addToInt;

        case 'A': case 'B': case 'C':
        case 'D': case 'E': case 'F':
        case 'a': case 'b': case 'c':
        case 'd': case 'e': case 'f':
          if (base <= 10) {
            break;
          }
          flags &= ~(SCAN_SIGNOK | SCAN_XOK | SCAN_NODIGITS);
          goto addToInt;

        case '+': case '-':
          if (flags & SCAN_SIGNOK) {
            flags &= ~SCAN_SIGNOK;
            goto addToInt;
          }
          break;

        case 'x': case 'X':
          /* Only accepted directly after a single leading '0'. */
          if ((flags & SCAN_XOK) && (end == buf+1)) {
            base = 16;
            flags &= ~SCAN_XOK;
            goto addToInt;
          }
          break;
        }

        /*
         * We got an illegal character so we are done accumulating.
         */
        break;

      addToInt:
        /*
         * Add the character to the temporary buffer.
         */
        *end++ = *string++;
        if (*string == '\0') {
          break;
        }
      }

      /*
       * Check to see if we need to back up because we only got a
       * sign or a trailing x after a 0.
       */
      if (flags & SCAN_NODIGITS) {
        if (*string == '\0') {
          underflow = 1;
        }
        goto done;
      } else if (end[-1] == 'x' || end[-1] == 'X') {
        end--;
        string--;
      }

      /*
       * Scan the value from the temporary buffer.  If we are
       * returning a large unsigned value, we have to convert it back
       * to a string since PHP only supports signed values.
       */
      if (!(flags & SCAN_SUPPRESS)) {
        *end = '\0';
        if (flags & SCAN_UNSIGNED) {
          /*
           * Reinterpret the unsigned result as a signed long so that
           * values above LONG_MAX come out negative and take the
           * string path below.
           */
          value = (int64_t) (long) strtoul(buf, nullptr, base);
        } else {
          value = (int64_t) strtol(buf, nullptr, base);
        }
        auto const key = safe_cast<int64_t>(returnArray.size());
        if ((flags & SCAN_UNSIGNED) && (value < 0)) {
          /* Argument type must match the unsigned "%lu" conversion. */
          snprintf(buf, sizeof(buf), "%lu",
                   (unsigned long)value); /* INTL: ISO digit */
          returnArray.set(key, String(buf, CopyString));
        } else {
          returnArray.set(key, value);
        }
      }
      break;

    case 'f':
      /*
       * Scan a floating point number
       */
      buf[0] = '\0';     /* call me pedantic */
      /* Clamp the width so the digits plus the terminator fit in buf. */
      if ((width == 0) || (width > sizeof(buf) - 1)) {
        width = sizeof(buf) - 1;
      }
      flags |= SCAN_SIGNOK | SCAN_NODIGITS | SCAN_PTOK | SCAN_EXPOK;
      for (end = buf; width > 0; width--) {
        switch (*string) {
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case '8': case '9':
          flags &= ~(SCAN_SIGNOK | SCAN_NODIGITS);
          goto addToFloat;
        case '+':
        case '-':
          if (flags & SCAN_SIGNOK) {
            flags &= ~SCAN_SIGNOK;
            goto addToFloat;
          }
          break;
        case '.':
          if (flags & SCAN_PTOK) {
            flags &= ~(SCAN_SIGNOK | SCAN_PTOK);
            goto addToFloat;
          }
          break;
        case 'e':
        case 'E':
          /*
           * An exponent is not allowed until there has
           * been at least one digit.
           */
          if ((flags & (SCAN_NODIGITS | SCAN_EXPOK)) == SCAN_EXPOK) {
            flags = (flags & ~(SCAN_EXPOK|SCAN_PTOK))
              | SCAN_SIGNOK | SCAN_NODIGITS;
            goto addToFloat;
          }
          break;
        }

        /*
         * We got an illegal character so we are done accumulating.
         */
        break;

      addToFloat:
        /*
         * Add the character to the temporary buffer.
         */
        *end++ = *string++;
        if (*string == '\0') {
          break;
        }
      }

      /*
       * Check to see if we need to back up because we saw a
       * trailing 'e' or sign.
       */
      if (flags & SCAN_NODIGITS) {
        if (flags & SCAN_EXPOK) {
          /*
           * There were no digits at all so scanning has
           * failed and we are done.
           */
          if (*string == '\0') {
            underflow = 1;
          }
          goto done;
        }

        /*
         * We got a bad exponent ('e' and maybe a sign).
         */
        end--;
        string--;
        if (*end != 'e' && *end != 'E') {
          end--;
          string--;
        }
      }

      /*
       * Scan the value from the temporary buffer.
       */
      if (!(flags & SCAN_SUPPRESS)) {
        double dvalue;
        *end = '\0';
        dvalue = strtod(buf, nullptr);
        auto const key = safe_cast<int64_t>(returnArray.size());
        returnArray.set(key, dvalue);
      }
      break;
    } /* switch (op) */
    nconversions++;
  } /*  while (*format != '\0') */

done:
  /*
   * Running out of input only counts as an error when nothing at all was
   * converted; partial results are returned as success.
   */
  if (underflow && (0==nconversions)) {
    scan_set_error_return(numVars, return_value);
    return SCAN_ERROR_EOF;
  } else if (nconversions < totalVars) {
    /* TODO: not all elements converted. we need to prune the list - cc */
  }
  return_value = returnArray;
  return SCAN_SUCCESS;
}