void escapeStringImpl()

in folly/json.cpp [798:930]


// Appends a double-quoted, JSON-escaped copy of `input` to `out`.
//
// `out` is only appended to: one '"' first, the escaped payload, then a
// closing '"'. Bytes of `input` are handled as follows:
//   - runs that firstEscapableInWord() reports as not needing escaping are
//     copied verbatim (scanned up to 8 bytes at a time);
//   - '\\' and '"' are emitted with a preceding backslash;
//   - '\b', '\f', '\n', '\r', '\t' use their two-character escapes, and any
//     other byte <= 0x1f is emitted as \u00XX;
//   - if opts.encode_non_ascii is set, a byte with the high bit set starts a
//     code point that is decoded with utf8ToCodePoint() and emitted as
//     \uXXXX, or as a UTF-16 surrogate pair when it is >= 0x10000;
//   - if EnableExtraAsciiEscapes is true, a byte in [0x20, 0x80) whose bit is
//     set in opts.extra_ascii_to_escape_bitmap is also emitted as \uXXXX;
//   - any other byte is copied unchanged.
// Hex digits are written in lowercase.
//
// If opts.validate_utf8 or opts.skip_invalid_utf8 is set while
// opts.encode_non_ascii is not, each code point reached by the slow path is
// additionally passed through utf8ToCodePoint(). With skip_invalid_utf8, a
// decoded value of U+FFFD makes the function append the UTF-8 encoding of
// U+FFFD instead of the bytes the decoder consumed.
//
// NOTE(review): EnableExtraAsciiEscapes is not declared in the visible lines;
// presumably it is a bool template parameter declared directly above this
// definition -- confirm.
// NOTE(review): how utf8ToCodePoint() reports malformed input when
// skip_invalid_utf8 is false (presumably by throwing) is not visible here --
// confirm against its definition.
void escapeStringImpl(
    StringPiece input, std::string& out, const serialization_opts& opts) {
  // Maps a nibble value (0..15) to its lowercase hexadecimal digit.
  auto hexDigit = [](uint8_t c) -> char {
    return c < 10 ? c + '0' : c - 10 + 'a';
  };

  out.push_back('\"');

  // p: next input byte that has not been written to `out` yet.
  // q: utf8-validation cursor; only moved when validation is enabled.
  // e: one past the last input byte.
  // Bytes are read as unsigned so comparisons such as `*p <= 0x1f` and
  // `*p & 0x80` behave the same regardless of the signedness of char.
  auto* p = reinterpret_cast<const unsigned char*>(input.begin());
  auto* q = reinterpret_cast<const unsigned char*>(input.begin());
  auto* e = reinterpret_cast<const unsigned char*>(input.end());

  while (p < e) {
    // Find the longest prefix that does not need escaping, and copy
    // it literally into the output string.
    auto firstEsc = p;
    while (firstEsc < e) {
      auto avail = to_unsigned(e - firstEsc);
      uint64_t word = 0;
      if (avail >= 8) {
        word = folly::loadUnaligned<uint64_t>(firstEsc);
      } else {
        // Fewer than 8 bytes remain: load just the `avail` remaining bytes
        // (presumably so that nothing at or beyond `e` is read).
        word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail);
      }
      // Number of leading bytes of `word` that can be copied as-is.
      auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts);
      DCHECK_LE(prefix, avail);
      firstEsc += prefix;
      // Fewer than 8 clean bytes: either a byte needing attention was found
      // in this word or the input is exhausted, so stop scanning.
      if (prefix < 8) {
        break;
      }
    }
    if (firstEsc > p) {
      out.append(reinterpret_cast<const char*>(p), firstEsc - p);
      p = firstEsc;
      // We can't be in the middle of a multibyte sequence, so we can reset q.
      q = p;
      // The literal run reached the end of the input; nothing left to escape.
      if (p == e) {
        break;
      }
    }

    // Handle the next byte that may need escaping.

    // Since non-ascii encoding inherently does utf8 validation
    // we explicitly validate utf8 only if non-ascii encoding is disabled.
    if ((opts.validate_utf8 || opts.skip_invalid_utf8) &&
        !opts.encode_non_ascii) {
      // To achieve better spatial and temporal coherence
      // we do utf8 validation progressively along with the
      // string-escaping instead of two separate passes.

      // As the encoding progresses, q will stay at or ahead of p.
      CHECK_GE(q, p);

      // As p catches up with q, move q forward.
      if (q == p) {
        // calling utf8ToCodePoint has the side effect of
        // checking that utf8 encodings are valid; it is also what advances q
        // past the decoded sequence (q is presumably taken by reference).
        char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
        if (opts.skip_invalid_utf8 && v == U'\ufffd') {
          // Emit the UTF-8 encoding of U+FFFD and drop the bytes the decoder
          // consumed by jumping p to q.
          out.append(reinterpret_cast<const char*>(u8"\ufffd"));
          p = q;
          continue;
        }
      }
    }

    // Decide whether the current byte must be written as a \uXXXX escape:
    // always for high-bit bytes when encode_non_ascii is on, and additionally
    // for printable ASCII selected in the extra-escape bitmap.
    auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80);
    if /* constexpr */ (EnableExtraAsciiEscapes) {
      // The bitmap is indexed by byte value: 64-bit word `*p / 64`, bit
      // `*p % 64` within that word.
      encodeUnicode = encodeUnicode ||
          (*p >= 0x20 && *p < 0x80 &&
           (opts.extra_ascii_to_escape_bitmap[*p / 64] &
            (uint64_t(1) << (*p % 64))));
    }

    if (encodeUnicode) {
      // note that this if condition captures utf8 chars
      // with value > 127, so size > 1 byte (or they are whitelisted for
      // Unicode encoding).
      // NOTE: char32_t / char16_t are both unsigned.
      // p is not incremented explicitly in this branch; the decoder call is
      // relied upon to advance it past the consumed sequence.
      char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
      // Appends one UTF-16 code unit as the six characters \uXXXX.
      auto writeHex = [&](char16_t v) {
        // "\u" followed by four placeholder bytes that are overwritten below;
        // only the first 6 bytes are appended, so the terminator is not.
        char buf[] = "\\u\0\0\0\0";
        buf[2] = hexDigit((v >> 12) & 0x0f);
        buf[3] = hexDigit((v >> 8) & 0x0f);
        buf[4] = hexDigit((v >> 4) & 0x0f);
        buf[5] = hexDigit(v & 0x0f);
        out.append(buf, 6);
      };
      // From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
      if (cp < 0x10000u) {
        // If the code point is in the Basic Multilingual Plane (U+0000 through
        // U+FFFF), then it may be represented as a six-character sequence:
        // a reverse solidus, followed by the lowercase letter u, followed by
        // four hexadecimal digits that encode the code point.
        writeHex(static_cast<char16_t>(cp));
      } else {
        // To escape a code point that is not in the Basic Multilingual Plane,
        // the character may be represented as a twelve-character sequence,
        // encoding the UTF-16 surrogate pair corresponding to the code point.
        // High surrogate: 0xd800 + top 10 bits of (cp - 0x10000);
        // low surrogate: 0xdc00 + bottom 10 bits of (cp - 0x10000).
        writeHex(static_cast<char16_t>(
            0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
        writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
      }
    } else if (*p == '\\' || *p == '\"') {
      // Backslash and double quote: emit a backslash followed by the byte.
      char buf[] = "\\\0";
      buf[1] = char(*p++);
      out.append(buf, 2);
    } else if (*p <= 0x1f) {
      // Control characters: use the short escape when one exists, otherwise
      // fall back to the generic \u00XX form.
      switch (*p) {
        // clang-format off
        case '\b': out.append("\\b"); p++; break;
        case '\f': out.append("\\f"); p++; break;
        case '\n': out.append("\\n"); p++; break;
        case '\r': out.append("\\r"); p++; break;
        case '\t': out.append("\\t"); p++; break;
        // clang-format on
        default:
          // Note that this if condition captures non readable chars
          // with value < 32, so size = 1 byte (e.g control chars).
          // "\u00" followed by two placeholder bytes for the hex digits of
          // the high and low nibble of the byte.
          char buf[] = "\\u00\0\0";
          buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
          buf[5] = hexDigit(uint8_t(*p & 0xf));
          out.append(buf, 6);
          p++;
      }
    } else {
      // Nothing special about this byte: copy it through unchanged.
      out.push_back(char(*p++));
    }
  }

  out.push_back('\"');
}