in folly/json.cpp [798:930]
// Appends the JSON string-literal form of `input` to `out`: an opening double
// quote, the escaped contents, and a closing double quote.
//
// input: bytes to escape; walked front to back as unsigned bytes.
// out:   destination string; this function only appends to it.
// opts:  the fields read directly here are validate_utf8, skip_invalid_utf8,
//        encode_non_ascii and extra_ascii_to_escape_bitmap; the whole object
//        is also forwarded to firstEscapableInWord.
//
// NOTE(review): EnableExtraAsciiEscapes is used below but is not declared in
// the visible span; presumably a bool template parameter declared just above
// this signature -- confirm.
void escapeStringImpl(
StringPiece input, std::string& out, const serialization_opts& opts) {
// Maps a nibble value (0..15) to its lowercase hexadecimal character.
auto hexDigit = [](uint8_t c) -> char {
return c < 10 ? c + '0' : c - 10 + 'a';
};
out.push_back('\"');
// p: next input byte that still has to be emitted.
// q: UTF-8 validation cursor (see the validation step below).
// e: one past the last input byte.
auto* p = reinterpret_cast<const unsigned char*>(input.begin());
auto* q = reinterpret_cast<const unsigned char*>(input.begin());
auto* e = reinterpret_cast<const unsigned char*>(input.end());
while (p < e) {
// Find the longest prefix that does not need escaping, and copy
// it literally into the output string.
auto firstEsc = p;
// Scan up to 8 bytes per step. The value returned by firstEscapableInWord
// is used as the number of leading bytes of `word` that can be copied
// verbatim; a value below 8 ends the scan (either a byte needing attention
// was found or fewer than 8 bytes were available).
while (firstEsc < e) {
auto avail = to_unsigned(e - firstEsc);
uint64_t word = 0;
if (avail >= 8) {
word = folly::loadUnaligned<uint64_t>(firstEsc);
} else {
// Tail shorter than 8 bytes: only `avail` bytes are requested.
word = folly::partialLoadUnaligned<uint64_t>(firstEsc, avail);
}
auto prefix = firstEscapableInWord<EnableExtraAsciiEscapes>(word, opts);
// The helper must never report more clean bytes than were actually loaded.
DCHECK_LE(prefix, avail);
firstEsc += prefix;
if (prefix < 8) {
break;
}
}
if (firstEsc > p) {
// Bulk-copy the run [p, firstEsc) unchanged.
out.append(reinterpret_cast<const char*>(p), firstEsc - p);
p = firstEsc;
// We can't be in the middle of a multibyte sequence, so we can reset q.
q = p;
if (p == e) {
// The literal run reached the end of the input; nothing left to escape.
break;
}
}
// Handle the next byte that may need escaping.
// Since non-ascii encoding inherently does utf8 validation
// we explicitly validate utf8 only if non-ascii encoding is disabled.
if ((opts.validate_utf8 || opts.skip_invalid_utf8) &&
!opts.encode_non_ascii) {
// To achieve better spatial and temporal coherence
// we do utf8 validation progressively along with the
// string-escaping instead of two separate passes.
// As the encoding progresses, q will stay at or ahead of p.
CHECK_GE(q, p);
// As p catches up with q, move q forward.
if (q == p) {
// Calling utf8ToCodePoint has the side effect of
// checking that utf8 encodings are valid.
// NOTE(review): the call is given q itself and p is later set to q, so it
// presumably advances q past the sequence it decoded; how invalid input is
// reported when skip_invalid_utf8 is false is not visible here -- confirm
// against the helper's declaration.
char32_t v = utf8ToCodePoint(q, e, opts.skip_invalid_utf8);
if (opts.skip_invalid_utf8 && v == U'\ufffd') {
// Emit the UTF-8 encoding of U+FFFD (the u8 literal is cast to
// const char* for append) and resume from wherever q now points.
out.append(reinterpret_cast<const char*>(u8"\ufffd"));
p = q;
continue;
}
}
}
// Decide whether the byte at p must be written as a \uXXXX escape:
// always for bytes with the high bit set when encode_non_ascii is on.
auto encodeUnicode = opts.encode_non_ascii && (*p & 0x80);
// Written as a plain `if`; the commented-out keyword marks where
// `constexpr` would go.
if /* constexpr */ (EnableExtraAsciiEscapes) {
// Additionally escape bytes in [0x20, 0x80) whose bit is set in the bitmap:
// word index is (*p / 64), bit index is (*p % 64).
encodeUnicode = encodeUnicode ||
(*p >= 0x20 && *p < 0x80 &&
(opts.extra_ascii_to_escape_bitmap[*p / 64] &
(uint64_t(1) << (*p % 64))));
}
if (encodeUnicode) {
// note that this if condition captures utf8 chars
// with value > 127, so size > 1 byte (or they are whitelisted for
// Unicode encoding).
// NOTE: char32_t / char16_t are both unsigned.
// No explicit p++ in this branch: advancing p is left to utf8ToCodePoint
// (presumably it moves p past the decoded sequence -- confirm).
char32_t cp = utf8ToCodePoint(p, e, opts.skip_invalid_utf8);
// Appends one six-character escape: backslash, 'u', then the four lowercase
// hex digits of v, most significant nibble first.
auto writeHex = [&](char16_t v) {
char buf[] = "\\u\0\0\0\0";
buf[2] = hexDigit((v >> 12) & 0x0f);
buf[3] = hexDigit((v >> 8) & 0x0f);
buf[4] = hexDigit((v >> 4) & 0x0f);
buf[5] = hexDigit(v & 0x0f);
out.append(buf, 6);
};
// From the ECMA-404 The JSON Data Interchange Syntax 2nd Edition Dec 2017
if (cp < 0x10000u) {
// If the code point is in the Basic Multilingual Plane (U+0000 through
// U+FFFF), then it may be represented as a six-character sequence:
// a reverse solidus, followed by the lowercase letter u, followed by
// four hexadecimal digits that encode the code point.
writeHex(static_cast<char16_t>(cp));
} else {
// To escape a code point that is not in the Basic Multilingual Plane,
// the character may be represented as a twelve-character sequence,
// encoding the UTF-16 surrogate pair corresponding to the code point.
// High surrogate: 0xD800 plus the upper 10 bits of (cp - 0x10000);
// low surrogate: 0xDC00 plus the lower 10 bits.
writeHex(static_cast<char16_t>(
0xd800u + (((cp - 0x10000u) >> 10) & 0x3ffu)));
writeHex(static_cast<char16_t>(0xdc00u + ((cp - 0x10000u) & 0x3ffu)));
}
} else if (*p == '\\' || *p == '\"') {
// Backslash and double quote are written as a backslash followed by the
// character itself.
char buf[] = "\\\0";
buf[1] = char(*p++);
out.append(buf, 2);
} else if (*p <= 0x1f) {
// Control characters: use the two-character short escape where one is
// listed below, otherwise fall back to \u00XX.
switch (*p) {
// clang-format off
case '\b': out.append("\\b"); p++; break;
case '\f': out.append("\\f"); p++; break;
case '\n': out.append("\\n"); p++; break;
case '\r': out.append("\\r"); p++; break;
case '\t': out.append("\\t"); p++; break;
// clang-format on
default:
// Note that this if condition captures non readable chars
// with value < 32, so size = 1 byte (e.g control chars).
// The first two hex digits stay "00"; only the last two are filled in.
char buf[] = "\\u00\0\0";
buf[4] = hexDigit(uint8_t((*p & 0xf0) >> 4));
buf[5] = hexDigit(uint8_t(*p & 0xf));
out.append(buf, 6);
p++;
}
} else {
// Every remaining byte (including high-bit bytes when they are not being
// \u-encoded) is copied through unchanged.
out.push_back(char(*p++));
}
}
out.push_back('\"');
}