glean/rts/string.cpp (140 lines of code) (raw):

/* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. */ #include "glean/rts/binary.h" #include "glean/rts/error.h" #include "glean/rts/string.h" #include <cassert> #include <cstring> #include <folly/Memory.h> #include <glog/logging.h> #include <unicode/utf8.h> #include <unicode/uchar.h> namespace facebook { namespace glean { namespace rts { namespace { /// Iterate over chunks of a mangled string, delimited by NULs, and call /// Chunk for each chunk except the last and Last for the last one, passing /// a pointer one past the end of the chunk (including delimiters). Examples: /// /// mangled string calls /// /// abc\0\0 last(q) /// p q /// /// abc\0\1def\0\1gh\0\0 chunk(p), chunk(q), chunk(r), last(s) /// p q r s /// template<typename Chunk> FOLLY_ALWAYS_INLINE size_t untrustedChunks(folly::ByteRange range, Chunk&& chunk) { const unsigned char * const p = range.data(); const size_t size = range.size(); assert(p != nullptr); int i; for (i = 0; i < size && p[i] > 0 && p[i] < 0x80; ++i) {} if (i+1 < size && p[i] == 0 && p[i+1] == 0) { chunk(p, i); return i+2; } int k = 0; while (true) { UChar c; // NOTE: U8_NEXT returns c<0 on overlong (invalid) points so this doesn't // transcode (and we don't have to worry about, say, overlong \NUL). U8_NEXT(p, i, size, c); if (c == 0) { if (i < size) { switch (p[i]) { case 0: chunk(p+k, i-k-1); return i+1; case 1: chunk(p+k, i-k); ++i; k = i; break; default: rts::error("invalid NUL in mangled string"); } } else { rts::error("truncated terminator in mangled string"); } } else if (c < 0) { rts::error("invalid UTF-8 string"); } } } } size_t validateUntrustedString(folly::ByteRange range) { return untrustedChunks(range, [](auto, auto) {}); } size_t demangleUntrustedString(folly::ByteRange range, binary::Output& output) { return untrustedChunks( range, [&](auto p, auto n) { output.bytes(p, n); } ); } namespace { template<typename Chunk> FOLLY_ALWAYS_INLINE size_t trustedChunks(folly::ByteRange range, Chunk&& chunk) noexcept { const auto end = range.end(); auto p = range.begin(); while (true) { auto q = static_cast<const unsigned char *>(std::memchr(p, 0, end-p)); CHECK(q && q+1 < end); if (q[1] == 0) { chunk(p, q-p); return q - range.begin() + 2; } else { chunk(p, q-p+1); p = q+2; } } } } std::pair<size_t, size_t> skipTrustedString(folly::ByteRange range) noexcept { size_t nuls = 0; auto size = trustedChunks( range, [&](auto,auto) { ++nuls; } ); assert (nuls > 0); assert (size >= nuls*2); return std::make_pair(size, size - nuls - 1); } size_t demangleTrustedString(folly::ByteRange range, uint8_t *buffer) noexcept { auto out = buffer; trustedChunks( range, [&](auto p, auto n) { std::memcpy(out, p, n); out += n; } ); return out-buffer; } void mangleString(folly::ByteRange range, binary::Output& output) { if (!range.empty()) { auto p = range.begin(); while (auto q = static_cast<const unsigned char *>( std::memchr(p, 0, range.end()-p))) { ++q; output.put({p,q}); output.fixed<uint8_t>(1); p = q; } output.put({p, range.end()}); } const unsigned char terminator[2] = {0,0}; output.bytes(terminator, 2); } void toLowerTrustedString(folly::ByteRange range, binary::Output& output) { output.expect(range.size()); for (int32_t i = 0; i < range.size(); ) { unsigned char b = static_cast<unsigned char>(range[i]); if (b < 0x80) { // This will also ignore \0 and \1 in the mangled string if (b >= 'A' && b <= 'Z') { b = b - 'A' + 'a'; } output.fixed(b); i++; } else { UChar32 c; U8_NEXT_UNSAFE(range.data(), i, c); c = u_tolower(c); uint8_t buf[4]; auto j = 0; U8_APPEND_UNSAFE(buf, j, c); output.bytes(buf, j); } } } } } }