glean/rts/nat.h (85 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <folly/Bits.h>
#include <folly/CppAttributes.h>
#include <folly/Likely.h>
#include <folly/Range.h>
namespace facebook {
namespace glean {
namespace rts {
// Packed natural numbers
//
// This module provides a packed representation of unsigned 64-bit numbers
// with the following properties (in order of importance).
//
// - Packed numbers compare lexicographically (i.e., memcmp can be used to
// to compare two numbers).
// - As a collorary, every 64-bit number has exacly one packed representation
// (some other compact encodings don't necessarily guarantee this).
// - They are very compact (max. 9 bytes for a 64-bit number compared to max
// 10 for varints).
// - They are quite fast to encode/decode.
//
// This encoding is a variant of prefix varints, albeit not one I've seen
// before. The first byte of an encoded number indicates how many bytes the
// number takes up and contains some data.
//
// First byte Bytes Bits Range
// 0nnnnnnn 1 7 0 - 0x7F
// 10nnnnnn 2 14 0x80 - 0x407F
// 110nnnnn 3 21 0x4080 - 0x20407F
// 1110nnnn 4 28 0x204080 - 0x1020407F
// 11110nnn 5 35 0x10204080 - 0x081020407F
// 111110nn 6 42 0x0810204080 - 0x04081020407F
// 1111110n 7 49 0x040810204080 - 0x0204081020407F
// 11111110 8 56 0x02040810204080 - 0x010204081020407F
// 11111111 9 64 0x010204081020407F - 0xFFFFFFFFFFFFFFFF
//
// Numbers are stored in big endian format. Here is how 0x123456 is stored:
//
// 11010001 11110011 11010110
// +--+----------------------+
// | |
// | +- 21 significant bits storing 0x11F3D6, i.e.,
// | 0x123456 - 0x4080 (which is the start of the range)
// |
// +- prefix indicates 3 byte representation
//
// Subtracting the start of the range from the number rather than just storing
// the bit pattern as is has two advantages. Firstly, this ensures that every
// number has exactly one packed representation without us having to do any
// checks. There is one exception: for 9 bytes encodings, the bit patterns
// above 0xFEFDFBF7EFDFBF7F are invalid - but this is an easy check which will
// will almost never be done because the vast majority of numbers are encoded
// in fewer than 9 bytes. Secondly, this is slightly more compact.
/// Compute the total number of bytes used to encode a number from the first
/// byte of the encoding.
inline size_t natSize(unsigned char b0) {
static constexpr unsigned char natSizes[] = {
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
1, 1, 1, 1, 1, 1, 1, 1, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
2, 2, 2, 2, 2, 2, 2, 2, //
3, 3, 3, 3, 3, 3, 3, 3, //
3, 3, 3, 3, 3, 3, 3, 3, //
3, 3, 3, 3, 3, 3, 3, 3, //
3, 3, 3, 3, 3, 3, 3, 3, //
4, 4, 4, 4, 4, 4, 4, 4, //
4, 4, 4, 4, 4, 4, 4, 4, //
5, 5, 5, 5, 5, 5, 5, 5, //
6, 6, 6, 6, 7, 7, 8, 9, //
};
// NOTE: For smaller numbers, this benchmarks as significantly faster than
// just
// return natSizes[b0]
// I'm actually not entirely sure why.
if (b0 < 0x80) {
return 1;
} else if (b0 < 0xC0) {
return 2;
} else if (b0 < 0xE0) {
return 3;
} else if (b0 < 0xF0) {
return 4;
} else if (b0 < 0xF8) {
return 5;
} else {
return natSizes[b0];
}
}
/// Skip over an encoded nat but check it for validity. Return a pointer to the
/// first byte after the nat or NULL if the encoding is invalid or the buffer
/// is too short.
const unsigned char* FOLLY_NULLABLE skipUntrustedNat(
const unsigned char* p, // pointer to first byte
const unsigned char* e // pointer to end of buffer
);
/// Skip over an encoded nat without any checks. Return a pointer to the first
/// byte after the nat.
inline const unsigned char* skipTrustedNat(const unsigned char* p) {
return p + natSize(*p);
}
/// Check and decode an encoded nat and return the decoded number and a pointer
/// to the first byte after the nat. The pointer will be NULL is the number is
/// invalid of the buffer is too short.
std::pair<uint64_t, const unsigned char * FOLLY_NULLABLE> loadUntrustedNat(
const unsigned char* p, // pointer to first byte
const unsigned char* e // pointer to end of buffer
);
/// Decode an encoded nat without any checks. Return the decoded number and a
/// pointer to the first byte after the nat.
std::pair<uint64_t, const unsigned char*> loadTrustedNat(const unsigned char*);
/// Max number of bytes that an encoded nat can take up.
inline constexpr size_t MAX_NAT_SIZE = 9;
/// Encode and store a number in the buffer and return the number of bytes
/// taken up by the encoding. This assumes that the buffer has enough space.
size_t storeNat(unsigned char* out, uint64_t val);
/// A stack-allocated encoded nat
struct EncodedNat {
explicit EncodedNat(uint64_t val) {
encoded_size = storeNat(buf, val);
}
folly::ByteRange byteRange() {
return folly::ByteRange(buf, buf + encoded_size);
}
private:
uint8_t buf[rts::MAX_NAT_SIZE];
size_t encoded_size;
};
} // namespace rts
} // namespace glean
} // namespace facebook