include/hermes/BCGen/HBC/BytecodeFileFormat.h

/* * Copyright (c) Meta Platforms, Inc. and affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #ifndef HERMES_BCGEN_HBC_BYTECODEFILEFORMAT_H #define HERMES_BCGEN_HBC_BYTECODEFILEFORMAT_H #include "hermes/BCGen/HBC/BytecodeVersion.h" #include "hermes/BCGen/HBC/StringKind.h" #include "hermes/Support/Compiler.h" #include "hermes/Support/RegExpSerialization.h" #include "hermes/Support/SHA1.h" #include "hermes/Support/StringTableEntry.h" #include <cassert> #include <cstdint> #include <cstring> namespace hermes { namespace hbc { // "Hermes" in ancient Greek encoded in UTF-16BE and truncated to 8 bytes. const static uint64_t MAGIC = 0x1F1903C103BC1FC6; // The "delta prepped" form: a different magic number indicating that the // bytecode file is in a form suitable for delta diffing, not execution. const static uint64_t DELTA_MAGIC = ~MAGIC; /// Property cache index which indicates no caching. static constexpr uint8_t PROPERTY_CACHING_DISABLED = 0; /// Alignment of data structures of in file. static constexpr size_t BYTECODE_ALIGNMENT = alignof(uint32_t); /// Bytecode forms enum class BytecodeForm { /// Execution form (the default) is the bytecode prepared for execution. Execution, /// Delta form is the bytecode prepared to minimize binary diff size. Delta, }; /// Storing information about the bytecode, needed when it is loaded by the /// runtime. union BytecodeOptions { struct { bool staticBuiltins : 1; bool cjsModulesStaticallyResolved : 1; bool hasAsync : 1; }; uint8_t _flags; BytecodeOptions() : _flags(0) {} }; // See BytecodeFileFormatTest for details about bit field layouts static_assert( sizeof(BytecodeOptions) == 1, "BytecodeOptions should take up 1 byte total"); /** * Header of binary file. */ LLVM_PACKED_START struct BytecodeFileHeader { uint64_t magic; uint32_t version; uint8_t sourceHash[SHA1_NUM_BYTES]; uint32_t fileLength; // Until the end of the BytecodeFileFooter. uint32_t globalCodeIndex; uint32_t functionCount; uint32_t stringKindCount; // Number of string kind entries. uint32_t identifierCount; // Number of strings which are identifiers. uint32_t stringCount; // Number of strings in the string table. uint32_t overflowStringCount; // Number of strings in the overflow table. uint32_t stringStorageSize; // Bytes in the blob of string contents. uint32_t regExpCount; uint32_t regExpStorageSize; uint32_t arrayBufferSize; uint32_t objKeyBufferSize; uint32_t objValueBufferSize; uint32_t segmentID; // The ID of this segment. uint32_t cjsModuleCount; // Number of modules. uint32_t functionSourceCount; // Number of function sources preserved. uint32_t debugInfoOffset; BytecodeOptions options; // Insert any padding to make function headers that follow this file header // less likely to cross cache lines. uint8_t padding[27]; BytecodeFileHeader( uint64_t magic, uint32_t version, const SHA1 &sourceHash, uint32_t fileLength, uint32_t globalCodeIndex, uint32_t functionCount, uint32_t stringKindCount, uint32_t identifierCount, uint32_t stringCount, uint32_t overflowStringCount, uint32_t stringStorageSize, uint32_t regExpCount, uint32_t regExpStorageSize, uint32_t arrayBufferSize, uint32_t objKeyBufferSize, uint32_t objValueBufferSize, uint32_t segmentID, uint32_t cjsModuleCount, uint32_t functionSourceCount, uint32_t debugInfoOffset, BytecodeOptions options) : magic(magic), version(version), sourceHash(), fileLength(fileLength), globalCodeIndex(globalCodeIndex), functionCount(functionCount), stringKindCount(stringKindCount), identifierCount(identifierCount), stringCount(stringCount), overflowStringCount(overflowStringCount), stringStorageSize(stringStorageSize), regExpCount(regExpCount), regExpStorageSize(regExpStorageSize), arrayBufferSize(arrayBufferSize), objKeyBufferSize(objKeyBufferSize), objValueBufferSize(objValueBufferSize), segmentID(segmentID), cjsModuleCount(cjsModuleCount), functionSourceCount(functionSourceCount), debugInfoOffset(debugInfoOffset), options(options) { std::copy(sourceHash.begin(), sourceHash.end(), this->sourceHash); std::fill(padding, padding + sizeof(padding), 0); } }; /** * Footer of binary file. Used for summary information that is *not* * read during normal execution (since that would hurt locality). */ struct BytecodeFileFooter { uint8_t fileHash[SHA1_NUM_BYTES]; // Hash of everything above the footer. // NOTE: If we ever add any non-byte fields, we need to ensure alignment // everywhere this struct is written. BytecodeFileFooter(const SHA1 &fileHash) { std::copy(fileHash.begin(), fileHash.end(), this->fileHash); } }; /// The string table is an array of these entries, followed by an array of /// OverflowStringTableEntry for the entries whose length or offset doesn't fit /// into the bitfields. struct SmallStringTableEntry { // isUTF16 and isIdentifier cannot be bool because C++ spec allows padding // at type boundaries. // Regardless of LLVM_PACKED_START, // * GCC and CLANG never adds padding at type boundaries. // * MSVC always add padding at type boundaries. // * In addition, in MSVC, for each list of continuous fields with the same // types, they always occupy a multiple of the type's normal size. uint32_t isUTF16 : 1; uint32_t offset : 23; uint32_t length : 8; static constexpr uint32_t INVALID_OFFSET = (1 << 23); static constexpr uint32_t INVALID_LENGTH = (1 << 8) - 1; bool isOverflowed() const { return length == INVALID_LENGTH; } /// Construct a small entry from 'entry'. If any fields overflow, then set /// 'overflowOffset' as the offset instead. SmallStringTableEntry( const StringTableEntry &entry, uint32_t overflowOffset) { isUTF16 = entry.isUTF16(); if (entry.getOffset() < INVALID_OFFSET && entry.getLength() < INVALID_LENGTH) { offset = entry.getOffset(); length = entry.getLength(); } else { assert(overflowOffset < INVALID_OFFSET); offset = overflowOffset; length = INVALID_LENGTH; } } }; // See BytecodeFileFormatTest for details about bit field layouts static_assert( sizeof(SmallStringTableEntry) == 4, "SmallStringTableEntry should take up 4 bytes total"); /// These are indexed by the 'offset' field of overflowed SmallStringTableEntry. struct OverflowStringTableEntry { uint32_t offset; uint32_t length; OverflowStringTableEntry(uint32_t offset, uint32_t length) : offset(offset), length(length) {} }; union FunctionHeaderFlag { enum { ProhibitCall = 0, ProhibitConstruct = 1, ProhibitNone = 2, }; struct { /// Which kinds of calls are prohibited, constructed from the above enum. uint8_t prohibitInvoke : 2; bool strictMode : 1; bool hasExceptionHandler : 1; bool hasDebugInfo : 1; bool overflowed : 1; }; uint8_t flags; FunctionHeaderFlag() { flags = 0; prohibitInvoke = ProhibitNone; } /// \return true if the specified kind of invocation is prohibited by the /// flags. bool isCallProhibited(bool construct) const { return prohibitInvoke == (uint8_t)construct; } }; // See BytecodeFileFormatTest for details about bit field layouts static_assert( sizeof(FunctionHeaderFlag) == 1, "FunctionHeaderFlag should take up 1 byte total"); /// FUNC_HEADER_FIELDS is a macro for defining function header fields. /// The args are API type, small storage type, name, and bit length. /// The types can be different if the overflow supports a longer value than the /// small storage type does. #define FUNC_HEADER_FIELDS(V) \ /* first word */ \ V(uint32_t, uint32_t, offset, 25) \ V(uint32_t, uint32_t, paramCount, 7) \ /* second word */ \ V(uint32_t, uint32_t, bytecodeSizeInBytes, 15) \ V(uint32_t, uint32_t, functionName, 17) \ /* third word */ \ V(uint32_t, uint32_t, infoOffset, 25) \ V(uint32_t, uint32_t, frameSize, 7) \ /* fourth word, with flags below */ \ V(uint32_t, uint8_t, environmentSize, 8) \ V(uint8_t, uint8_t, highestReadCacheIndex, 8) \ V(uint8_t, uint8_t, highestWriteCacheIndex, 8) /** * Metadata of a function. */ struct FunctionHeader { // Use api type here since FunctionHeader stores the full type. #define DECLARE_FIELD(api_type, store_type, name, bits) api_type name; FUNC_HEADER_FIELDS(DECLARE_FIELD) #undef DECLARE_FIELD FunctionHeaderFlag flags{}; public: FunctionHeader( uint32_t size, uint32_t paramCount, uint32_t frameSize, uint32_t envSize, uint32_t functionNameID, uint8_t hiRCacheIndex, uint8_t hiWCacheIndex) : offset(0), paramCount(paramCount), bytecodeSizeInBytes(size), functionName(functionNameID), infoOffset(0), frameSize(frameSize), environmentSize(envSize), highestReadCacheIndex(hiRCacheIndex), highestWriteCacheIndex(hiWCacheIndex) {} }; /// Compact version of FunctionHeader. Fits most functions. /// Has two possible states, indicated by 'overflowed' flag: /// !overflowed: all fields are valid. /// overflowed: only flags and getLargeHeaderOffset() are valid, /// and at the latter is a FunctionHeader. /// Note that msvc and compatible compilers will not put bitfields /// of the same type in the same memory, so don't mix uint8_t and /// uint32_t if you want them packed next to each other. struct SmallFuncHeader { // Use the store_type since SmallFuncHeader attempts to minimize storage. #define DECLARE_BITFIELD(api_type, store_type, name, bits) \ store_type name : bits; FUNC_HEADER_FIELDS(DECLARE_BITFIELD) #undef DECLARE_BITFIELD FunctionHeaderFlag flags{}; /// Make a small header equivalent to 'large' if all values fit, /// else set overflowed with large.infoOffset as large's offset. SmallFuncHeader(const FunctionHeader &large) { std::memset(this, 0, sizeof(SmallFuncHeader)); // Avoid leaking junk. flags = large.flags; #define CHECK_COPY_FIELD(api_type, store_type, name, bits) \ if (large.name > (1U << bits) - 1) { \ setLargeHeaderOffset(large.infoOffset); \ return; \ } \ name = large.name; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wtype-limits" FUNC_HEADER_FIELDS(CHECK_COPY_FIELD) #pragma GCC diagnostic pop #undef CHECK_COPY_FIELD assert(!flags.overflowed); } void setLargeHeaderOffset(uint32_t largeHeaderOffset) { flags.overflowed = true; // Can use any fields to store the large offset; pick two big ones. offset = largeHeaderOffset & 0xffff; infoOffset = largeHeaderOffset >> 16; } uint32_t getLargeHeaderOffset() const { assert(flags.overflowed); return (infoOffset << 16) | offset; } }; // Sizes of file and function headers are tuned for good cache line packing. // If you change their size, try to avoid headers crossing cache lines. static_assert( sizeof(BytecodeFileHeader) % 32 == 0, "BytecodeFileHeader size should be cache friendly"); static_assert( 32 % sizeof(SmallFuncHeader) == 0, "SmallFuncHeader size should be cache friendly"); struct ExceptionHandlerTableHeader { uint32_t count; }; /// We need HBCExceptionHandlerInfo other than using ExceptionHandlerInfo /// directly because we don't need depth in HBC. struct HBCExceptionHandlerInfo { uint32_t start; uint32_t end; uint32_t target; }; // The size of the file table and debug data. struct DebugInfoHeader { // Number of filenames stored in the table. uint32_t filenameCount; // Bytes in the filename storage contents. uint32_t filenameStorageSize; // Count of the file table. uint32_t fileRegionCount; // Byte offset in the debug data for the lexical data. uint32_t lexicalDataOffset; // Size in bytes of the debug data. uint32_t debugDataSize; }; // The string id of files for given offsets in debug info. struct DebugFileRegion { uint32_t fromAddress; uint32_t filenameId; uint32_t sourceMappingUrlId; }; LLVM_PACKED_END /// Visit each segment in a bytecode file in order. /// This function defines the order of the bytecode file segments. template <typename Visitor> void visitBytecodeSegmentsInOrder(Visitor &visitor) { visitor.visitFunctionHeaders(); visitor.visitStringKinds(); visitor.visitIdentifierHashes(); visitor.visitSmallStringTable(); visitor.visitOverflowStringTable(); visitor.visitStringStorage(); visitor.visitArrayBuffer(); visitor.visitObjectKeyBuffer(); visitor.visitObjectValueBuffer(); visitor.visitRegExpTable(); visitor.visitRegExpStorage(); visitor.visitCJSModuleTable(); visitor.visitFunctionSourceTable(); } /// BytecodeFileFields represents direct byte-level access to the structured /// fields of a bytecode file, providing pointers and ArrayRefs referencing /// directly into the buffer. Note some portions of the bytecode file are less /// structured, such as the function info section; these are not exposed here. /// Most clients will want to use an immutable BytecodeFileFields, which may be /// initialized from a read-only buffer. Tools that want to modify the fields /// in-place may initialize with Mutable=true. template <bool Mutable> struct BytecodeFileFields { template <typename T> using Pointer = typename std::conditional<Mutable, T *, const T *>::type; template <typename T> using Array = typename std:: conditional<Mutable, llvh::MutableArrayRef<T>, llvh::ArrayRef<T>>::type; /// The file header. Pointer<BytecodeFileHeader> header{nullptr}; /// List of function headers. Some of these may be overflow headers. Array<hbc::SmallFuncHeader> functionHeaders; /// The list of short string table entries. Array<hbc::SmallStringTableEntry> stringTableEntries{}; /// Run-length encoding representing the kinds of strings in the table. Array<StringKind::Entry> stringKinds{}; /// The list of identifier hashes. Array<uint32_t> identifierHashes{}; /// The list of overflowed string table entries. Array<hbc::OverflowStringTableEntry> stringTableOverflowEntries{}; /// The character buffer used for string storage. Array<uint8_t> stringStorage; /// Buffer for array literals. Array<uint8_t> arrayBuffer; /// Buffer for object keys. Array<uint8_t> objKeyBuffer; /// Buffer for object values. Array<uint8_t> objValueBuffer; /// List of regexp literals. Array<RegExpTableEntry> regExpTable; /// Storage for regexp bytecode. Array<uint8_t> regExpStorage; /// List of CJS modules. Array<std::pair<uint32_t, uint32_t>> cjsModuleTable; /// List of resolved CJS modules. Array<std::pair<uint32_t, uint32_t>> cjsModuleTableStatic; /// List of function source table entries. Array<std::pair<uint32_t, uint32_t>> functionSourceTable; /// Populate bytecode file fields from a buffer. The fields will point /// directly into the buffer and it is the caller's responsibility to ensure /// the result does not outlive the buffer. /// \p form contains the expected bytecode form (Execution or Delta). /// \return true on success, false on /// failure, in which case an error is returned by reference. bool populateFromBuffer( Array<uint8_t> bytes, std::string *outError, BytecodeForm form = BytecodeForm::Execution); }; using ConstBytecodeFileFields = BytecodeFileFields<false>; using MutableBytecodeFileFields = BytecodeFileFields<true>; } // namespace hbc } // namespace hermes #endif // HERMES_BCGEN_HBC_BYTECODEFILEFORMAT_H

include/hermes/BCGen/HBC/BytecodeFileFormat.h (313 lines of code) (raw):