tools/hbc-attribute/hbc-attribute.cpp
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "llvh/ADT/SmallVector.h"
#include "llvh/ADT/StringRef.h"
#include "llvh/Support/CommandLine.h"
#include "llvh/Support/FileSystem.h"
#include "llvh/Support/InitLLVM.h"
#include "llvh/Support/MemoryBuffer.h"
#include "llvh/Support/PrettyStackTrace.h"
#include "llvh/Support/Signals.h"
#include "llvh/Support/raw_ostream.h"
#include "hermes/BCGen/HBC/BytecodeDisassembler.h"
#include "hermes/BCGen/HBC/SerializedLiteralGenerator.h"
#include "hermes/BCGen/HBC/StringKind.h"
#include "hermes/Public/Buffer.h"
#include "hermes/Support/JSONEmitter.h"
#include "hermes/Support/LEB128.h"
#include "hermes/Support/MemoryBuffer.h"
#include <algorithm>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
/*
* hbc-attribute attributes bundle size to each function. Additional tools
 * allow symbolicating and grouping the information into files. Data shared
 * entirely or partially between functions is correctly attributed.
*
* The basic principle is to emit a nested type, size and deduplication key
* for each type of data. For example:
*
* Function Type Key Size
* Foo data:string:entry 1 8
* Bar data:string:entry 1 8
* Bar data:string:entry 2 8
*
* The following is assumed to hold:
*
* 1. Records of different Type will not overlap.
* 2. Records of the same Type but different Key will not overlap.
* 3. Records of the same Type and same Key will completely overlap.
* 4. Records of the same Type and same Key will have the same Size.
*
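 * Example output line (JSONL, one object per function; the values below are
 * purely illustrative):
 *
 *   {"functionId": 5,
 *    "location": {"virtualOffset": 1024, "bytecodeSize": 96},
 *    "usage": [{"type": "bytecode:instructions", "dedupKey": 1024, "size": 96},
 *              ...]}
 *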
*/
using namespace hermes;
using namespace hermes::hbc;
using namespace hermes::inst;
using llvh::MutableArrayRef;
using llvh::raw_fd_ostream;
using SLG = hermes::hbc::SerializedLiteralGenerator;
/* This tool is highly dependent upon the current bytecode format.
*
* If you have added a simple instruction, make sure string parameters (if any)
* are marked with the OPERAND_STRING_ID macro and bump the version below.
*
* If you have added or modified sections, make sure they're counted properly.
*/
static llvh::cl::opt<std::string> InputFilename(
llvh::cl::Positional,
llvh::cl::desc("Input bundle"),
llvh::cl::init("-"));
static llvh::cl::opt<std::string>
OutputFilename("out", llvh::cl::desc("Output file"), llvh::cl::init("-"));
namespace {
template <typename T>
unsigned byteSize(llvh::ArrayRef<T> ref) {
return ref.size() * sizeof(T);
}
/// Walks the bytecode and outputs usage info.
class UsageCounter : public BytecodeVisitor {
protected:
JSONEmitter &emitter_;
llvh::DenseMap<unsigned, unsigned> virtualOffsets_;
uintptr_t bundleStart_;
unsigned currentFuncId_;
uintptr_t opcodeStart_;
uintptr_t opcodeEnd_;
uintptr_t functionEnd_;
llvh::DenseMap<std::pair<StringRef, unsigned>, unsigned> emitted_;
/// Indices into the bytecode's string table corresponding to the (exclusive)
/// end of each string kind entry.
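  /// For example, kinds [String x 3, Identifier x 2] produce {3, 5}, letting
  /// countStringKind() map a string index to its kind entry with a single
  /// upper_bound lookup.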
std::vector<uint32_t> stringKindEnds_;
void appendRecord(llvh::StringRef type, unsigned dedupKey, unsigned size) {
assert(size < (2 << 20) && "Abnormally large size!");
if (size == 0) {
// Don't bother counting anything of size 0.
return;
}
// Do one pass of deduplication while emitting. This cuts output in half.
std::pair<StringRef, unsigned> key = {type, dedupKey};
if (emitted_.count(key)) {
assert(emitted_[key] == size && "Expected deduped entry to be same size");
return;
}
emitted_[key] = size;
emitter_.openDict();
emitter_.emitKeyValue("type", type);
emitter_.emitKeyValue("dedupKey", dedupKey);
emitter_.emitKeyValue("size", size);
emitter_.closeDict();
}
  /// Emits whatever location data is available, which is often just the
  /// virtual offset.
void emitFunctionLocation() {
auto debugInfo = bcProvider_->getDebugInfo();
auto offsets = bcProvider_->getDebugOffsets(currentFuncId_);
emitter_.emitKey("location");
emitter_.openDict();
if (offsets && offsets->sourceLocations != DebugOffsets::NO_OFFSET) {
if (auto pos =
debugInfo->getLocationForAddress(offsets->sourceLocations, 0)) {
emitter_.emitKeyValue(
"file", debugInfo->getFilenameByID(pos->filenameId));
emitter_.emitKeyValue("line", pos->line);
emitter_.emitKeyValue("column", pos->column);
}
}
emitter_.emitKeyValue("virtualOffset", virtualOffsets_[currentFuncId_]);
emitter_.emitKeyValue(
"bytecodeSize",
bcProvider_->getFunctionHeader(currentFuncId_).bytecodeSizeInBytes());
emitter_.closeDict();
}
/// Emits per-bundle information.
void emitGlobalInfo() {
appendRecord("headers:global:bundle", 0, sizeof(BytecodeFileHeader));
appendRecord("headers:global:debuginfo", 0, sizeof(DebugInfoHeader));
// FIXME: Some padding is not included.
}
void beforeStart(unsigned funcId, const uint8_t *bytecodeStart) override {
currentFuncId_ = funcId;
emitted_.clear();
opcodeStart_ = (uintptr_t)bytecodeStart;
opcodeEnd_ = llvh::alignAddr(
bytecodeStart +
bcProvider_->getFunctionHeader(funcId).bytecodeSizeInBytes(),
sizeof(uint32_t));
functionEnd_ = opcodeEnd_;
emitter_.emitKeyValue("functionId", funcId);
emitFunctionLocation();
emitter_.emitKey("usage");
emitter_.openArray();
}
void countDebugInfo() {
// FIXME: Avoid reimplementing this logic.
auto *offsets = bcProvider_->getDebugOffsets(currentFuncId_);
if (!offsets)
return;
if (offsets->sourceLocations != DebugOffsets::NO_OFFSET) {
auto data = bcProvider_->getDebugInfo()->viewData().getData();
auto offset = offsets->sourceLocations;
int64_t n, trash;
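      // Skip the three LEB128-encoded fields at the start of this function's
      // source-location entry before walking the per-location records.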
for (int i = 0; i < 3; i++) {
offset += readSignedLEB128(data, offset, &n);
}
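      // Walk the variable-length delta records the same way the decoder does:
      // a leading delta of -1 terminates the stream, the second value's low
      // bit signals one extra field, and each record ends with a final delta.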
do {
offset += readSignedLEB128(data, offset, &n);
if (n == -1)
break;
offset += readSignedLEB128(data, offset, &n);
if (n & 1)
offset += readSignedLEB128(data, offset, &n);
offset += readSignedLEB128(data, offset, &trash);
} while (true);
appendRecord(
"debuginfo:sourcelocations",
offsets->sourceLocations,
offset - offsets->sourceLocations);
}
if (offsets->lexicalData &&
offsets->lexicalData != DebugOffsets::NO_OFFSET) {
auto data = bcProvider_->getDebugInfo()->viewData().getData();
unsigned start = offsets->lexicalData +
bcProvider_->getDebugInfo()->lexicalDataOffset();
unsigned offset = start;
int64_t trash;
// Read parent id
offset += readSignedLEB128(data, offset, &trash);
// Read variable count
int64_t count;
offset += readSignedLEB128(data, offset, &count);
// Read variables
for (int64_t i = 0; i < count; i++) {
int64_t stringLength;
offset += readSignedLEB128(data, offset, &stringLength);
offset += stringLength;
}
appendRecord(
"debuginfo:lexicaldata", offsets->lexicalData, offset - start);
}
}
void afterStart() override {
auto header = bcProvider_->getFunctionHeader(currentFuncId_);
auto exceptionTable = bcProvider_->getExceptionTable(currentFuncId_);
// We always have a small header, and sometimes a large one too.
appendRecord(
"headers:function:small", currentFuncId_, sizeof(SmallFuncHeader));
if (header.flags().overflowed) {
appendRecord(
"headers:function:large", currentFuncId_, sizeof(FunctionHeader));
}
countStringLiteral(header.functionName());
if (header.flags().hasExceptionHandler) {
// Exception tables are not deduplicated by function.
appendRecord(
"headers:exceptions",
currentFuncId_,
sizeof(ExceptionHandlerTableHeader));
appendRecord(
"bytecode:tables:exception",
currentFuncId_,
byteSize(exceptionTable));
}
appendRecord(
"bytecode:instructions", header.offset(), opcodeEnd_ - opcodeStart_);
appendRecord(
"bytecode:tables:jump", header.offset(), functionEnd_ - opcodeEnd_);
countDebugInfo();
// Assign global headers to the global function
if (bcProvider_->getGlobalFunctionIndex() == currentFuncId_) {
emitGlobalInfo();
}
emitter_.closeArray();
}
void countStringKind(unsigned stringIndex) {
// Map from string table index to kind index.
auto it = std::upper_bound(
stringKindEnds_.begin(), stringKindEnds_.end(), stringIndex);
assert(it != stringKindEnds_.end() && "String index out of range");
auto kindIndex = std::distance(stringKindEnds_.begin(), it);
appendRecord("data:string:kind", kindIndex, sizeof(StringKind::Entry));
StringKind::Kind kind = bcProvider_->getStringKinds()[kindIndex].kind();
if (kind != StringKind::String) {
      // Strings whose kind is not "String" are Identifiers and have a
      // translation field.
appendRecord(
"data:string:identifier_translation", stringIndex, sizeof(uint32_t));
}
}
void countStringLiteral(unsigned stringIndex) {
countStringKind(stringIndex);
auto entry = bcProvider_->getStringTableEntry(stringIndex);
auto wasLarge = SmallStringTableEntry(entry, 0).isOverflowed();
    // Like function headers, we have two kinds of string entries.
appendRecord(
"data:string:small_entry", stringIndex, sizeof(SmallStringTableEntry));
if (wasLarge) {
appendRecord(
"data:string:overflow_entry",
stringIndex,
sizeof(OverflowStringTableEntry));
}
auto offset = entry.getOffset();
auto length = entry.getLength() * (entry.isUTF16() ? 2 : 1);
// Emit each byte separately for deduplication to allow overlap.
// We could emit the entire length for non-overlapped strings, but those
// make up maybe 10% of them so there's little to gain.
for (unsigned i = 0; i < length; i++) {
appendRecord("data:string:chars", offset + i, 1);
}
}
void countRegex(uint32_t index) {
if (index == 0xFFFFFFFF)
return;
auto regex = bcProvider_->getRegExpTable()[index];
appendRecord("data:regex:entry", index, sizeof(RegExpTableEntry));
appendRecord("data:regex:bytecode", index, regex.length);
}
void countSerializedLiteral(
SLG::TagType tag,
const unsigned char *buff,
unsigned int *ind) {
// FIXME: Avoid duplicating this logic.
unsigned bundleOffset = (uintptr_t)(*ind + buff - bundleStart_);
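    // The literal's absolute offset within the bundle is the dedup key, so
    // buffer regions shared between functions line up across records.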
switch (tag) {
case SLG::ByteStringTag: {
uint8_t val = llvh::support::endian::read<uint8_t, 1>(
buff + *ind, llvh::support::endianness::little);
appendRecord("data:literalbuffer:bytestring", bundleOffset, 1);
countStringLiteral(val);
*ind += 1;
} break;
case SLG::ShortStringTag: {
uint16_t val = llvh::support::endian::read<uint16_t, 1>(
buff + *ind, llvh::support::endianness::little);
appendRecord("data:literalbuffer:shortstring", bundleOffset, 2);
countStringLiteral(val);
*ind += 2;
} break;
case SLG::LongStringTag: {
uint32_t val = llvh::support::endian::read<uint32_t, 1>(
buff + *ind, llvh::support::endianness::little);
appendRecord("data:literalbuffer:longstring", bundleOffset, 4);
countStringLiteral(val);
*ind += 4;
} break;
case SLG::NumberTag: {
appendRecord("data:literalbuffer:double", bundleOffset, 8);
*ind += 8;
} break;
case SLG::IntegerTag: {
appendRecord("data:literalbuffer:int", bundleOffset, 4);
*ind += 4;
} break;
case SLG::NullTag:
case SLG::TrueTag:
case SLG::FalseTag:
break;
}
}
void countSerializedLiterals(
llvh::ArrayRef<unsigned char> array,
unsigned offset,
unsigned count) {
const unsigned char *ptr = array.data();
unsigned keyInd = offset;
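    // Each run in the buffer is a tag (one byte, or two when the run length
    // exceeds 0x0f) followed by that many literals of the same type.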
while (count > 0) {
std::pair<int, SLG::TagType> keyTag = checkBufferTag(ptr + keyInd);
auto tagLength = (keyTag.first > 0x0f ? 2 : 1);
// This could conceivably overlap if the tag+data fits in the middle of
// an existing 'double' literal, but we'll assume it's uncommon enough.
appendRecord(
"data:literalbuffer:tag",
(unsigned)(uintptr_t)(keyInd + ptr - bundleStart_),
tagLength);
keyInd += tagLength;
for (int i = 0; i < keyTag.first && count; i++) {
countSerializedLiteral(keyTag.second, ptr, &keyInd);
count--;
}
}
}
void visitSwitchImm(const inst::Inst *inst) {
assert(inst->opCode == inst::OpCode::SwitchImm);
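    // SwitchImm stores its jump table past the end of the opcode stream,
    // aligned to 4 bytes; extend functionEnd_ so those bytes are attributed
    // as "bytecode:tables:jump" instead of going uncounted.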
const auto *curJmpTableView =
reinterpret_cast<const uint32_t *>(llvh::alignAddr(
(const uint8_t *)inst + inst->iSwitchImm.op2, sizeof(uint32_t)));
unsigned start = inst->iSwitchImm.op4;
unsigned end = inst->iSwitchImm.op5;
assert(start < end && "Jump table spans negative range");
unsigned count = end - start + 1;
uintptr_t newEnd = (uintptr_t)&curJmpTableView[count];
if (newEnd > functionEnd_)
functionEnd_ = newEnd;
}
void preVisitInstruction(inst::OpCode opcode, const uint8_t *ip, int length)
override {
auto inst = (inst::Inst const *)ip;
// Count all strings
#define OPERAND_STRING_ID(OP, N) \
if (opcode == OpCode::OP) { \
countStringLiteral(inst->i##OP.op##N); \
}
#include "hermes/BCGen/HBC/BytecodeList.def"
// Count non-string misc references
switch (opcode) {
case OpCode::SwitchImm:
visitSwitchImm(inst);
break;
case OpCode::NewObjectWithBuffer:
countSerializedLiterals(
bcProvider_->getObjectKeyBuffer(),
inst->iNewObjectWithBuffer.op4,
inst->iNewObjectWithBuffer.op3);
countSerializedLiterals(
bcProvider_->getObjectValueBuffer(),
inst->iNewObjectWithBuffer.op5,
inst->iNewObjectWithBuffer.op3);
break;
case OpCode::NewObjectWithBufferLong:
countSerializedLiterals(
bcProvider_->getObjectKeyBuffer(),
inst->iNewObjectWithBufferLong.op4,
inst->iNewObjectWithBufferLong.op3);
countSerializedLiterals(
bcProvider_->getObjectValueBuffer(),
inst->iNewObjectWithBufferLong.op5,
inst->iNewObjectWithBufferLong.op3);
break;
case OpCode::NewArrayWithBuffer:
countSerializedLiterals(
bcProvider_->getArrayBuffer(),
inst->iNewArrayWithBuffer.op4,
inst->iNewArrayWithBuffer.op3);
break;
case OpCode::NewArrayWithBufferLong:
countSerializedLiterals(
bcProvider_->getArrayBuffer(),
inst->iNewArrayWithBufferLong.op4,
inst->iNewArrayWithBufferLong.op3);
break;
      case OpCode::CreateRegExp:
        countRegex(inst->iCreateRegExp.op4);
        break;
default:
break;
}
}
public:
UsageCounter(
std::shared_ptr<BCProvider> bc,
JSONEmitter &emitter,
llvh::DenseMap<unsigned, unsigned> offsets,
uintptr_t bundleStart)
: BytecodeVisitor(bc),
emitter_(emitter),
virtualOffsets_(offsets),
bundleStart_(bundleStart) {
unsigned end = 0;
for (auto entry : bc->getStringKinds()) {
end += entry.count();
stringKindEnds_.push_back(end);
}
}
};
// Getting all virtual offsets is O(N^2) unless we do them in a single pass.
llvh::DenseMap<unsigned, unsigned> getVirtualOffsets(
std::shared_ptr<BCProvider> bc) {
llvh::DenseMap<unsigned, unsigned> map(bc->getFunctionCount());
unsigned virtualOffset = 0;
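  // A function's virtual offset is the running total of the bytecode sizes of
  // all functions with a smaller function id.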
for (unsigned i = 0, e = bc->getFunctionCount(); i < e; i++) {
auto header = bc->getFunctionHeader(i);
map[i] = virtualOffset;
virtualOffset += header.bytecodeSizeInBytes();
}
return map;
}
bool attribute(
std::unique_ptr<llvh::MemoryBuffer> input,
JSONEmitter &emitter) {
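  // The bytecode provider reads the buffer in place, so it must already be
  // aligned to a 4-byte boundary.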
assert(
llvh::alignAddr(input->getBufferStart(), sizeof(uint32_t)) ==
(uintptr_t)input->getBufferStart());
uintptr_t bundleStart = (uintptr_t)input->getBuffer().data();
auto hermesBuffer = std::make_unique<hermes::MemoryBuffer>(input.get());
auto ret = hbc::BCProviderFromBuffer::createBCProviderFromBuffer(
std::move(hermesBuffer));
if (!ret.first) {
llvh::errs() << "Can't deserialize: " << ret.second << "\n";
return false;
}
std::shared_ptr<BCProvider> bc = std::move(ret.first);
// TODO: Add records for the bytecode header and similar.
UsageCounter counter(bc, emitter, getVirtualOffsets(bc), bundleStart);
for (int i = 0, e = bc->getFunctionCount(); i < e; i++) {
emitter.openDict();
counter.visitInstructionsInFunction(i);
emitter.closeDict();
emitter.endJSONL();
}
return true;
}
} // namespace
int main(int argc, char **argv) {
// Normalize the arg vector.
llvh::InitLLVM initLLVM(argc, argv);
llvh::sys::PrintStackTraceOnErrorSignal("hbc-attribute");
llvh::PrettyStackTraceProgram X(argc, argv);
llvh::llvm_shutdown_obj Y;
llvh::cl::ParseCommandLineOptions(
argc, argv, "Hermes bytecode size attribution tool\n");
llvh::ErrorOr<std::unique_ptr<llvh::MemoryBuffer>> fileBufOrErr =
llvh::MemoryBuffer::getFileOrSTDIN(InputFilename);
if (!fileBufOrErr) {
llvh::errs() << "Error: fail to open file: " << InputFilename << ": "
<< fileBufOrErr.getError().message() << "\n";
return 1;
}
llvh::Optional<raw_fd_ostream> fileOS;
if (!OutputFilename.empty()) {
std::error_code EC;
fileOS.emplace(OutputFilename.data(), EC, llvh::sys::fs::F_Text);
if (EC) {
llvh::errs() << "Error: fail to open file " << OutputFilename << ": "
<< EC.message() << '\n';
return 1;
}
}
auto &output = fileOS ? *fileOS : llvh::outs();
JSONEmitter emitter(output);
if (!attribute(std::move(fileBufOrErr.get()), emitter)) {
return 3;
}
output.flush();
return 0;
}