lib/Support/RegExpSerialization.cpp (320 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "hermes/Support/RegExpSerialization.h"
#include "hermes/Regex/Regex.h"
#include "hermes/Regex/RegexTraits.h"
#include "hermes/Support/Algorithms.h"
#include "hermes/Support/UTF8.h"
#include "llvh/Support/Format.h"
#include "llvh/Support/raw_ostream.h"
#include <cctype>
using llvh::StringRef;
using namespace hermes;
/// Support for dumping regex bytecode.
namespace {
/// Base implementation of dumpInstruction() which just outputs the opcode.
/// Overloads may be used to specialize this.
void dumpInstruction(const regex::Insn *insn, llvh::raw_ostream &OS) {
/// Dump the instruction stream to a raw_ostream \p OS.
/// The default implementation of dump() outputs the opcode name only.
/// Note this is NOT a virtual function: we can't store a vtable since
/// instructions must be directly serializable.
#define REOP(Code) \
case regex::Opcode::Code: \
OS << #Code; \
break;
switch (insn->opcode) {
#include "hermes/Regex/RegexOpcodes.def"
}
}
/// aligner can take a value and force it to the alignment for its type, even if
/// the value came from a packed struct in memory.
template <typename T>
T aligner(T v) {
return v;
}
/// Helpers to compute the width of an instruction. All instructions are
/// fixed-width, except for brackets, MatchNChar8Insn, and MatchNCharICase8Insn.
template <typename Insn>
uint32_t instructionWidth(const Insn *insn) {
return sizeof *insn;
}
template <>
uint32_t instructionWidth<regex::BracketInsn>(const regex::BracketInsn *insn) {
return insn->totalWidth();
}
template <>
uint32_t instructionWidth<regex::U16BracketInsn>(
const regex::U16BracketInsn *insn) {
return insn->totalWidth();
}
template <>
uint32_t instructionWidth<regex::MatchNChar8Insn>(
const regex::MatchNChar8Insn *insn) {
return insn->totalWidth();
}
template <>
uint32_t instructionWidth<regex::MatchNCharICase8Insn>(
const regex::MatchNCharICase8Insn *insn) {
return insn->totalWidth();
}
void dumpInstruction(const regex::MatchChar8Insn *insn, llvh::raw_ostream &OS) {
OS << "MatchChar8: ";
char c = insn->c;
if (std::isprint(c))
OS << llvh::format("'%c'", c);
else
OS << llvh::format_hex(c, 4);
}
void dumpInstruction(
const regex::MatchChar16Insn *insn,
llvh::raw_ostream &OS) {
OS << "MatchChar16: ";
char32_t c = insn->c;
OS << llvh::format_hex(c, 4);
}
void dumpInstruction(
const regex::U16MatchChar32Insn *insn,
llvh::raw_ostream &OS) {
OS << "U16MatchChar32: ";
uint32_t c = insn->c;
OS << llvh::format_hex(c, 6);
}
void dumpInstruction(
const regex::MatchNChar8Insn *insn,
llvh::raw_ostream &OS) {
OS << "MatchNChar8: '";
const char *cPtr = reinterpret_cast<const char *>(insn + sizeof(regex::Insn));
for (uint32_t i = 0; i < insn->charCount; i++) {
char c = *cPtr;
if (std::isprint(c)) {
OS << llvh::format("%c", c);
} else {
OS << llvh::format_hex(c, 4);
}
cPtr++;
}
OS << "'";
}
void dumpInstruction(
const regex::MatchNCharICase8Insn *insn,
llvh::raw_ostream &OS) {
OS << "MatchNCharICase8: '";
const char *cPtr = reinterpret_cast<const char *>(insn + sizeof(regex::Insn));
for (uint32_t i = 0; i < insn->charCount; i++) {
char c = *cPtr;
if (std::isprint(c)) {
OS << llvh::format("%c", c);
} else {
OS << llvh::format_hex(c, 4);
}
cPtr++;
}
OS << "'";
}
void dumpInstruction(
const regex::MatchCharICase8Insn *insn,
llvh::raw_ostream &OS) {
OS << "MatchCharICase8: ";
char c = insn->c;
if (std::isprint(c))
OS << llvh::format("'%c'", c);
else
OS << llvh::format_hex(c, 4);
}
void dumpInstruction(
const regex::MatchCharICase16Insn *insn,
llvh::raw_ostream &OS) {
OS << "MatchCharICase16: ";
char32_t c = insn->c;
OS << llvh::format_hex(c, 4);
}
void dumpInstruction(
const regex::U16MatchCharICase32Insn *insn,
llvh::raw_ostream &OS) {
OS << "U16MatchCharICase32: ";
uint32_t c = insn->c;
OS << llvh::format_hex(c, 6);
}
void dumpInstruction(const regex::Jump32Insn *insn, llvh::raw_ostream &OS) {
OS << "Jump32: " << llvh::format_hex(insn->target, 4);
}
void dumpInstruction(
const regex::AlternationInsn *insn,
llvh::raw_ostream &OS) {
OS << "Alternation: Target " << llvh::format_hex(insn->secondaryBranch, 4)
<< ", constraints " << unsigned(insn->primaryConstraints) << ","
<< unsigned(insn->secondaryConstraints);
}
void dumpInstruction(const regex::BracketInsn *insn, llvh::raw_ostream &OS) {
using namespace regex;
OS << (insn->opcode == Opcode::U16Bracket ? "U16Bracket" : "Bracket")
<< ": [";
if (insn->negate)
OS << '^';
if (insn->positiveCharClasses & CharacterClass::Digits)
OS << "\\d";
if (insn->positiveCharClasses & CharacterClass::Spaces)
OS << "\\s";
if (insn->positiveCharClasses & CharacterClass::Words)
OS << "\\w";
if (insn->negativeCharClasses & CharacterClass::Digits)
OS << "\\D";
if (insn->negativeCharClasses & CharacterClass::Spaces)
OS << "\\S";
if (insn->negativeCharClasses & CharacterClass::Words)
OS << "\\W";
auto output1Char = [&OS](uint32_t c) {
if (c <= 127 && std::isprint(c))
OS << char(c);
else
OS << llvh::format_hex(c, 4);
};
// BracketRange32 immediately follow insn.
const BracketRange32 *range =
reinterpret_cast<const BracketRange32 *>(1 + insn);
for (uint32_t i = 0; i < insn->rangeCount; i++) {
output1Char(range->start);
if (range->end > range->start) {
OS << '-';
output1Char(range->end);
}
range++;
}
OS << ']';
}
void dumpInstruction(
const regex::WordBoundaryInsn *insn,
llvh::raw_ostream &OS) {
OS << "WordBoundary: " << (insn->invert ? "\\B" : "\\b");
}
void dumpInstruction(
const regex::BeginMarkedSubexpressionInsn *insn,
llvh::raw_ostream &OS) {
OS << "BeginMarkedSubexpression: " << insn->mexp;
}
void dumpInstruction(
const regex::EndMarkedSubexpressionInsn *insn,
llvh::raw_ostream &OS) {
OS << "EndMarkedSubexpression: " << insn->mexp;
}
void dumpInstruction(const regex::BackRefInsn *insn, llvh::raw_ostream &OS) {
OS << "BackRefInsn: " << insn->mexp;
}
void dumpInstruction(const regex::LookaroundInsn *insn, llvh::raw_ostream &OS) {
OS << "Lookaround: " << (insn->forwards ? "" : "<")
<< (insn->invert ? '!' : '=')
<< " (constraints: " << unsigned(insn->constraints)
<< ", marked expressions=[" << insn->mexpBegin << "," << insn->mexpEnd
<< "), continuation " << llvh::format_hex(insn->continuation, 4) << ')';
}
void dumpInstruction(const regex::BeginLoopInsn *insn, llvh::raw_ostream &OS) {
OS << llvh::format(
"BeginLoop: %u %s {%u, %u} (constraints: %u)",
aligner(insn->loopId),
insn->greedy ? "greedy" : "nongreedy",
aligner(insn->min),
aligner(insn->max),
aligner(insn->loopeeConstraints));
}
void dumpInstruction(const regex::EndLoopInsn *insn, llvh::raw_ostream &OS) {
OS << "EndLoop: " << llvh::format_hex(insn->target, 4);
}
void dumpInstruction(
const regex::BeginSimpleLoopInsn *insn,
llvh::raw_ostream &OS) {
OS << llvh::format(
"BeginSimpleLoop: (constraints: %u)", insn->loopeeConstraints);
}
void dumpInstruction(
const regex::EndSimpleLoopInsn *insn,
llvh::raw_ostream &OS) {
OS << "EndSimpleLoop: " << llvh::format_hex(insn->target, 4);
}
void dumpInstruction(const regex::Width1LoopInsn *insn, llvh::raw_ostream &OS) {
OS << llvh::format(
"Width1Loop: %u %s {%u, %u}",
aligner(insn->loopId),
insn->greedy ? "greedy" : "nongreedy",
aligner(insn->min),
aligner(insn->max));
}
} // namespace
namespace hermes {
void dumpRegexBytecode(llvh::ArrayRef<uint8_t> bytes, llvh::raw_ostream &OS) {
// Output the header and then slice it off.
auto *header =
reinterpret_cast<const regex::RegexBytecodeHeader *>(bytes.data());
OS << llvh::format(
" Header: marked: %u loops: %u flags: %u constraints: %u\n",
aligner(header->markedCount),
aligner(header->loopCount),
aligner(header->syntaxFlags),
header->constraints);
bytes = bytes.slice(sizeof *header);
uint32_t cursor = 0;
while (cursor < bytes.size()) {
// Output offset in left column.
OS << " " << llvh::format_hex_no_prefix(cursor, 4) << " ";
// Call dumpInstruction() with its derived type.
auto insn = reinterpret_cast<const regex::Insn *>(&bytes[cursor]);
switch (insn->opcode) {
#define REOP(Code) \
case regex::Opcode::Code: { \
auto derivedInsn = llvh::cast<regex::Code##Insn>(insn); \
dumpInstruction(derivedInsn, OS); \
cursor += instructionWidth(derivedInsn); \
break; \
}
#include "hermes/Regex/RegexOpcodes.def"
}
OS << '\n';
}
// We expect to have consumed exactly the size of the stream.
assert(cursor == bytes.size() && "Invalid instructions in regex stream");
}
CompiledRegExp::CompiledRegExp(CompiledRegExp &&) = default;
CompiledRegExp &CompiledRegExp::operator=(CompiledRegExp &&) = default;
CompiledRegExp::~CompiledRegExp() {}
CompiledRegExp::CompiledRegExp(
std::vector<uint8_t> bytecode,
std::string pattern,
std::string flags)
: bytecode_(std::move(bytecode)),
pattern_(std::move(pattern)),
flags_(flags) {}
llvh::Optional<CompiledRegExp> CompiledRegExp::tryCompile(
StringRef pattern,
StringRef flags,
llvh::StringRef *outError) {
using namespace regex;
using namespace regex::constants;
// We have to match the way strings are constructed by StringTable, which is
// by interpreting UTF-8 encoded surrogates as UTF-16 surrogates.
llvh::SmallVector<char16_t, 16> re16;
convertUTF8WithSurrogatesToUTF16(
std::back_inserter(re16), pattern.begin(), pattern.end());
llvh::SmallVector<char16_t, 6> flags16;
convertUTF8WithSurrogatesToUTF16(
std::back_inserter(flags16), flags.begin(), flags.end());
// Build and compile the regexp.
auto re = regex::Regex<regex::UTF16RegexTraits>(re16, flags16);
if (!re.valid()) {
if (outError)
*outError = messageForError(re.getError());
return llvh::None;
}
return CompiledRegExp(re.compile(), pattern, flags);
}
llvh::ArrayRef<uint8_t> CompiledRegExp::getBytecode() const {
return bytecode_;
}
std::vector<RegExpTableEntry> UniquingRegExpTable::getEntryList() const {
std::vector<RegExpTableEntry> result;
result.reserve(regexps_.size());
uint32_t offset = 0;
for (const auto &re : regexps_) {
uint32_t size = re.getBytecode().size();
result.push_back(RegExpTableEntry{offset, size});
offset += size;
}
return result;
}
RegExpBytecode UniquingRegExpTable::getBytecodeBuffer() const {
RegExpBytecode result;
for (const auto &re : regexps_) {
auto bytecode = re.getBytecode();
result.insert(result.end(), bytecode.begin(), bytecode.end());
}
return result;
}
void UniquingRegExpTable::disassemble(llvh::raw_ostream &OS) const {
uint32_t index = 0;
// Note that regexp patterns and flags are read without interpreting escapes
// by JSLexer, so this will output in a format identical to the source
// pattern (preserving escapes, etc) except that non-BMP characters will have
// been encoded in UTF-8 as two UTF-16 surrogates.
for (const auto ®exp : regexps_) {
OS << index << ": /" << regexp.getPattern() << '/' << regexp.getFlags()
<< '\n';
dumpRegexBytecode(regexp.getBytecode(), OS);
index++;
}
OS << '\n';
}
} // namespace hermes