lib/VM/JSRegExp.cpp (300 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "hermes/VM/JSRegExp.h"
#include "hermes/Regex/Executor.h"
#include "hermes/Regex/Regex.h"
#include "hermes/Regex/RegexTraits.h"
#include "hermes/Support/UTF8.h"
#include "hermes/VM/BuildMetadata.h"
#include "hermes/VM/Operations.h"
#include "hermes/VM/RegExpMatch.h"
#include "hermes/VM/Runtime-inline.h"
#include "hermes/VM/StringView.h"
namespace hermes {
namespace vm {
//===----------------------------------------------------------------------===//
// class JSRegExp
const ObjectVTable JSRegExp::vt{
VTable(
CellKind::JSRegExpKind,
cellSize<JSRegExp>(),
JSRegExp::_finalizeImpl,
nullptr,
JSRegExp::_mallocSizeImpl,
nullptr,
VTable::HeapSnapshotMetadata{
HeapSnapshot::NodeType::Regexp,
JSRegExp::_snapshotNameImpl,
JSRegExp::_snapshotAddEdgesImpl,
JSRegExp::_snapshotAddNodesImpl,
nullptr}),
JSRegExp::_getOwnIndexedRangeImpl,
JSRegExp::_haveOwnIndexedImpl,
JSRegExp::_getOwnIndexedPropertyFlagsImpl,
JSRegExp::_getOwnIndexedImpl,
JSRegExp::_setOwnIndexedImpl,
JSRegExp::_deleteOwnIndexedImpl,
JSRegExp::_checkAllOwnIndexedImpl,
};
void JSRegExpBuildMeta(const GCCell *cell, Metadata::Builder &mb) {
mb.addJSObjectOverlapSlots(JSObject::numOverlapSlots<JSRegExp>());
JSObjectBuildMeta(cell, mb);
const auto *self = static_cast<const JSRegExp *>(cell);
mb.setVTable(&JSRegExp::vt);
mb.addField(&self->pattern_);
}
PseudoHandle<JSRegExp> JSRegExp::create(
Runtime &runtime,
Handle<JSObject> parentHandle) {
auto *cell = runtime.makeAFixed<JSRegExp, HasFinalizer::Yes>(
runtime,
parentHandle,
runtime.getHiddenClassForPrototype(
*parentHandle, numOverlapSlots<JSRegExp>()));
return JSObjectInit::initToPseudoHandle(runtime, cell);
}
void JSRegExp::initialize(
Handle<JSRegExp> selfHandle,
Runtime &runtime,
Handle<StringPrimitive> pattern,
Handle<StringPrimitive> flags,
llvh::ArrayRef<uint8_t> bytecode) {
assert(
pattern && flags &&
"Null pattern and/or flags passed to JSRegExp::initialize");
selfHandle->pattern_.set(runtime, *pattern, &runtime.getHeap());
DefinePropertyFlags dpf = DefinePropertyFlags::getDefaultNewPropertyFlags();
dpf.enumerable = 0;
dpf.configurable = 0;
auto res = JSObject::defineOwnProperty(
selfHandle,
runtime,
Predefined::getSymbolID(Predefined::lastIndex),
dpf,
HandleRootOwner::getZeroValue());
(void)res;
assert(
res != ExecutionStatus::EXCEPTION && *res &&
"defineOwnProperty() failed");
selfHandle->initializeBytecode(bytecode);
}
ExecutionStatus JSRegExp::initialize(
Handle<JSRegExp> selfHandle,
Runtime &runtime,
Handle<JSRegExp> otherHandle,
Handle<StringPrimitive> flags) {
llvh::SmallVector<char16_t, 16> flagsText16;
flags->appendUTF16String(flagsText16);
auto sflags = regex::SyntaxFlags::fromString(flagsText16);
if (!sflags) {
return runtime.raiseSyntaxError("Invalid RegExp: Invalid flags");
}
auto pattern = runtime.makeHandle(getPattern(otherHandle.get(), runtime));
// Fast path to avoid recompiling the RegExp if the flags match
if (LLVM_LIKELY(
sflags->toByte() == getSyntaxFlags(otherHandle.get()).toByte())) {
initialize(
selfHandle,
runtime,
pattern,
flags,
{otherHandle->bytecode_, otherHandle->bytecodeSize_});
return ExecutionStatus::RETURNED;
}
return initialize(selfHandle, runtime, pattern, flags);
}
/// ES11 21.2.3.2.2 RegExpInitialize ( obj, pattern, flags )
ExecutionStatus JSRegExp::initialize(
Handle<JSRegExp> selfHandle,
Runtime &runtime,
Handle<StringPrimitive> pattern,
Handle<StringPrimitive> flags) {
assert(
pattern && flags &&
"Null pattern and/or flags passed to JSRegExp::initialize");
llvh::SmallVector<char16_t, 6> flagsText16;
flags->appendUTF16String(flagsText16);
llvh::SmallVector<char16_t, 16> patternText16;
pattern->appendUTF16String(patternText16);
// Build the regex.
regex::Regex<regex::UTF16RegexTraits> regex(patternText16, flagsText16);
if (!regex.valid()) {
return runtime.raiseSyntaxError(
TwineChar16("Invalid RegExp: ") +
regex::constants::messageForError(regex.getError()));
}
// The regex is valid. Compile and store its bytecode.
auto bytecode = regex.compile();
initialize(selfHandle, runtime, pattern, flags, bytecode);
return ExecutionStatus::RETURNED;
}
void JSRegExp::initializeBytecode(llvh::ArrayRef<uint8_t> bytecode) {
size_t sz = bytecode.size();
assert(
sz <= std::numeric_limits<uint32_t>::max() &&
"Bytecode size cannot exceed 32 bits");
auto header =
reinterpret_cast<const regex::RegexBytecodeHeader *>(bytecode.data());
syntaxFlags_ = regex::SyntaxFlags::fromByte(header->syntaxFlags);
bytecodeSize_ = sz;
bytecode_ = (uint8_t *)checkedMalloc(sz);
memcpy(bytecode_, bytecode.data(), sz);
}
PseudoHandle<StringPrimitive> JSRegExp::getPattern(
JSRegExp *self,
PointerBase &base) {
return createPseudoHandle(self->pattern_.get(base));
}
template <typename CharT, typename Traits>
CallResult<RegExpMatch> performSearch(
Runtime &runtime,
llvh::ArrayRef<uint8_t> bytecode,
const CharT *start,
uint32_t stringLength,
uint32_t searchStartOffset,
regex::constants::MatchFlagType matchFlags) {
std::vector<regex::CapturedRange> nativeMatchRanges;
auto matchResult = regex::searchWithBytecode(
bytecode,
start,
searchStartOffset,
stringLength,
&nativeMatchRanges,
matchFlags);
if (matchResult == regex::MatchRuntimeResult::StackOverflow) {
return runtime.raiseRangeError("Maximum regex stack depth reached");
} else if (matchResult == regex::MatchRuntimeResult::NoMatch) {
return RegExpMatch{}; // not found.
}
size_t matchRangeCount = nativeMatchRanges.size();
assert(matchRangeCount > 0);
RegExpMatch match;
match.reserve(matchRangeCount);
for (size_t i = 0; i < matchRangeCount; i++) {
const auto &submatch = nativeMatchRanges[i];
if (!submatch.matched()) {
assert(i > 0 && "match_result[0] should always match");
match.push_back(llvh::None);
} else {
uint32_t pos = submatch.start;
uint32_t length = submatch.end - submatch.start;
match.push_back(RegExpMatchRange{pos, length});
}
}
assert(!match.empty() && "Unexpected empty match");
return match;
}
CallResult<RegExpMatch> JSRegExp::search(
Handle<JSRegExp> selfHandle,
Runtime &runtime,
Handle<StringPrimitive> strHandle,
uint32_t searchStartOffset) {
assert(selfHandle->bytecode_ && "Missing bytecode");
auto input = StringPrimitive::createStringView(runtime, strHandle);
// Note we may still have a match if searchStartOffset == str.size(),
// if the regexp can match an empty string
if (searchStartOffset > input.length()) {
return RegExpMatch{}; // no match possible
}
auto matchFlags = regex::constants::matchDefault;
// Respect the sticky flag, which forces us to match only at the given
// location.
if (selfHandle->syntaxFlags_.sticky) {
matchFlags |= regex::constants::matchOnlyAtStart;
}
CallResult<RegExpMatch> matchResult = RegExpMatch{};
if (input.isASCII()) {
matchFlags |= regex::constants::matchInputAllAscii;
matchResult = performSearch<char, regex::ASCIIRegexTraits>(
runtime,
llvh::makeArrayRef(selfHandle->bytecode_, selfHandle->bytecodeSize_),
input.castToCharPtr(),
input.length(),
searchStartOffset,
matchFlags);
} else {
matchResult = performSearch<char16_t, regex::UTF16RegexTraits>(
runtime,
llvh::makeArrayRef(selfHandle->bytecode_, selfHandle->bytecodeSize_),
input.castToChar16Ptr(),
input.length(),
searchStartOffset,
matchFlags);
}
// Only update on successful match.
if (LLVM_UNLIKELY(matchResult == ExecutionStatus::EXCEPTION)) {
return ExecutionStatus::EXCEPTION;
} else if (!matchResult->empty()) {
runtime.regExpLastInput = strHandle.getHermesValue();
runtime.regExpLastRegExp = selfHandle.getHermesValue();
runtime.regExpLastMatch = *matchResult;
}
return matchResult;
}
JSRegExp::~JSRegExp() {
free(bytecode_);
}
void JSRegExp::_finalizeImpl(GCCell *cell, GC *gc) {
JSRegExp *self = vmcast<JSRegExp>(cell);
if (self->bytecode_) {
gc->getIDTracker().untrackNative(self->bytecode_);
}
self->~JSRegExp();
}
size_t JSRegExp::_mallocSizeImpl(GCCell *cell) {
auto *self = vmcast<JSRegExp>(cell);
return self->bytecodeSize_;
}
std::string JSRegExp::_snapshotNameImpl(GCCell *cell, GC *gc) {
auto *const self = vmcast<JSRegExp>(cell);
return converter(getPattern(self, gc->getPointerBase()).get());
}
void JSRegExp::_snapshotAddEdgesImpl(GCCell *cell, GC *gc, HeapSnapshot &snap) {
auto *const self = vmcast<JSRegExp>(cell);
// Call the super type to add any other custom edges.
JSObject::_snapshotAddEdgesImpl(self, gc, snap);
if (self->bytecode_) {
snap.addNamedEdge(
HeapSnapshot::EdgeType::Internal,
"bytecode",
gc->getNativeID(self->bytecode_));
}
}
void JSRegExp::_snapshotAddNodesImpl(GCCell *cell, GC *gc, HeapSnapshot &snap) {
auto *const self = vmcast<JSRegExp>(cell);
if (self->bytecode_) {
// Add a native node for regex bytecode, to account for native size
// directly owned by the regex.
snap.beginNode();
snap.endNode(
HeapSnapshot::NodeType::Native,
"RegExpBytecode",
gc->getNativeID(self->bytecode_),
self->bytecodeSize_,
0);
}
}
/// \return an escaped string equivalent to \p pattern.
/// This is used to construct the 'source' property of RegExp. This requires
/// us to return a string from which the regexp may be reconstructed as if
/// from a /foo/ style literal. Note this is different from the RegExp
/// constructor that takes a string, e.g. new RegExp("/") returns a regexp
/// that matches /, but
/// /// does not (it's a comment!). So we may have to perform surgery on the
/// pattern.
CallResult<HermesValue> JSRegExp::escapePattern(
Handle<StringPrimitive> pattern,
Runtime &runtime) {
SmallU16String<32> result;
result.reserve(pattern->getStringLength());
auto patternView = StringPrimitive::createStringView(runtime, pattern);
bool isBackslashed = false;
for (char16_t c : patternView) {
switch (c) {
case u'/':
// Avoid premature end of regex.
// TODO nice to have: don't do this if we are in square brackets.
// /[/]/ is valid and the middle / does not need to be escaped.
// However /[\/]/ is also valid and means the same thing
// (CharacterEscape production from regexp grammar). Still it would be
// nice to not unnecessarily mangle the user's supplied pattern.
result.append(isBackslashed ? "/" : "\\/");
break;
// Escape line terminators. See ES5.1 7.3.
case u'\n':
result.append(isBackslashed ? "n" : "\\n");
break;
case u'\r':
result.append(isBackslashed ? "r" : "\\r");
break;
case 0x2028:
result.append(isBackslashed ? "u2028" : "\\u2028");
break;
case 0x2029:
result.append(isBackslashed ? "u2029" : "\\u2029");
break;
default:
result.append(c);
break;
}
isBackslashed = (c == u'\\') && !isBackslashed;
}
// "If P is the empty String, this specification can be met by letting S be
// '(?:)'."
if (result.empty()) {
result = u"(?:)";
}
// Avoid unnecessary allocation in the likely event the source and pattern
// match.
if (patternView.equals(result.arrayRef())) {
return pattern.getHermesValue();
}
return StringPrimitive::create(runtime, result);
}
} // namespace vm
} // namespace hermes