// Jit/codegen/gen_asm.cpp
// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
#include "Jit/codegen/gen_asm.h"
#include "Python.h"
#include "classloader.h"
#include "frameobject.h"
#include "internal/pycore_pystate.h"
#include "internal/pycore_shadow_frame.h"
#include "Jit/code_allocator.h"
#include "Jit/codegen/autogen.h"
#include "Jit/codegen/gen_asm_utils.h"
#include "Jit/frame.h"
#include "Jit/hir/analysis.h"
#include "Jit/hir/hir.h"
#include "Jit/hir/printer.h"
#include "Jit/jit_gdb_support.h"
#include "Jit/jit_rt.h"
#include "Jit/lir/dce.h"
#include "Jit/lir/generator.h"
#include "Jit/lir/postalloc.h"
#include "Jit/lir/postgen.h"
#include "Jit/lir/regalloc.h"
#include "Jit/lir/verify.h"
#include "Jit/log.h"
#include "Jit/perf_jitdump.h"
#include "Jit/pyjit.h"
#include "Jit/runtime.h"
#include "Jit/util.h"
#include <fmt/format.h>
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <list>
#include <unordered_map>
#include <vector>
using namespace asmjit;
using namespace jit::hir;
using namespace jit::lir;
using namespace jit::util;
namespace jit {
namespace codegen {
namespace {
namespace shadow_frame {
// Shadow stack frames appear at the beginning of native frames for jitted
// functions
static constexpr x86::Mem kFramePtr = x86::ptr(x86::rbp, -kShadowFrameSize);
static constexpr x86::Mem kInFramePrevPtr =
x86::ptr(x86::rbp, -kShadowFrameSize + SHADOW_FRAME_FIELD_OFF(prev));
static constexpr x86::Mem kInFrameDataPtr =
x86::ptr(x86::rbp, -kShadowFrameSize + SHADOW_FRAME_FIELD_OFF(data));
static constexpr x86::Mem getStackTopPtr(x86::Gp tstate_reg) {
return x86::ptr(tstate_reg, offsetof(PyThreadState, shadow_frame));
}
} // namespace shadow_frame
} // namespace
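// A rough C-level equivalent of RestoreOriginalGeneratorRBP below
// (illustrative only; the emitted code is the single mov that follows):
//
//   rbp = ((GenDataFooter*)rbp)->originalRbp;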
void RestoreOriginalGeneratorRBP(x86::Emitter* as) {
size_t original_rbp_offset = offsetof(GenDataFooter, originalRbp);
as->mov(x86::rbp, x86::ptr(x86::rbp, original_rbp_offset));
}
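// A rough C-level sketch of the epilogue unlink emitted by the function below
// (illustrative only, not the generated code; the return value is preserved in
// a stack slot across the call):
//
//   _PyShadowFrame* sf = tstate->shadow_frame;
//   bool has_pyframe = (sf->data & 1) == PYSF_PYFRAME;  // bit 0, see bt below
//   if (!is_generator) {
//     tstate->shadow_frame = sf->prev;
//   }
//   if (has_pyframe) {
//     JITRT_UnlinkFrame(tstate);
//   }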
void NativeGenerator::generateEpilogueUnlinkFrame(
x86::Gp tstate_r,
bool is_generator) {
  // It's safe to use caller-saved registers in this function
auto scratch_reg = tstate_r == x86::rsi ? x86::rdx : x86::rsi;
x86::Mem shadow_stack_top_ptr = shadow_frame::getStackTopPtr(tstate_r);
// Check bit 0 of _PyShadowFrame::data to see if a frame needs
// unlinking. This bit will be set (pointer kind == PYSF_PYFRAME) if so.
// scratch_reg = tstate->shadow_frame
as_->mov(scratch_reg, shadow_stack_top_ptr);
static_assert(
PYSF_PYFRAME == 1 && _PyShadowFrame_NumPtrKindBits == 2,
"Unexpected constants");
as_->bt(x86::qword_ptr(scratch_reg, offsetof(_PyShadowFrame, data)), 0);
// Unlink shadow frame. The send implementation handles unlinking these for
// generators.
if (!is_generator) {
// tstate->shadow_frame = ((_PyShadowFrame*)scratch_reg)->prev
as_->mov(
scratch_reg,
x86::qword_ptr(scratch_reg, offsetof(_PyShadowFrame, prev)));
as_->mov(shadow_stack_top_ptr, scratch_reg);
}
// Unlink PyFrame if needed
asmjit::Label done = as_->newLabel();
as_->jnc(done);
auto saved_rax_ptr = x86::ptr(x86::rbp, -8);
jit::hir::Type ret_type = func_->return_type;
if (ret_type <= TCDouble) {
as_->movsd(saved_rax_ptr, x86::xmm0);
} else {
as_->mov(saved_rax_ptr, x86::rax);
}
if (tstate_r != x86::rdi) {
as_->mov(x86::rdi, tstate_r);
}
as_->call(reinterpret_cast<uint64_t>(JITRT_UnlinkFrame));
if (ret_type <= TCDouble) {
as_->movsd(x86::xmm0, saved_rax_ptr);
} else {
as_->mov(x86::rax, saved_rax_ptr);
}
as_->bind(done);
}
// Scratch register used by the various deopt trampolines.
//
// NB: This MUST be r15. If you change the register you'll also need to change
// the deopt trampoline code that saves all registers.
static const auto deopt_scratch_reg = x86::r15;
Runtime* NativeGeneratorFactory::s_jit_asm_code_rt_ = nullptr;
// These functions call int-returning functions and convert their output from
// int (32 bits) to uint64_t (64 bits). This is solely because the code
// generator cannot currently support an operand size other than 64 bits. A
// future diff will make it support different operand sizes so that these
// wrappers can be removed.
extern "C" uint64_t
_Invoke_PyObject_SetAttr(PyObject* v, PyObject* name, PyObject* value) {
return PyObject_SetAttr(v, name, value);
}
extern "C" uint64_t
_Invoke_PyObject_SetItem(PyObject* container, PyObject* sub, PyObject* value) {
return PyObject_SetItem(container, sub, value);
}
class AsmJitException : public std::exception {
public:
AsmJitException(Error err, std::string expr, std::string message) noexcept
: err(err), expr(std::move(expr)), message(std::move(message)) {}
const char* what() const noexcept override {
return message.c_str();
}
Error const err;
std::string const expr;
std::string const message;
};
class ThrowableErrorHandler : public ErrorHandler {
public:
void handleError(Error err, const char* message, BaseEmitter*) override {
throw AsmJitException(err, "<unknown>", message);
}
};
#define ASM_CHECK_THROW(exp) \
{ \
auto err = (exp); \
if (err != kErrorOk) { \
auto message = DebugUtils::errorAsString(err); \
throw AsmJitException(err, #exp, message); \
} \
}
#define ASM_CHECK(exp, what) \
{ \
auto err = (exp); \
JIT_CHECK( \
err == kErrorOk, \
"Failed generating %s: %s", \
(what), \
DebugUtils::errorAsString(err)); \
}
#ifdef __ASM_DEBUG
extern "C" void ___debug_helper(const char* name) {
fprintf(stderr, "Entering %s...\n", name);
}
#endif
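// Overview of the compilation pipeline driven by GetEntryPoint() below (a
// summary of the calls it makes, not additional behavior):
//
//   HIR -> LIRGenerator::TranslateFunction()  // lower HIR to LIR
//       -> PostGenerationRewrite              // LIR transformations
//       -> eliminateDeadCode                  // DCE
//       -> LinearScanAllocator                // register allocation
//       -> PostRegAllocRewrite + verification
//       -> generateCode()                     // emit x86 via asmjit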
void* NativeGenerator::GetEntryPoint() {
if (entry_ != nullptr) {
// already compiled
return entry_;
}
JIT_CHECK(as_ == nullptr, "x86::Builder should not have been initialized.");
CodeHolder code;
code.init(CodeAllocator::get()->asmJitCodeInfo());
ThrowableErrorHandler eh;
code.setErrorHandler(&eh);
as_ = new x86::Builder(&code);
env_.as = as_;
env_.hard_exit_label = as_->newLabel();
env_.gen_resume_entry_label = as_->newLabel();
  // Prepare the locations where our arguments will go. Non-floating-point
  // values use general-purpose registers while they are available, and
  // floating-point values use floating-point registers while they are
  // available.
int non_reg_arg_idx = -1;
forEachArgumentRegInfo([&](std::optional<x86::Reg> r, size_t) {
if (r) {
env_.arg_locations.push_back(
r->id() + (r->isXmm() ? PhyLocation::XMM_REG_BASE : 0));
} else {
env_.arg_locations.push_back(non_reg_arg_idx--);
}
});
CollectOptimizableLoadMethods();
auto num_lm_caches = env_.optimizable_load_call_methods_.size() / 2;
auto func = GetFunction();
auto num_la_caches =
func->CountInstrs([](const Instr& instr) { return instr.IsLoadAttr(); });
auto num_sa_caches =
func->CountInstrs([](const Instr& instr) { return instr.IsStoreAttr(); });
auto num_lat_caches = func->env.numLoadAttrCaches();
env_.rt = NativeGeneratorFactory::runtime();
PyCodeObject* code_obj = func->code;
env_.code_rt = env_.rt->allocateCodeRuntime(
code_obj,
GetFunction()->globals,
func->frameMode,
num_lm_caches,
num_la_caches,
num_sa_caches,
num_lat_caches);
for (auto& ref : func->env.references()) {
env_.code_rt->addReference(ref);
}
jit::lir::LIRGenerator lirgen(GetFunction(), &env_);
std::unique_ptr<jit::lir::Function> lir_func;
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"Lowering into LIR",
lir_func = lirgen.TranslateFunction())
if (g_dump_hir_passes_json != nullptr) {
lir::JSONPrinter lir_printer;
(*json)["cols"].emplace_back(lir_printer.print(*lir_func, "Initial LIR"));
}
JIT_LOGIF(
g_dump_lir,
"LIR for %s after generation:\n%s",
GetFunction()->fullname,
*lir_func);
PostGenerationRewrite post_gen(lir_func.get(), &env_);
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"LIR transformations",
post_gen.run())
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"DeadCodeElimination",
eliminateDeadCode(lir_func.get()))
LinearScanAllocator lsalloc(
lir_func.get(),
frame_header_size_ + max_inline_depth_ * kShadowFrameSize);
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"Register Allocation",
lsalloc.run())
if (g_dump_hir_passes_json != nullptr) {
lir::JSONPrinter lir_printer;
(*json)["cols"].emplace_back(
lir_printer.print(*lir_func, "Register-allocated LIR"));
}
env_.spill_size = lsalloc.getSpillSize();
env_.changed_regs = lsalloc.getChangedRegs();
env_.exit_label = as_->newLabel();
env_.exit_for_yield_label = as_->newLabel();
env_.frame_mode = GetFunction()->frameMode;
if (GetFunction()->code->co_flags & kCoFlagsAnyGenerator) {
env_.initial_yield_spill_size_ = lsalloc.initialYieldSpillSize();
}
PostRegAllocRewrite post_rewrite(lir_func.get(), &env_);
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"Post Reg Alloc Rewrite",
post_rewrite.run())
if (!verifyPostRegAllocInvariants(lir_func.get(), std::cerr)) {
JIT_CHECK(
false,
"LIR for %s failed verification:\n%s",
GetFunction()->fullname,
*lir_func);
}
lir_func_ = std::move(lir_func);
JIT_LOGIF(
g_dump_lir,
"LIR for %s after register allocation:\n%s",
GetFunction()->fullname,
*lir_func_);
try {
COMPILE_TIMER(
GetFunction()->compilation_phase_timer,
"Code Generation",
generateCode(code))
} catch (const AsmJitException& ex) {
String s;
as_->dump(s);
JIT_CHECK(
false,
"Failed to emit code for '%s': '%s' failed with '%s'\n\n"
"Builder contents on failure:\n%s",
GetFunction()->fullname,
ex.expr,
ex.message,
s.data());
}
/* After code generation CodeHolder->codeSize() *should* return the actual
* size of the generated code. This relies on the implementation of
* JitRuntime::_add and may break in the future.
*/
JIT_DCHECK(code.codeSize() < INT_MAX, "Code size is larger than INT_MAX");
compiled_size_ = static_cast<int>(code.codeSize());
env_.code_rt->set_frame_size(env_.frame_size);
return entry_;
}
int NativeGenerator::GetCompiledFunctionSize() const {
return compiled_size_;
}
int NativeGenerator::GetCompiledFunctionStackSize() const {
return env_.frame_size;
}
int NativeGenerator::GetCompiledFunctionSpillStackSize() const {
return spill_stack_size_;
}
void NativeGenerator::generateFunctionEntry() {
as_->push(x86::rbp);
as_->mov(x86::rbp, x86::rsp);
}
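// Loads the current PyThreadState* from _PyRuntime.gilstate.tstate_current,
// i.e. roughly: dst_reg = _PyRuntime.gilstate.tstate_current;
// If the absolute address fits in a signed 32-bit displacement we can use a
// single absolute load; otherwise we materialize the address in dst_reg first
// and then dereference it.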
void NativeGenerator::loadTState(x86::Gp dst_reg) {
uint64_t tstate =
reinterpret_cast<uint64_t>(&_PyRuntime.gilstate.tstate_current);
if (fitsInt32(tstate)) {
as_->mov(dst_reg, x86::ptr(tstate));
} else {
as_->mov(dst_reg, tstate);
as_->mov(dst_reg, x86::ptr(dst_reg));
}
}
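// A rough C-level sketch of the shadow-frame link emitted below (illustrative
// only; `sf` is the shadow frame embedded in our native frame at
// rbp - kShadowFrameSize):
//
//   sf->prev = tstate->shadow_frame;
//   sf->data = (frame_mode == FrameMode::kNormal)
//       ? ((uintptr_t)tstate->frame | PYSF_PYFRAME)  // bts sets bit 0
//       : _PyShadowFrame_MakeData(code_rt, PYSF_CODE_RT, PYSF_JIT);
//   tstate->shadow_frame = sf;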
void NativeGenerator::linkOnStackShadowFrame(
x86::Gp tstate_reg,
x86::Gp scratch_reg) {
const jit::hir::Function* func = GetFunction();
jit::hir::FrameMode frame_mode = func->frameMode;
using namespace shadow_frame;
x86::Mem shadow_stack_top_ptr = getStackTopPtr(tstate_reg);
// Save old top of shadow stack
as_->mov(scratch_reg, shadow_stack_top_ptr);
as_->mov(kInFramePrevPtr, scratch_reg);
// Set data
if (frame_mode == jit::hir::FrameMode::kNormal) {
as_->mov(scratch_reg, x86::ptr(tstate_reg, offsetof(PyThreadState, frame)));
static_assert(
PYSF_PYFRAME == 1 && _PyShadowFrame_NumPtrKindBits == 2,
"Unexpected constant");
as_->bts(scratch_reg, 0);
} else {
uintptr_t data =
_PyShadowFrame_MakeData(env_.code_rt, PYSF_CODE_RT, PYSF_JIT);
as_->mov(scratch_reg, data);
}
as_->mov(kInFrameDataPtr, scratch_reg);
// Set our shadow frame as top of shadow stack
as_->lea(scratch_reg, kFramePtr);
as_->mov(shadow_stack_top_ptr, scratch_reg);
}
void NativeGenerator::initializeFrameHeader(
x86::Gp tstate_reg,
x86::Gp scratch_reg) {
// Save pointer to the CodeRuntime
// TODO(mpage) - This is only necessary in the prologue when in normal-frame
// mode. We can lazily fill this when the frame is materialized in
// shadow-frame mode. Not sure if the added complexity is worth the two
// instructions we would save...
as_->mov(scratch_reg, reinterpret_cast<uintptr_t>(env_.code_rt));
as_->mov(
x86::ptr(
x86::rbp,
-static_cast<int>(offsetof(FrameHeader, code_rt)) - kPointerSize),
scratch_reg);
// Generator shadow frames live in generator objects and only get linked in
// on the first resume.
if (!isGen()) {
linkOnStackShadowFrame(tstate_reg, scratch_reg);
}
}
int NativeGenerator::setupFrameAndSaveCallerRegisters(x86::Gp tstate_reg) {
// During execution, the stack looks like the diagram below. The column to
// left indicates how many words on the stack each line occupies.
//
// Legend:
// - <empty> - 1 word
// - N - A fixed number of words > 1
// - * - 0 or more words
// - ? - 0 or 1 words
// - ^ - shares the space with the item above
//
// +-----------------------+
// | * memory arguments |
// | return address |
// | saved rbp | <-- rbp
// | N frame header | See frame.h
// | * inl. shad. frame 0 |
// | * inl. shad. frame 1 |
// | * inl. shad. frame . |
// | * inl. shad. frame N |
// | * spilled values |
// | ? alignment padding |
// | * callee-saved regs |
// | ? call arg buffer |
// | ^ LOAD_METHOD scratch | <-- rsp
// +-----------------------+
auto saved_regs = env_.changed_regs & CALLEE_SAVE_REGS;
int saved_regs_size = saved_regs.count() * 8;
// Make sure we have at least one word for scratch in the epilogue.
spill_stack_size_ = env_.spill_size;
// The frame header size and inlined shadow frames are already included in
// env_.spill_size.
int spill_stack = std::max(spill_stack_size_, 8);
int load_method_scratch = env_.optimizable_load_call_methods_.empty() ? 0 : 8;
int arg_buffer_size = std::max(load_method_scratch, env_.max_arg_buffer_size);
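  // Assuming the caller kept the System V alignment, rsp is 16-byte aligned
  // after the rbp push in generateFunctionEntry(), so everything we add below
  // (spills + callee-saved pushes + arg buffer) must total a multiple of 16 to
  // keep call sites aligned. Worked example (numbers illustrative):
  // spill_stack = 40, saved_regs_size = 16, arg_buffer_size = 0 gives 56,
  // so spill_stack is padded to 48.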
if ((spill_stack + saved_regs_size + arg_buffer_size) % 16 != 0) {
spill_stack += 8;
}
// Allocate stack space and save the size of the function's stack.
as_->sub(x86::rsp, spill_stack);
env_.last_callee_saved_reg_off = spill_stack + saved_regs_size;
x86::Gp scratch_reg = x86::rax;
as_->push(scratch_reg);
initializeFrameHeader(tstate_reg, scratch_reg);
as_->pop(scratch_reg);
// Push used callee-saved registers.
while (!saved_regs.Empty()) {
as_->push(x86::gpq(saved_regs.GetFirst()));
saved_regs.RemoveFirst();
}
if (arg_buffer_size > 0) {
as_->sub(x86::rsp, arg_buffer_size);
}
env_.frame_size = spill_stack + saved_regs_size + arg_buffer_size;
return load_method_scratch;
}
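// In normal-frame mode, the sequence emitted below is roughly (illustrative
// sketch; save_regs holds (source, destination) pairs):
//
//   push each source register (plus one padding push for 16-byte alignment);
//   tstate_reg = JITRT_AllocateAndLinkFrame(code, globals);
//   pop the saved values back, into the destination registers;
//
// Generators and shadow-frame mode skip the call: they only load tstate and
// shuffle registers from each pair's source to its destination.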
void NativeGenerator::loadOrGenerateLinkFrame(
asmjit::x86::Gp tstate_reg,
const std::vector<
std::pair<const asmjit::x86::Reg, const asmjit::x86::Reg>>& save_regs) {
auto load_tstate_and_move = [&]() {
loadTState(tstate_reg);
for (const auto& pair : save_regs) {
if (pair.first != pair.second) {
if (pair.first.isGpq()) {
JIT_DCHECK(pair.second.isGpq(), "can't mix and match register types");
as_->mov(
static_cast<const asmjit::x86::Gpq&>(pair.second),
static_cast<const asmjit::x86::Gpq&>(pair.first));
} else if (pair.first.isXmm()) {
JIT_DCHECK(pair.second.isXmm(), "can't mix and match register types");
as_->movsd(
static_cast<const asmjit::x86::Xmm&>(pair.second),
static_cast<const asmjit::x86::Xmm&>(pair.first));
}
}
}
};
if (isGen()) {
load_tstate_and_move();
return;
}
switch (GetFunction()->frameMode) {
case FrameMode::kShadow:
load_tstate_and_move();
break;
case FrameMode::kNormal: {
bool align_stack = save_regs.size() % 2;
for (const auto& pair : save_regs) {
if (pair.first.isGpq()) {
as_->push((asmjit::x86::Gpq&)pair.first);
} else if (pair.first.isXmm()) {
as_->sub(x86::rsp, 16);
as_->movdqu(x86::dqword_ptr(x86::rsp), (asmjit::x86::Xmm&)pair.first);
} else {
JIT_CHECK(false, "unsupported saved register type");
}
}
if (align_stack) {
as_->push(x86::rax);
}
as_->mov(
x86::rdi,
reinterpret_cast<intptr_t>(
codeRuntime()->frameState()->code().get()));
as_->mov(
x86::rsi,
reinterpret_cast<intptr_t>(
codeRuntime()->frameState()->globals().get()));
as_->call(reinterpret_cast<uint64_t>(JITRT_AllocateAndLinkFrame));
as_->mov(tstate_reg, x86::rax);
if (align_stack) {
as_->pop(x86::rax);
}
for (auto iter = save_regs.rbegin(); iter != save_regs.rend(); ++iter) {
if (iter->second.isGpq()) {
as_->pop((asmjit::x86::Gpq&)iter->second);
} else if (iter->second.isXmm()) {
as_->movdqu(
(asmjit::x86::Xmm&)iter->second, x86::dqword_ptr(x86::rsp));
as_->add(x86::rsp, 16);
} else {
JIT_CHECK(false, "unsupported saved register type");
}
}
break;
}
}
}
void NativeGenerator::generatePrologue(
Label correct_arg_count,
Label native_entry_point) {
PyCodeObject* code = GetFunction()->code;
// the generic entry point, including primitive return boxing if needed
asmjit::BaseNode* entry_cursor = as_->cursor();
// same as entry_cursor but only set if we are boxing a primitive return
asmjit::BaseNode* box_entry_cursor = nullptr;
// start of the "real" generic entry, after the return-boxing wrapper
asmjit::BaseNode* generic_entry_cursor = nullptr;
bool returns_primitive = func_->returnsPrimitive();
bool returns_double = func_->returnsPrimitiveDouble();
if (returns_primitive) {
// If we return a primitive, then in the generic (non-static) entry path we
// need to box it up (since our caller can't handle an actual primitive
// return). We do this by generating a small wrapper "function" here that
// just calls the real function and then boxes the return value before
// returning.
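    //
    // Roughly (illustrative sketch; the exact box helper and the error signal
    // depend on the return type handled below):
    //
    //   ret = <real function>(args...);
    //   if (<error signalled in edx/xmm1>) return NULL;
    //   return JITRT_Box<kind>(ret);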
Label generic_entry = as_->newLabel();
Label box_done = as_->newLabel();
Label error = as_->newLabel();
jit::hir::Type ret_type = func_->return_type;
Annotations annot;
uint64_t box_func;
bool returns_enum = ret_type <= TCEnum;
generateFunctionEntry();
if (returns_enum) {
as_->push(x86::rdx);
as_->push(x86::rdx); // extra push to maintain alignment
annot.add("saveRegisters", as_, entry_cursor);
}
as_->call(generic_entry);
// if there was an error, there's nothing to box
if (returns_double) {
as_->ptest(x86::xmm1, x86::xmm1);
as_->je(error);
} else if (returns_enum) {
as_->test(x86::edx, x86::edx);
as_->je(error);
} else {
as_->test(x86::edx, x86::edx);
as_->je(box_done);
}
if (ret_type <= TCBool) {
as_->movzx(x86::edi, x86::al);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxBool);
} else if (ret_type <= TCInt8) {
as_->movsx(x86::edi, x86::al);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxI32);
} else if (ret_type <= TCUInt8) {
as_->movzx(x86::edi, x86::al);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxU32);
} else if (ret_type <= TCInt16) {
as_->movsx(x86::edi, x86::ax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxI32);
} else if (ret_type <= TCUInt16) {
as_->movzx(x86::edi, x86::ax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxU32);
} else if (ret_type <= TCInt32) {
as_->mov(x86::edi, x86::eax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxI32);
} else if (ret_type <= TCUInt32) {
as_->mov(x86::edi, x86::eax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxU32);
} else if (ret_type <= TCInt64) {
as_->mov(x86::rdi, x86::rax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxI64);
} else if (ret_type <= TCUInt64) {
as_->mov(x86::rdi, x86::rax);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxU64);
} else if (returns_double) {
// xmm0 already contains the return value
box_func = reinterpret_cast<uint64_t>(JITRT_BoxDouble);
} else if (returns_enum) {
as_->mov(x86::rdi, x86::rax);
Label box_int = as_->newLabel();
as_->pop(x86::rdx);
as_->pop(x86::rdx);
as_->bt(x86::rdx, _Py_VECTORCALL_INVOKED_STATICALLY_BIT_POS);
as_->jb(box_int);
as_->mov(x86::rsi, reinterpret_cast<uint64_t>(ret_type.typeSpec()));
as_->call(reinterpret_cast<uint64_t>(JITRT_BoxEnum));
as_->jmp(box_done);
as_->bind(box_int);
box_func = reinterpret_cast<uint64_t>(JITRT_BoxI64);
} else {
JIT_CHECK(
false, "unsupported primitive return type %s", ret_type.toString());
}
as_->call(box_func);
as_->bind(box_done);
as_->leave();
as_->ret();
if (returns_double) {
as_->bind(error);
as_->xor_(x86::rax, x86::rax);
as_->leave();
as_->ret();
} else if (returns_enum) {
as_->bind(error);
as_->pop(x86::rdx);
as_->pop(x86::rdx);
as_->leave();
as_->ret();
}
box_entry_cursor = entry_cursor;
generic_entry_cursor = as_->cursor();
as_->bind(generic_entry);
} else {
generic_entry_cursor = entry_cursor;
}
generateFunctionEntry();
Label setup_frame = as_->newLabel();
Label argCheck = as_->newLabel();
if (code->co_flags & CO_STATICALLY_COMPILED) {
// If we've been invoked statically we can skip all of the
// argument checking because we know our args have been
// provided correctly. But if we have primitives we need to
// unbox them from their boxed ints. We usually get to
// avoid this by doing direct invokes from JITed code.
if (func_->has_primitive_args) {
env_.code_rt->addReference(func_->prim_args_info);
as_->mov(
x86::r8, reinterpret_cast<uint64_t>(func_->prim_args_info.get()));
if (func_->returnsPrimitiveDouble()) {
as_->call(reinterpret_cast<uint64_t>(
JITRT_CallStaticallyWithPrimitiveSignatureFP));
} else {
as_->call(reinterpret_cast<uint64_t>(
JITRT_CallStaticallyWithPrimitiveSignature));
}
as_->leave();
as_->ret();
} else {
as_->bt(x86::rdx, _Py_VECTORCALL_INVOKED_STATICALLY_BIT_POS);
as_->jb(setup_frame);
}
}
if (!func_->has_primitive_args) {
as_->test(x86::rcx, x86::rcx); // test for kwargs
if (!((code->co_flags & (CO_VARARGS | CO_VARKEYWORDS)) ||
code->co_kwonlyargcount)) {
      // If we have varargs or varkwargs we need to dispatch through our helper
      // regardless of whether kwargs were provided, so that it can create the
      // varargs tuple and kwargs dict and free them on exit.
      //
      // Similarly, if the function has keyword-only args, we dispatch
      // through the helper to check that they were, in fact, passed via
      // keyword arguments.
      //
      // A lot of other things happen in the helper, so there is potentially a
      // lot of room for optimization here.
as_->je(argCheck);
}
    // We don't check the length of the kwnames tuple here; normal callers will
    // never pass an empty tuple. It is possible for odd callers to still pass
    // the empty tuple, in which case we'll just go through the slow binding
    // path.
as_->call(reinterpret_cast<uint64_t>(JITRT_CallWithKeywordArgs));
as_->leave();
as_->ret();
// check that we have a valid number of args
if (!(code->co_flags & (CO_VARARGS | CO_VARKEYWORDS))) {
as_->bind(argCheck);
as_->cmp(x86::edx, GetFunction()->numArgs());
      // We don't have the correct number of arguments. Call a helper to either
      // fix them up with defaults or raise an appropriate exception.
as_->jz(correct_arg_count);
as_->mov(x86::rcx, GetFunction()->numArgs());
as_->call(
(returns_double
? reinterpret_cast<uint64_t>(
JITRT_CallWithIncorrectArgcountFPReturn)
: reinterpret_cast<uint64_t>(JITRT_CallWithIncorrectArgcount)));
as_->leave();
as_->ret();
}
}
as_->bind(correct_arg_count);
if (code->co_flags & CO_STATICALLY_COMPILED) {
if (!func_->has_primitive_args) {
// We weren't called statically, but we've now resolved
// all arguments to fixed offsets. Validate that the
// arguments are correctly typed.
generateStaticMethodTypeChecks(setup_frame);
} else if (func_->has_primitive_first_arg) {
as_->mov(x86::rdx, 0);
}
}
env_.addAnnotation("Generic entry", generic_entry_cursor);
if (box_entry_cursor) {
env_.addAnnotation(
"Generic entry (box primitive return)", box_entry_cursor);
}
// Args are now validated, setup frame
auto frame_cursor = as_->cursor();
as_->bind(setup_frame);
constexpr auto kNargsfReg = x86::rdx;
constexpr auto kFuncPtrReg = x86::rax;
constexpr auto kArgsReg = x86::r10;
constexpr auto kArgsOverflowBaseReg = kArgsReg;
loadOrGenerateLinkFrame(
x86::r11,
{
{kNargsfReg, kNargsfReg},
{x86::rdi, kFuncPtrReg}, // func
{x86::rsi, kArgsReg} // args
});
// Move arguments into their expected registers and then use r10 as the base
// for additional args. Note for coroutines we leave nargsf in RDX.
size_t num_fp_regs = 0;
size_t num_gp_regs = 0;
size_t args_without_regs = 0;
const bool not_enough_regs_for_args =
forEachArgumentRegInfo([&](std::optional<x86::Reg> r, size_t i) {
if (!r) {
args_without_regs++;
return;
}
x86::Mem arg_ptr =
x86::ptr(kArgsReg, (i - args_without_regs) * kPointerSize);
if (r->isXmm() && num_fp_regs != FP_ARGUMENT_REG_COUNT) {
as_->movsd(x86::Xmm(r->id()), arg_ptr);
num_fp_regs++;
} else if (num_gp_regs != numGpRegsForArgs()) {
as_->mov(x86::Gpq(r->id()), arg_ptr);
num_gp_regs++;
}
});
if (not_enough_regs_for_args) {
    // Load the location of the remaining args; the backend will
    // deal with loading them from here...
as_->lea(
kArgsOverflowBaseReg,
x86::ptr(kArgsReg, (num_fp_regs + num_gp_regs) * kPointerSize));
}
// Finally allocate the saved space required for the actual function
auto native_entry_cursor = as_->cursor();
as_->bind(native_entry_point);
setupFrameAndSaveCallerRegisters(x86::r11);
env_.addAnnotation("Link frame", frame_cursor);
env_.addAnnotation("Native entry", native_entry_cursor);
}
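// x86 `cmp r64, imm` only accepts a sign-extended 32-bit immediate, so
// pointers that don't fit in 32 bits must be materialized in a scratch
// register first. For example, comparing against a PyTypeObject* living at a
// high address takes the mov+cmp path below.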
static void
emitCompare(x86::Builder* as, x86::Gp lhs, void* rhs, x86::Gp scratch) {
uint64_t rhsi = reinterpret_cast<uint64_t>(rhs);
if (!fitsInt32(rhsi)) {
// in shared mode type can be in a high address
as->mov(scratch, rhsi);
as->cmp(lhs, scratch);
} else {
as->cmp(lhs, rhsi);
}
}
void NativeGenerator::generateStaticMethodTypeChecks(Label setup_frame) {
  // JITRT_CallWithIncorrectArgcount uses the fact that our checks are set up
  // from last to first argument - we order the jumps so that the common case
  // of no defaulted arguments comes first, and end up with the following
  // structure:
  //
  //   generic entry:
  //     compare defaulted arg count to 0
  //     if zero: go to first check
  //     compare defaulted arg count to 1
  //     if zero: go to second check
  //     ...
  //
  // This is complicated a bit by the fact that not every argument will have a
  // check, as we elide the dynamic ones. For that, we do bookkeeping and assign
  // all defaulted arg counts up to the next local to the same label.
const std::vector<TypedArgument>& checks = GetFunction()->typed_args;
env_.static_arg_typecheck_failed_label = as_->newLabel();
if (!checks.size()) {
return;
}
// We build a vector of labels corresponding to [first_check, second_check,
// ..., setup_frame] which will have |checks| + 1 elements, and the
// first_check label will precede the first check.
auto table_label = as_->newLabel();
as_->lea(x86::r8, x86::ptr(table_label));
as_->lea(x86::r8, x86::ptr(x86::r8, x86::rcx, 3));
as_->jmp(x86::r8);
auto jump_table_cursor = as_->cursor();
as_->align(AlignMode::kAlignCode, 8);
as_->bind(table_label);
std::vector<Label> arg_labels;
int defaulted_arg_count = 0;
Py_ssize_t check_index = checks.size() - 1;
// Each check might be a label that hosts multiple arguments, as dynamic
// arguments aren't checked. We need to account for this in our bookkeeping.
auto next_arg = as_->newLabel();
arg_labels.emplace_back(next_arg);
while (defaulted_arg_count < GetFunction()->numArgs()) {
as_->align(AlignMode::kAlignCode, 8);
as_->jmp(next_arg);
if (check_index >= 0) {
long local = checks.at(check_index).locals_idx;
if (GetFunction()->numArgs() - defaulted_arg_count - 1 == local) {
if (check_index == 0) {
next_arg = setup_frame;
} else {
check_index--;
next_arg = as_->newLabel();
}
arg_labels.emplace_back(next_arg);
}
}
defaulted_arg_count++;
}
env_.addAnnotation(
fmt::format("Jump to first non-defaulted argument"), jump_table_cursor);
as_->align(AlignMode::kAlignCode, 8);
as_->bind(arg_labels[0]);
for (Py_ssize_t i = checks.size() - 1; i >= 0; i--) {
auto check_cursor = as_->cursor();
const TypedArgument& arg = checks.at(i);
env_.code_rt->addReference(arg.pytype);
next_arg = arg_labels[checks.size() - i];
as_->mov(x86::r8, x86::ptr(x86::rsi, arg.locals_idx * 8)); // load local
as_->mov(
x86::r8, x86::ptr(x86::r8, offsetof(PyObject, ob_type))); // load type
if (arg.optional) {
// check if the value is None
emitCompare(as_, x86::r8, Py_TYPE(Py_None), x86::rax);
as_->je(next_arg);
}
// common case: check if we have the exact right type
emitCompare(as_, x86::r8, arg.pytype, x86::rax);
as_->je(next_arg);
if (!arg.exact && (arg.pytype->tp_flags & Py_TPFLAGS_BASETYPE)) {
// We need to check the object's MRO and see if the declared type
// is present in it. Technically we don't need to check the last
// entry that will be object but the code gen is a little bit simpler
// if we include it.
Label arg_loop = as_->newLabel();
as_->mov(x86::r10, reinterpret_cast<uint64_t>(arg.pytype.get()));
// PyObject *r8 = r8->tp_mro;
as_->mov(x86::r8, x86::ptr(x86::r8, offsetof(PyTypeObject, tp_mro)));
// Py_ssize_t r11 = r8->ob_size;
as_->mov(x86::r11, x86::ptr(x86::r8, offsetof(PyVarObject, ob_size)));
// PyObject *r8 = &r8->ob_item[0];
as_->add(x86::r8, offsetof(PyTupleObject, ob_item));
// PyObject *r11 = &r8->ob_item[r11];
as_->lea(x86::r11, x86::ptr(x86::r8, x86::r11, 3));
as_->bind(arg_loop);
as_->cmp(x86::ptr(x86::r8), x86::r10);
as_->je(next_arg);
as_->add(x86::r8, sizeof(PyObject*));
as_->cmp(x86::r8, x86::r11);
as_->jne(arg_loop);
}
// no args match, bail to normal vector call to report error
as_->jmp(env_.static_arg_typecheck_failed_label);
bool last_check = i == 0;
if (!last_check) {
as_->bind(next_arg);
}
env_.addAnnotation(
fmt::format("StaticTypeCheck[{}]", arg.pytype->tp_name), check_cursor);
}
}
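// Each per-argument check emitted above is roughly equivalent to the following
// C (illustrative only; `args` is the vectorcall argument array in rsi):
//
//   PyTypeObject* t = Py_TYPE(args[arg.locals_idx]);
//   if (arg.optional && t == Py_TYPE(Py_None)) goto next_arg;
//   if (t == arg.pytype) goto next_arg;
//   if (!arg.exact && (arg.pytype->tp_flags & Py_TPFLAGS_BASETYPE)) {
//     PyObject* mro = t->tp_mro;
//     for (Py_ssize_t j = 0; j < Py_SIZE(mro); j++) {
//       if (PyTuple_GET_ITEM(mro, j) == (PyObject*)arg.pytype) goto next_arg;
//     }
//   }
//   goto static_arg_typecheck_failed;  // bail to the slow vectorcall path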
void NativeGenerator::generateEpilogue(BaseNode* epilogue_cursor) {
as_->setCursor(epilogue_cursor);
  // Now we can use all the caller-saved registers except for RAX
as_->bind(env_.exit_label);
bool is_gen = GetFunction()->code->co_flags & kCoFlagsAnyGenerator;
if (is_gen) {
    // Set generator state to "completed". We access the state via RBP, which
    // points to the top of the spill data and the bottom of the GenDataFooter.
auto state_offs = offsetof(GenDataFooter, state);
as_->mov(
x86::ptr(x86::rbp, state_offs, sizeof(GenDataFooter::state)),
_PyJitGenState_Completed);
as_->bind(env_.exit_for_yield_label);
RestoreOriginalGeneratorRBP(as_->as<x86::Emitter>());
}
generateEpilogueUnlinkFrame(x86::rdi, is_gen);
// If we return a primitive, set edx/xmm1 to 1 to indicate no error (in case
// of error, deopt will set it to 0 and jump to hard_exit_label, skipping
// this.)
if (func_->returnsPrimitive()) {
if (func_->returnsPrimitiveDouble()) {
      // Load an *integer* 1 into XMM1. The exact value doesn't matter, but it
      // needs to be non-zero. See pg 124,
      // https://www.agner.org/optimize/optimizing_assembly.pdf
as_->pcmpeqw(x86::xmm1, x86::xmm1);
as_->psrlq(x86::xmm1, 63);
} else {
as_->mov(x86::edx, 1);
}
}
as_->bind(env_.hard_exit_label);
auto saved_regs = env_.changed_regs & CALLEE_SAVE_REGS;
if (!saved_regs.Empty()) {
// Reset rsp to point at our callee-saved registers and restore them.
JIT_CHECK(
env_.last_callee_saved_reg_off != -1,
"offset to callee saved regs not initialized");
as_->lea(x86::rsp, x86::ptr(x86::rbp, -env_.last_callee_saved_reg_off));
std::vector<int> pop_regs;
while (!saved_regs.Empty()) {
int reg = saved_regs.GetFirst();
pop_regs.push_back(reg);
saved_regs.RemoveFirst();
}
for (auto riter = pop_regs.rbegin(); riter != pop_regs.rend(); ++riter) {
as_->pop(x86::gpq(*riter));
}
}
as_->leave();
as_->ret();
env_.addAnnotation("Epilogue", epilogue_cursor);
if (env_.function_indirections.size()) {
auto jit_helpers = as_->cursor();
for (auto& x : env_.function_indirections) {
Label trampoline = as_->newLabel();
as_->bind(trampoline);
as_->mov(x86::r10, reinterpret_cast<uint64_t>(x.first));
as_->jmp(reinterpret_cast<uint64_t>(jit_trampoline_));
x.second.trampoline = trampoline;
}
env_.addAnnotation("JitHelpers", jit_helpers);
}
}
void NativeGenerator::generateDeoptExits() {
if (env_.deopt_exits.empty()) {
return;
}
auto& deopt_exits = env_.deopt_exits;
auto deopt_cursor = as_->cursor();
auto deopt_exit = as_->newLabel();
std::sort(deopt_exits.begin(), deopt_exits.end(), [](auto& a, auto& b) {
return a.deopt_meta_index < b.deopt_meta_index;
});
// Generate stage 1 trampolines (one per guard). These push the index of the
// appropriate `DeoptMetadata` and then jump to the stage 2 trampoline.
for (const auto& exit : deopt_exits) {
as_->bind(exit.label);
as_->push(exit.deopt_meta_index);
const auto& deopt_meta = env_.rt->getDeoptMetadata(exit.deopt_meta_index);
int deepest_frame_idx = deopt_meta.frame_meta.size() - 1;
emitCall(
env_,
deopt_exit,
deopt_meta.frame_meta[deepest_frame_idx].instr_offset());
}
  // Generate the stage 2 trampoline (one per function). This saves the address
  // of the final part of the JIT epilogue that is responsible for restoring
  // callee-saved registers and returning, saves our scratch register (whose
  // original contents may be needed during frame reification), and jumps to
  // the final trampoline.
//
// Right now the top of the stack looks like:
//
// +-------------------------+ <-- end of JIT's fixed frame
// | index of deopt metadata |
// | saved rip |
// +-------------------------+
//
// and we need to pass our scratch register and the address of the epilogue
// to the global deopt trampoline. The code below leaves the stack with the
// following layout:
//
// +-------------------------+ <-- end of JIT's fixed frame
// | index of deopt metadata |
// | saved rip |
// | padding |
// | address of epilogue |
// | r15 |
// +-------------------------+
//
// The global deopt trampoline expects that our scratch register is at the
// top of the stack so that it can save the remaining registers immediately
// after it, forming a contiguous array of all registers.
//
// If you change this make sure you update that code!
as_->bind(deopt_exit);
// Add padding to keep the stack aligned
as_->push(deopt_scratch_reg);
// Save space for the epilogue
as_->push(deopt_scratch_reg);
// Save our scratch register
as_->push(deopt_scratch_reg);
// Save the address of the epilogue
as_->lea(deopt_scratch_reg, x86::ptr(env_.hard_exit_label));
as_->mov(x86::ptr(x86::rsp, kPointerSize), deopt_scratch_reg);
auto trampoline = GetFunction()->code->co_flags & kCoFlagsAnyGenerator
? deopt_trampoline_generators_
: deopt_trampoline_;
as_->mov(deopt_scratch_reg, reinterpret_cast<uint64_t>(trampoline));
as_->jmp(deopt_scratch_reg);
env_.addAnnotation("Deoptimization exits", deopt_cursor);
}
void NativeGenerator::linkDeoptPatchers(const asmjit::CodeHolder& code) {
JIT_CHECK(code.hasBaseAddress(), "code not generated!");
uint64_t base = code.baseAddress();
for (const auto& udp : env_.pending_deopt_patchers) {
uint64_t patchpoint = base + code.labelOffset(udp.patchpoint);
uint64_t deopt_exit = base + code.labelOffset(udp.deopt_exit);
udp.patcher->link(patchpoint, deopt_exit);
}
}
void NativeGenerator::linkIPtoBCMappings(const asmjit::CodeHolder& code) {
JIT_CHECK(code.hasBaseAddress(), "code not generated!");
uint64_t base = code.baseAddress();
for (const auto& mapping : env_.pending_ip_to_bc_offs) {
uintptr_t ip = base + code.labelOffsetFromBase(mapping.ip);
env_.code_rt->addIPtoBCOff(ip, mapping.bc_off);
}
}
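// A rough sketch of the resume entry emitted below (illustrative only). The
// native signature is resume(PyGenObject* gen, PyObject* sent_value,
// PyThreadState* tstate, finish_yield_from), matching the arg-register
// comments in the body:
//
//   GenDataFooter* footer = gen->gi_jit_data;
//   footer->linkAddress = <saved rbp of this call>;
//   footer->returnAddress = <return address of this call>;
//   footer->originalRbp = rbp;
//   rbp = footer;                      // spilled state lives in the generator
//   GenYieldPoint* yp = footer->yieldPoint;
//   footer->yieldPoint = NULL;
//   goto yp->resumeTarget;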
void NativeGenerator::generateResumeEntry() {
// Arbitrary scratch register for use throughout this function. Can be changed
// to pretty much anything which doesn't conflict with arg registers.
const auto scratch_r = x86::r8;
// arg #1 - rdi = PyGenObject* generator
const auto gen_r = x86::rdi;
// arg #2 - rsi = PyObject* sent_value
// arg #3 - rdx = tstate
// arg #4 - rcx = finish_yield_from
// Arg regs must not be modified as they may be used by the next resume stage.
auto cursor = as_->cursor();
as_->bind(env_.gen_resume_entry_label);
generateFunctionEntry();
setupFrameAndSaveCallerRegisters(x86::rdx);
// Setup RBP to use storage in generator rather than stack.
// Pointer to GenDataFooter. Could be any conflict-free register.
const auto jit_data_r = x86::r9;
// jit_data_r = gen->gi_jit_data
size_t gi_jit_data_offset = offsetof(PyGenObject, gi_jit_data);
as_->mov(jit_data_r, x86::ptr(gen_r, gi_jit_data_offset));
// Store linked frame address
size_t link_address_offset = offsetof(GenDataFooter, linkAddress);
as_->mov(scratch_r, x86::ptr(x86::rbp));
as_->mov(x86::ptr(jit_data_r, link_address_offset), scratch_r);
// Store return address
size_t return_address_offset = offsetof(GenDataFooter, returnAddress);
as_->mov(scratch_r, x86::ptr(x86::rbp, 8));
as_->mov(x86::ptr(jit_data_r, return_address_offset), scratch_r);
// Store "original" RBP
size_t original_rbp_offset = offsetof(GenDataFooter, originalRbp);
as_->mov(x86::ptr(jit_data_r, original_rbp_offset), x86::rbp);
// RBP = gen->gi_jit_data
as_->mov(x86::rbp, jit_data_r);
// Resume generator execution: load and clear yieldPoint, then jump to the
// resume target.
size_t yield_point_offset = offsetof(GenDataFooter, yieldPoint);
as_->mov(scratch_r, x86::ptr(x86::rbp, yield_point_offset));
as_->mov(x86::qword_ptr(x86::rbp, yield_point_offset), 0);
size_t resume_target_offset = GenYieldPoint::resumeTargetOffset();
as_->jmp(x86::ptr(scratch_r, resume_target_offset));
env_.addAnnotation("Resume entry point", cursor);
}
void NativeGenerator::generateStaticEntryPoint(
Label native_entry_point,
Label static_jmp_location) {
  // The static entry point is the first thing in the method; we'll jump back
  // to hit it so that we have a fixed offset to jump from.
auto static_link_cursor = as_->cursor();
Label static_entry_point = as_->newLabel();
as_->bind(static_entry_point);
generateFunctionEntry();
// Save incoming args across link call in loadOrGenerateLinkFrame. This is not
// needed for generators as they do not link a frame at this stage.
std::vector<std::pair<const x86::Reg, const x86::Reg>> save_regs;
bool not_enough_regs_for_args;
if (isGen()) {
not_enough_regs_for_args =
forEachArgumentRegInfo([&](std::optional<x86::Reg>, size_t) {});
} else {
not_enough_regs_for_args =
forEachArgumentRegInfo([&](std::optional<x86::Reg> r, size_t) {
if (r) {
save_regs.emplace_back(*r, *r);
}
});
}
loadOrGenerateLinkFrame(x86::r11, save_regs);
if (not_enough_regs_for_args) {
as_->lea(x86::r10, x86::ptr(x86::rbp, 16));
}
as_->jmp(native_entry_point);
env_.addAnnotation("StaticLinkFrame", static_link_cursor);
auto static_entry_point_cursor = as_->cursor();
as_->bind(static_jmp_location);
// force a long jump even if the static entry point is small so that we get
// a consistent offset for the static entry point from the normal entry point.
as_->long_().jmp(static_entry_point);
env_.addAnnotation("StaticEntryPoint", static_entry_point_cursor);
}
void NativeGenerator::generateCode(CodeHolder& codeholder) {
// The body must be generated before the prologue to determine how much spill
// space to allocate.
auto prologue_cursor = as_->cursor();
generateAssemblyBody();
auto epilogue_cursor = as_->cursor();
as_->setCursor(prologue_cursor);
Label correct_arg_count = as_->newLabel();
Label native_entry_point = as_->newLabel();
PyCodeObject* code = GetFunction()->code;
Label static_jmp_location = as_->newLabel();
bool has_static_entry = (code->co_flags & CO_STATICALLY_COMPILED) &&
!GetFunction()->uses_runtime_func;
if (has_static_entry) {
    // Set up an entry point for direct static-to-static
    // calls using the native calling convention.
generateStaticEntryPoint(native_entry_point, static_jmp_location);
}
  // Set up an entry for when we have the correct number of arguments. This is
  // dispatched back to from JITRT_CallWithIncorrectArgcount and
  // JITRT_CallWithKeywordArgs when we need to perform complicated
  // argument binding.
auto arg_reentry_cursor = as_->cursor();
Label correct_args_entry = as_->newLabel();
as_->bind(correct_args_entry);
generateFunctionEntry();
as_->long_().jmp(correct_arg_count);
env_.addAnnotation("Reentry with processed args", arg_reentry_cursor);
  // Set up the normal entry point, which implements the
  // vectorcall convention.
auto entry_label = as_->newLabel();
as_->bind(entry_label);
generatePrologue(correct_arg_count, native_entry_point);
generateEpilogue(epilogue_cursor);
if (GetFunction()->code->co_flags & kCoFlagsAnyGenerator) {
generateResumeEntry();
}
if (env_.static_arg_typecheck_failed_label.isValid()) {
auto static_typecheck_cursor = as_->cursor();
as_->bind(env_.static_arg_typecheck_failed_label);
if (GetFunction()->returnsPrimitive()) {
if (GetFunction()->returnsPrimitiveDouble()) {
as_->call(reinterpret_cast<uint64_t>(
JITRT_ReportStaticArgTypecheckErrorsWithDoubleReturn));
} else {
as_->call(reinterpret_cast<uint64_t>(
JITRT_ReportStaticArgTypecheckErrorsWithPrimitiveReturn));
}
} else {
as_->call(
reinterpret_cast<uint64_t>(JITRT_ReportStaticArgTypecheckErrors));
}
as_->leave();
as_->ret();
env_.addAnnotation(
"Static argument typecheck failure stub", static_typecheck_cursor);
}
generateDeoptExits();
ASM_CHECK_THROW(as_->finalize());
ASM_CHECK_THROW(CodeAllocator::get()->addCode(&entry_, &codeholder));
// ------------- orig_entry
// ^
// | JITRT_STATIC_ENTRY_OFFSET (2 bytes, optional)
// | JITRT_CALL_REENTRY_OFFSET (6 bytes)
// v
// ------------- entry_
void* orig_entry = entry_;
if (has_static_entry) {
JIT_CHECK(
codeholder.labelOffset(static_jmp_location) ==
codeholder.labelOffset(entry_label) + JITRT_STATIC_ENTRY_OFFSET,
"bad static-entry offset %d ",
codeholder.labelOffset(entry_label) -
codeholder.labelOffset(static_jmp_location));
}
JIT_CHECK(
codeholder.labelOffset(correct_args_entry) ==
codeholder.labelOffset(entry_label) + JITRT_CALL_REENTRY_OFFSET,
"bad re-entry offset");
linkDeoptPatchers(codeholder);
linkIPtoBCMappings(codeholder);
entry_ = ((char*)entry_) + codeholder.labelOffset(entry_label);
for (auto& entry : env_.unresolved_gen_entry_labels) {
entry.first->setResumeTarget(
codeholder.labelOffsetFromBase(entry.second) +
codeholder.baseAddress());
}
// After code generation CodeHolder->codeSize() *should* return the actual
// size of the generated code and associated data. This relies on the
// implementation of asmjit::JitRuntime::_add and may break in the future.
JIT_DCHECK(
codeholder.codeSize() < INT_MAX, "Code size is larger than INT_MAX");
compiled_size_ = codeholder.codeSize();
if (g_dump_hir_passes_json != nullptr) {
env_.annotations.disassembleJSON(*json, orig_entry, codeholder);
}
JIT_LOGIF(
g_dump_asm,
"Disassembly for %s\n%s",
GetFunction()->fullname,
env_.annotations.disassemble(orig_entry, codeholder));
for (auto& x : env_.function_indirections) {
Label trampoline = x.second.trampoline;
*x.second.indirect =
(void*)(codeholder.labelOffset(trampoline) + codeholder.baseAddress());
}
const hir::Function* func = GetFunction();
std::string prefix = [&] {
switch (func->frameMode) {
case FrameMode::kNormal:
return perf::kFuncSymbolPrefix;
case FrameMode::kShadow:
return perf::kShadowFrameSymbolPrefix;
}
JIT_CHECK(false, "Invalid frame mode");
}();
// For perf, we want only the size of the code, so we get that directly from
// the .text section.
perf::registerFunction(
entry_, codeholder.textSection()->realSize(), func->fullname, prefix);
}
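// Finds LoadMethod/LoadMethodSuper instructions whose output feeds a
// CallMethod later in the same basic block, e.g. (HIR sketch, operand order
// illustrative):
//
//   v1 = LoadMethod<"append"> v0
//   v2 = CallMethod ... v1 ...
//
// Matching pairs are recorded in env_.optimizable_load_call_methods_ so later
// stages can use the optimized LOAD_METHOD/CALL_METHOD path; any intervening
// redefinition of the LoadMethod output invalidates the candidate.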
void NativeGenerator::CollectOptimizableLoadMethods() {
auto func = GetFunction();
for (auto& block : func->cfg.blocks) {
const Instr* candidate = nullptr;
for (auto& instr : block) {
auto output = instr.GetOutput();
if (output == nullptr) {
continue;
}
switch (instr.opcode()) {
case Opcode::kLoadMethod: {
candidate = reinterpret_cast<const LoadMethod*>(&instr);
break;
}
case Opcode::kLoadMethodSuper: {
candidate = reinterpret_cast<const LoadMethodSuper*>(&instr);
break;
}
case Opcode::kCallMethod: {
if (candidate != nullptr &&
hir::modelReg(instr.GetOperand(1)) == candidate->GetOutput()) {
env_.optimizable_load_call_methods_.emplace(candidate);
env_.optimizable_load_call_methods_.emplace(&instr);
candidate = nullptr;
}
break;
}
default: {
if (candidate != nullptr && output == candidate->GetOutput()) {
candidate = nullptr;
}
break;
}
}
}
}
}
#ifdef __ASM_DEBUG
const char* NativeGenerator::GetPyFunctionName() const {
return PyUnicode_AsUTF8(GetFunction()->code->co_name);
}
#endif
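// Memory operands can only encode a signed 32-bit displacement; the exception
// is rax, which can use the moffs64 form of mov to load or store a full 64-bit
// absolute address. Hence any register works for addresses that fit in 32
// bits, otherwise only rax does.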
bool canLoadStoreAddr(asmjit::x86::Gp reg, int64_t addr) {
return reg == x86::rax || (addr >= INT32_MIN && addr <= INT32_MAX);
}
static void raiseUnboundLocalError(BorrowedRef<> name) {
PyErr_Format(
PyExc_UnboundLocalError,
"local variable '%.200U' referenced before assignment",
name);
}
static void raiseUnboundFreevarError(BorrowedRef<> name) {
PyErr_Format(
PyExc_NameError,
"free variable '%.200U' referenced before assignment in enclosing scope",
name);
}
static void raiseAttributeError(BorrowedRef<> receiver, BorrowedRef<> name) {
PyErr_Format(
PyExc_AttributeError,
"'%.50s' object has no attribute '%U'",
Py_TYPE(receiver)->tp_name,
name);
}
static void releaseRefs(
const std::vector<LiveValue>& live_values,
const MemoryView& mem) {
for (const auto& value : live_values) {
switch (value.ref_kind) {
case jit::hir::RefKind::kUncounted:
case jit::hir::RefKind::kBorrowed: {
continue;
}
case jit::hir::RefKind::kOwned: {
PyObject* obj = mem.read(value, true);
// Reference may be NULL if value is not definitely assigned
Py_XDECREF(obj);
break;
}
}
}
}
static PyFrameObject* prepareForDeopt(
const uint64_t* regs,
Runtime* runtime,
std::size_t deopt_idx,
const JITRT_CallMethodKind* call_method_kind) {
JIT_CHECK(deopt_idx != -1ull, "deopt_idx must be valid");
const DeoptMetadata& deopt_meta = runtime->getDeoptMetadata(deopt_idx);
PyThreadState* tstate = _PyThreadState_UncheckedGet();
Ref<PyFrameObject> f = materializePyFrameForDeopt(tstate);
PyFrameObject* frame = f.release();
PyFrameObject* frame_iter = frame;
_PyShadowFrame* sf_iter = tstate->shadow_frame;
// Iterate one past the inline depth because that is the caller frame.
for (int i = deopt_meta.inline_depth; i >= 0; i--) {
// Transfer ownership of shadow frame to the interpreter. The associated
// Python frame will be ignored during future attempts to materialize the
// stack.
_PyShadowFrame_SetOwner(sf_iter, PYSF_INTERP);
reifyFrame(
frame_iter,
deopt_meta,
deopt_meta.frame_meta.at(i),
regs,
call_method_kind);
frame_iter = frame_iter->f_back;
sf_iter = sf_iter->prev;
}
Ref<> deopt_obj;
// Clear our references now that we've transferred them to the frame
MemoryView mem{regs};
deopt_obj = profileDeopt(deopt_idx, deopt_meta, mem);
releaseRefs(deopt_meta.live_values, mem);
if (!PyErr_Occurred()) {
auto reason = deopt_meta.reason;
switch (reason) {
case DeoptReason::kGuardFailure: {
runtime->guardFailed(deopt_meta);
break;
}
case DeoptReason::kUnhandledNullField:
raiseAttributeError(deopt_obj, deopt_meta.eh_name);
break;
case DeoptReason::kUnhandledUnboundLocal:
raiseUnboundLocalError(deopt_meta.eh_name);
break;
case DeoptReason::kUnhandledUnboundFreevar:
raiseUnboundFreevarError(deopt_meta.eh_name);
break;
case DeoptReason::kUnhandledException:
JIT_CHECK(false, "unhandled exception without error set");
break;
case DeoptReason::kRaise:
// This code mirrors what happens in _PyEval_EvalFrameDefault although
// I'm not sure how to test it. Not clear it can happen with JIT.
#ifdef NDEBUG
if (!PyErr_Occurred()) {
PyErr_SetString(
PyExc_SystemError, "error return without exception set");
}
#else
JIT_CHECK(PyErr_Occurred(), "Error return without exception set");
#endif
break;
case jit::DeoptReason::kRaiseStatic:
JIT_CHECK(false, "Lost exception when raising static exception");
break;
case DeoptReason::kReraise:
PyErr_SetString(PyExc_RuntimeError, "No active exception to reraise");
break;
}
}
return frame;
}
static PyObject* resumeInInterpreter(
PyFrameObject* frame,
Runtime* runtime,
std::size_t deopt_idx) {
if (frame->f_gen) {
auto gen = reinterpret_cast<PyGenObject*>(frame->f_gen);
// It's safe to call JITRT_GenJitDataFree directly here, rather than
    // through _PyJIT_GenDealloc. Ownership of all references has been
    // transferred to the frame.
JITRT_GenJitDataFree(gen);
gen->gi_jit_data = nullptr;
}
PyThreadState* tstate = PyThreadState_Get();
PyObject* result = nullptr;
// Resume all of the inlined frames and the caller
const DeoptMetadata& deopt_meta = runtime->getDeoptMetadata(deopt_idx);
int inline_depth = deopt_meta.inline_depth;
int err_occurred = (deopt_meta.reason != DeoptReason::kGuardFailure);
while (inline_depth >= 0) {
// TODO(emacs): Investigate skipping resuming frames that do not have
// try/catch. Will require re-adding _PyShadowFrame_Pop back for
// non-generators and unlinking the frame manually.
// We need to maintain the invariant that there is at most one shadow frame
    // on the shadow stack for each frame on the Python stack. Unless we are a
    // generator, the interpreter will insert a new entry on the shadow stack
// when execution resumes there, so we remove our entry.
if (!frame->f_gen) {
_PyShadowFrame_Pop(tstate, tstate->shadow_frame);
}
// Resume one frame.
PyFrameObject* prev_frame = frame->f_back;
result = PyEval_EvalFrameEx(frame, err_occurred);
// The interpreter loop handles unlinking the frame from the execution
// stack so we just need to decref.
JITRT_DecrefFrame(frame);
frame = prev_frame;
err_occurred = result == nullptr;
// Push the previous frame's result onto the value stack. We can't push
// after resuming because f_stacktop is NULL during execution of a frame.
if (!err_occurred) {
if (inline_depth > 0) {
// The caller is at inline depth 0, so we only attempt to push the
// result onto the stack in the deeper (> 0) frames. Otherwise, we
// should just return the value from the native code in the way our
// native calling convention requires.
*(frame->f_stacktop)++ = result;
}
}
inline_depth--;
}
return result;
}
void* generateDeoptTrampoline(bool generator_mode) {
CodeHolder code;
code.init(CodeAllocator::get()->asmJitCodeInfo());
x86::Builder a(&code);
Annotations annot;
auto annot_cursor = a.cursor();
// When we get here the stack has the following layout. The space on the
// stack for the call arg buffer / LOAD_METHOD scratch space is always safe
// to read, but its contents will depend on the function being compiled as
// well as the program point at which deopt occurs. We pass a pointer to it
// into the frame reification code so that it can properly reconstruct the
  // interpreter's stack when the result of a LOAD_METHOD is on the
// stack. See the comments in reifyStack in deopt.cpp for more details.
//
// +-------------------------+
// | ... |
// | ? call arg buffer |
// | ^ LOAD_METHOD scratch |
// +-------------------------+ <-- end of JIT's fixed frame
// | index of deopt metadata |
// | saved rip |
// | padding |
// | address of epilogue |
// | r15 | <-- rsp
// +-------------------------+
//
// Save registers for use in frame reification. Once these are saved we're
// free to clobber any caller-saved registers.
//
// IF YOU USE CALLEE-SAVED REGISTERS YOU HAVE TO RESTORE THEM MANUALLY BEFORE
  // EXITING THE TRAMPOLINE.
a.push(x86::r14);
a.push(x86::r13);
a.push(x86::r12);
a.push(x86::r11);
a.push(x86::r10);
a.push(x86::r9);
a.push(x86::r8);
a.push(x86::rdi);
a.push(x86::rsi);
a.push(x86::rbp);
a.push(x86::rsp);
a.push(x86::rbx);
a.push(x86::rdx);
a.push(x86::rcx);
a.push(x86::rax);
annot.add("saveRegisters", &a, annot_cursor);
if (generator_mode) {
// Restore original RBP for use in epilogue.
RestoreOriginalGeneratorRBP(a.as<x86::Emitter>());
}
// Set up a stack frame for the trampoline so that:
//
// 1. Runtime code in the JIT that is used to update PyFrameObjects can find
// the saved rip at the expected location immediately following the end of
// the JIT's fixed frame.
  // 2. The JIT-compiled function shows up in C stack traces when it is
// deopting. Only the deopt trampoline will appear in the trace if
// we don't open a frame.
//
// Right now the stack has the following layout:
//
// +-------------------------+ <-- end of JIT's fixed frame
// | index of deopt metadata |
// | saved rip |
// | padding |
// | address of epilogue |
// | r15 |
// | ... |
// | rax | <-- rsp
// +-------------------------+
//
// We want our frame to look like:
//
// +-------------------------+ <-- end of JIT's fixed frame
// | saved rip |
// | saved rbp | <-- rbp
// | index of deopt metadata |
// | address of epilogue |
// | r15 |
// | ... |
// | rax | <-- rsp
// +-------------------------+
//
// Load the saved rip passed to us from the JIT-compiled function, which
// resides where we're supposed to save rbp.
auto saved_rbp_addr =
x86::ptr(x86::rsp, (PhyLocation::NUM_GP_REGS + 2) * kPointerSize);
a.mov(x86::rdi, saved_rbp_addr);
// Save rbp and set up our frame
a.mov(saved_rbp_addr, x86::rbp);
a.lea(x86::rbp, saved_rbp_addr);
// Load the index of the deopt metadata, which resides where we're supposed to
// save rip.
auto saved_rip_addr = x86::ptr(x86::rbp, kPointerSize);
a.mov(x86::rsi, saved_rip_addr);
a.mov(saved_rip_addr, x86::rdi);
// Save the index of the deopt metadata
auto deopt_meta_addr = x86::ptr(x86::rbp, -kPointerSize);
a.mov(deopt_meta_addr, x86::rsi);
// Prep the frame for evaluation in the interpreter.
//
// We pass the array of saved registers, a pointer to the runtime, the index
// of deopt metadata, and the call method kind.
annot_cursor = a.cursor();
a.mov(x86::rdi, x86::rsp);
a.mov(
x86::rsi, reinterpret_cast<uint64_t>(NativeGeneratorFactory::runtime()));
a.mov(x86::rdx, deopt_meta_addr);
auto call_method_kind_addr = x86::ptr(x86::rbp, 2 * kPointerSize);
a.lea(x86::rcx, call_method_kind_addr);
static_assert(
std::is_same_v<
decltype(prepareForDeopt),
PyFrameObject*(
const uint64_t*,
Runtime*,
std::size_t,
const JITRT_CallMethodKind*)>,
"prepareForDeopt has unexpected signature");
a.call(reinterpret_cast<uint64_t>(prepareForDeopt));
// Clean up saved registers.
//
// This isn't strictly necessary but saves 128 bytes on the stack if we end
// up resuming in the interpreter.
a.add(x86::rsp, (PhyLocation::NUM_GP_REGS - 1) * kPointerSize);
// We have to restore our scratch register manually since it's callee-saved
// and the stage 2 trampoline used it to hold the address of this
// trampoline. We can't rely on the JIT epilogue to restore it for us, as the
// JIT-compiled code may not have spilled it.
a.pop(deopt_scratch_reg);
annot.add("prepareForDeopt", &a, annot_cursor);
// Resume execution in the interpreter.
annot_cursor = a.cursor();
// First argument: frame returned from prepareForDeopt.
a.mov(x86::rdi, x86::rax);
// Second argument: runtime.
a.mov(
x86::rsi, reinterpret_cast<uint64_t>(NativeGeneratorFactory::runtime()));
// Third argument: DeoptMetadata index.
a.mov(x86::rdx, x86::ptr(x86::rsp, kPointerSize));
static_assert(
std::is_same_v<
decltype(resumeInInterpreter),
PyObject*(PyFrameObject*, Runtime*, std::size_t)>,
"resumeInInterpreter has unexpected signature");
a.call(reinterpret_cast<uint64_t>(resumeInInterpreter));
annot.add("resumeInInterpreter", &a, annot_cursor);
// If we return a primitive and prepareForDeopt returned null, we need that
// null in edx/xmm1 to signal error to our caller. Since this trampoline is
// shared, we do this move unconditionally, but even if not needed, it's
// harmless. (To eliminate it, we'd need another trampoline specifically for
// deopt of primitive-returning functions, just to do this one move.)
a.mov(x86::edx, x86::eax);
a.movq(x86::xmm1, x86::eax);
// Now we're done. Get the address of the epilogue and jump there.
annot_cursor = a.cursor();
auto epilogue_addr = x86::ptr(x86::rbp, -2 * kPointerSize);
a.mov(x86::rdi, epilogue_addr);
// Remove our frame from the stack
a.leave();
  // Pop the saved rip off the stack. Normally it would be consumed by a `ret`;
  // we must discard it manually because we're jumping directly to the epilogue.
a.sub(x86::rsp, -kPointerSize);
a.jmp(x86::rdi);
annot.add("jumpToRealEpilogue", &a, annot_cursor);
auto name =
generator_mode ? "deopt_trampoline_generators" : "deopt_trampoline";
void* result{nullptr};
ASM_CHECK(a.finalize(), name);
ASM_CHECK(CodeAllocator::get()->addCode(&result, &code), name);
JIT_LOGIF(
g_dump_asm,
"Disassembly for %s\n%s",
name,
annot.disassemble(result, code));
auto code_size = code.textSection()->realSize();
register_raw_debug_symbol(name, __FILE__, __LINE__, result, code_size, 0);
perf::registerFunction(result, code_size, name);
return result;
}
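// A rough sketch of the compile-on-first-call trampoline emitted below
// (illustrative only; r10 holds the function object, loaded by the
// per-function stub):
//
//   save the six incoming arg registers on the stack;
//   compiled = 0;
//   entry = JITRT_CompileFunction(func, saved_args, &compiled);
//   restore the arg registers;
//   if (compiled) goto entry;  // tail-jump into the freshly compiled code
//   return entry;              // otherwise return whatever the helper returned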
void* generateJitTrampoline() {
CodeHolder code;
code.init(CodeAllocator::get()->asmJitCodeInfo());
x86::Builder a(&code);
Annotations annot;
auto annot_cursor = a.cursor();
a.push(x86::rbp);
a.mov(x86::rbp, x86::rsp);
  // Save space for the "compiled" out-argument and keep the stack 16-byte
  // aligned.
a.sub(x86::rsp, sizeof(void*) * 2);
// save incoming arg registers
const int saved_reg_count = 6;
a.push(x86::r9);
a.push(x86::r8);
a.push(x86::rcx);
a.push(x86::rdx);
a.push(x86::rsi);
a.push(x86::rdi);
annot.add("saveRegisters", &a, annot_cursor);
// r10 contains the function object from our stub
a.mov(x86::rdi, x86::r10);
a.mov(x86::rsi, x86::rsp);
a.lea(
x86::rdx,
x86::ptr(
x86::rsp, sizeof(void*) * saved_reg_count)); // compiled indicator
a.call(reinterpret_cast<uint64_t>(JITRT_CompileFunction));
a.cmp(x86::byte_ptr(x86::rsp, sizeof(void*) * saved_reg_count), 0);
auto compile_failed = a.newLabel();
a.je(compile_failed);
// restore registers, and jump to JITed code
a.pop(x86::rdi);
a.pop(x86::rsi);
a.pop(x86::rdx);
a.pop(x86::rcx);
a.pop(x86::r8);
a.pop(x86::r9);
a.leave();
a.jmp(x86::rax);
auto name = "JitTrampoline";
a.bind(compile_failed);
a.leave();
a.ret();
ASM_CHECK(a.finalize(), name);
void* result{nullptr};
ASM_CHECK(CodeAllocator::get()->addCode(&result, &code), name);
JIT_LOGIF(
g_dump_asm,
"Disassembly for %s\n%s",
name,
annot.disassemble(result, code));
auto code_size = code.textSection()->realSize();
register_raw_debug_symbol(name, __FILE__, __LINE__, result, code_size, 0);
perf::registerFunction(result, code_size, name);
return result;
}
void NativeGenerator::generateAssemblyBody() {
auto as = env_.as;
auto& blocks = lir_func_->basicblocks();
for (auto& basicblock : blocks) {
env_.block_label_map.emplace(basicblock, as->newLabel());
}
for (lir::BasicBlock* basicblock : blocks) {
as->bind(map_get(env_.block_label_map, basicblock));
for (auto& instr : basicblock->instructions()) {
asmjit::BaseNode* cursor = as->cursor();
autogen::AutoTranslator::getInstance().translateInstr(&env_, instr.get());
if (instr->origin() != nullptr) {
env_.addAnnotation(instr.get(), cursor);
}
}
}
}
int NativeGenerator::calcFrameHeaderSize(const hir::Function* func) {
return func == nullptr ? 0 : sizeof(FrameHeader);
}
// calcMaxInlineDepth must work with nullptr HIR functions because it's valid
// to call NativeGenerator with only LIR (e.g., from a test). In the case of an
// LIR-only function, there is no HIR inlining.
int NativeGenerator::calcMaxInlineDepth(const hir::Function* func) {
if (func == nullptr) {
return 0;
}
int result = 0;
for (const auto& block : func->cfg.blocks) {
for (const auto& instr : block) {
auto bif = dynamic_cast<const BeginInlinedFunction*>(&instr);
if (!bif) {
continue;
}
int depth = bif->inlineDepth();
result = std::max(depth, result);
}
}
return result;
}
size_t NativeGenerator::numGpRegsForArgs() const {
return codegen::numGpRegsForArgs(GetFunction()->code);
}
// Calls cb() once for each function argument. If that argument is initially
// allocated to a register, provides that register.
// Returns true if the GP or FP argument registers were exhausted, i.e. if some
// arguments may not initially be mapped to registers. Registers/arguments can
// be a mixture of general-purpose and floating-point (XMMx).
bool NativeGenerator::forEachArgumentRegInfo(
std::function<void(std::optional<asmjit::x86::Reg>, size_t)> cb) const {
size_t total_args = (size_t)GetFunction()->numArgs();
const std::vector<TypedArgument>& checks = GetFunction()->typed_args;
size_t gp_index = 0, fp_index = 0;
for (size_t i = 0, check_index = 0; i < total_args; i++) {
if (check_index < checks.size() &&
checks[check_index].locals_idx == (int)i &&
checks[check_index++].jit_type <= TCDouble) {
if (fp_index < FP_ARGUMENT_REG_COUNT) {
cb(x86::xmm(FP_ARGUMENT_REGS[fp_index++] - PhyLocation::XMM_REG_BASE),
i);
} else {
cb({}, i);
}
continue;
}
if (gp_index < numGpRegsForArgs()) {
PhyLocation phy_reg;
if (GetFunction()->code->co_flags & CO_COROUTINE &&
gp_index >= CORO_NARGSF_ARG_IDX) {
        // Skip RDX, which holds the awaited flag from nargsf for coroutines.
phy_reg = GP_ARGUMENT_REGS[1 + gp_index];
} else {
phy_reg = GP_ARGUMENT_REGS[gp_index];
}
cb(x86::gpq(phy_reg), i);
gp_index++;
} else {
cb({}, i);
}
}
return gp_index >= numGpRegsForArgs() || fp_index >= FP_ARGUMENT_REG_COUNT;
}
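// Example (illustrative): for a static function f(a, b: double, c), the
// callback above would typically see a -> first GP arg register, b -> first FP
// arg register (xmm), c -> second GP arg register; any argument with no
// register left is reported with an empty optional, and the return value flags
// that the GP or FP argument registers were exhausted.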
} // namespace codegen
} // namespace jit