erts/emulator/beam/jit/beam_asm.hpp
/*
* %CopyrightBegin%
*
* Copyright Ericsson AB 2020-2020. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* %CopyrightEnd%
*/
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <asmjit/asmjit.h>
extern "C"
{
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "sys.h"
#include "erl_vm.h"
#include "global.h"
#include "beam_catches.h"
#include "beam_asm.h"
}
class ArgVal {
BeamOpArg gen_op;
public:
enum TYPE {
u = TAG_u,
i = TAG_i,
x = TAG_x,
y = TAG_y,
f = TAG_f,
q = TAG_q,
e = TAG_r,
l = TAG_l /* float register */
};
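    /* Rough mapping from BEAM assembly operands (illustrative only): an
     * operand written as `{x,0}` becomes ArgVal(ArgVal::x, 0), a jump target
     * `{f,Lbl}` becomes ArgVal(ArgVal::f, Lbl), `q` denotes a literal and
     * `i` a tagged immediate. The loader is what actually constructs these. */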
ArgVal(const BeamOpArg &arg) {
gen_op = arg;
}
ArgVal(enum TYPE t, BeamInstr val) {
gen_op.type = t;
gen_op.val = val;
}
    ArgVal(unsigned t, BeamInstr val) {
#ifdef DEBUG
        switch (t) {
        case TAG_u:
        case TAG_i:
        case TAG_x:
        case TAG_y:
        case TAG_f:
        case TAG_q:
        case TAG_r:
        case TAG_l:
            break;
        default:
            ASSERT(0);
        }
#endif
        gen_op.type = t;
        gen_op.val = val;
    }
constexpr enum TYPE getType() const {
return (enum TYPE)gen_op.type;
}
constexpr uint64_t getValue() const {
return gen_op.val;
}
constexpr bool isMem() const {
return gen_op.type == x || gen_op.type == y;
}
constexpr bool isLiteral() const {
return gen_op.type == q;
}
constexpr bool isImmed() const {
return gen_op.type == i;
}
template<typename T>
ArgVal operator+(T val) const {
return ArgVal(gen_op.type, val + gen_op.val);
}
template<typename T>
ArgVal operator*(T val) const {
return ArgVal(gen_op.type, val * gen_op.val);
}
enum Relation { none, consecutive, reverse_consecutive };
static Relation register_relation(const ArgVal &arg1, const ArgVal &arg2) {
TYPE type = arg1.getType();
bool same_reg_types =
type == arg2.getType() && (type == TYPE::x || type == TYPE::y);
if (!same_reg_types) {
return none;
} else if (arg1.getValue() + 1 == arg2.getValue()) {
return consecutive;
} else if (arg1.getValue() == arg2.getValue() + 1) {
return reverse_consecutive;
} else {
return none;
}
};
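    /* For example, register_relation({x,1}, {x,2}) is `consecutive` while
     * register_relation({x,2}, {x,1}) is `reverse_consecutive`; the emitter
     * can use this to fuse adjacent register moves (e.g. into a single
     * 128-bit move) when the operands happen to be neighbours. */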
};
using namespace asmjit;
class BeamAssembler : public ErrorHandler {
protected:
/* Holds code and relocation information. */
CodeHolder code;
/* TODO: Want to change this to x86::Builder in order to be able to patch
* the correct I into the code after code generation */
x86::Assembler a;
FileLogger logger;
Section *rodata = nullptr;
/* * * * * * * * * */
/* Points at x_reg_array inside an ErtsSchedulerRegisters struct, allowing
* the aux_regs field to be addressed with an 8-bit displacement. */
const x86::Gp registers = x86::rbx;
#ifdef NATIVE_ERLANG_STACK
    /* The Erlang stack pointer. Note that it uses RSP and is therefore
     * invalid when running on the runtime stack. */
const x86::Gp E = x86::rsp;
/* Cached copy of Erlang stack pointer used to speed up stack switches when
* we know that the runtime doesn't read or modify the Erlang stack.
*
* If we find ourselves pressed for registers in the future, we could save
* this in the same slot as `registers` as that can be trivially recomputed
* from the top of the runtime stack. */
const x86::Gp E_saved = x86::r12;
#else
const x86::Gp E = x86::r12;
#endif
const x86::Gp c_p = x86::r13;
const x86::Gp FCALLS = x86::r14;
const x86::Gp HTOP = x86::r15;
/* Local copy of the active code index.
*
* This is set to ERTS_SAVE_CALLS_CODE_IX when save_calls is active, which
* routes us to a common handler routine that calls save_calls before
* jumping to the actual code. */
const x86::Gp active_code_ix = x86::rbp;
#ifdef ERTS_MSACC_EXTENDED_STATES
const x86::Mem erts_msacc_cache = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.erts_msacc_cache));
#endif
/* * * * * * * * * */
#ifdef WIN32
const x86::Gp ARG1 = x86::rcx;
const x86::Gp ARG2 = x86::rdx;
const x86::Gp ARG3 = x86::r8;
const x86::Gp ARG4 = x86::r9;
const x86::Gp ARG5 = x86::r10;
const x86::Gp ARG6 = x86::r11;
const x86::Gp ARG1d = x86::ecx;
const x86::Gp ARG2d = x86::edx;
const x86::Gp ARG3d = x86::r8d;
const x86::Gp ARG4d = x86::r9d;
const x86::Gp ARG5d = x86::r10d;
const x86::Gp ARG6d = x86::r11d;
#else
const x86::Gp ARG1 = x86::rdi;
const x86::Gp ARG2 = x86::rsi;
const x86::Gp ARG3 = x86::rdx;
const x86::Gp ARG4 = x86::rcx;
const x86::Gp ARG5 = x86::r8;
const x86::Gp ARG6 = x86::r9;
const x86::Gp ARG1d = x86::edi;
const x86::Gp ARG2d = x86::esi;
const x86::Gp ARG3d = x86::edx;
const x86::Gp ARG4d = x86::ecx;
const x86::Gp ARG5d = x86::r8d;
const x86::Gp ARG6d = x86::r9d;
#endif
const x86::Gp RET = x86::rax;
const x86::Gp RETd = x86::eax;
const x86::Gp RETb = x86::al;
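    /* ARG1-ARG6 follow the native C calling convention: rcx/rdx/r8/r9 on
     * Windows (where the fifth and sixth arguments are passed on the stack,
     * which is why runtime_call below pushes ARG5/ARG6 for 5- and 6-argument
     * calls) and rdi/rsi/rdx/rcx/r8/r9 in the System V ABI. The ARGnd forms
     * are the 32-bit views of the same registers. */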
const x86::Mem TMP_MEM1q = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[0]));
const x86::Mem TMP_MEM2q = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[1]));
const x86::Mem TMP_MEM3q = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[2]));
const x86::Mem TMP_MEM4q = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[3]));
const x86::Mem TMP_MEM5q = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[4]));
const x86::Mem TMP_MEM1d = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[0]),
sizeof(Uint32));
const x86::Mem TMP_MEM2d = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[1]),
sizeof(Uint32));
const x86::Mem TMP_MEM3d = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[2]),
sizeof(Uint32));
const x86::Mem TMP_MEM4d = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[3]),
sizeof(Uint32));
const x86::Mem TMP_MEM5d = getSchedulerRegRef(
offsetof(ErtsSchedulerRegisters, aux_regs.d.TMP_MEM[4]),
sizeof(Uint32));
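    /* The TMP_MEMnd forms are 32-bit views of the same scheduler-local
     * TMP_MEM slots as TMP_MEMnq. A typical use is stashing a value across a
     * call, e.g. `a.mov(TMP_MEM1q, RET)` followed later by
     * `a.mov(RET, TMP_MEM1q)`. */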
enum Distance { dShort, dLong };
public:
static bool hasCpuFeature(uint32_t featureId);
BeamAssembler() : code() {
/* Setup with default code info */
Error err = code.init(hostEnvironment());
ERTS_ASSERT(!err && "Failed to init codeHolder");
err = code.newSection(&rodata,
".rodata",
SIZE_MAX,
Section::kFlagConst,
8);
ERTS_ASSERT(!err && "Failed to create .rodata section");
err = code.attach(&a);
ERTS_ASSERT(!err && "Failed to attach codeHolder");
#ifdef DEBUG
a.addValidationOptions(BaseEmitter::kValidationOptionAssembler);
#endif
code.setErrorHandler(this);
}
BeamAssembler(const std::string &log) : BeamAssembler() {
if (erts_asm_dump) {
setLogger(log + ".asm");
}
}
~BeamAssembler() {
if (logger.file())
fclose(logger.file());
}
void *getBaseAddress() {
ASSERT(code.hasBaseAddress());
return (void *)code.baseAddress();
}
size_t getOffset() {
return a.offset();
}
/*
* Generate the shortest instruction for setting a register to an immediate
* value. May clear flags.
*/
void mov_imm(x86::Gp to, Uint value) {
if (value == 0) {
/*
* Generate the shortest instruction to set the register to zero.
*
* 48 c7 c0 00 00 00 00 mov rax, 0
* b8 00 00 00 00 mov eax, 0
* 31 c0 xor eax, eax
*
* Thus, "xor eax, eax" is five bytes shorter than "mov rax, 0".
*
             * Note: xor clobbers the flags (ZF is set, CF/OF are cleared),
             * while mov does not change any flags.
*/
a.xor_(to.r32(), to.r32());
        } else if (Support::isUInt32(value)) {
/*
* Generate the shortest instruction to set the register
* to an unsigned immediate value that fits in 32 bits.
*
* 48 c7 c0 2a 00 00 00 mov rax, 42
* b8 2a 00 00 00 mov eax, 42
*/
a.mov(to.r32(), imm(value));
} else {
a.mov(to, imm(value));
}
}
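    /* For example, mov_imm(RET, am_true) loads the tagged atom `true` using
     * the shortest encoding, and mov_imm(ARG1, 0) collapses to a two-byte
     * `xor ARG1d, ARG1d`. */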
protected:
void *_codegen() {
Error err = code.flatten();
ERTS_ASSERT(!err && "Could not flatten code");
err = code.resolveUnresolvedLinks();
ERTS_ASSERT(!err && "Could not resolve all links");
/* Verify that all labels are bound */
#ifdef DEBUG
for (auto e : code.labelEntries()) {
if (!e->isBound()) {
erts_exit(ERTS_ABORT_EXIT, "Label %s is not bound", e->name());
}
}
#endif
        /* The code needs to be 16-byte aligned, so we allocate a little extra
         * and then align the result. The 16-byte alignment is required for
         * the code alignment functions to work; if we ever switch to a
         * 32-byte alignment, the code must be aligned to 32 bytes as well. */
void *module =
(void *)erts_alloc(ERTS_ALC_T_CODE, code.codeSize() + 16);
uint64_t aligned_module =
(uint64_t)module + (16 - ((uint64_t)module) % 16);
ERTS_ASSERT((uint64_t)aligned_module % 16 == 0);
code.relocateToBase(aligned_module);
code.copyFlattenedData((void *)aligned_module,
code.codeSize(),
CodeHolder::kCopyPadSectionBuffer);
#ifdef WIN32
DWORD old;
if (!VirtualProtect((void *)aligned_module,
code.codeSize(),
PAGE_EXECUTE_READWRITE,
&old)) {
erts_exit(-2, "Could not change memory protection");
}
#endif
return module;
}
void *getCode(Label label) {
ASSERT(label.isValid());
return (char *)getBaseAddress() + code.labelOffsetFromBase(label);
}
byte *getCode(char *labelName) {
return (byte *)getCode(code.labelByName(labelName, strlen(labelName)));
}
void handleError(Error err, const char *message, BaseEmitter *origin) {
comment(message);
fflush(logger.file());
ASSERT(0 && "Fault instruction encode");
}
constexpr x86::Mem getRuntimeStackRef() const {
int base = offsetof(ErtsSchedulerRegisters, aux_regs.d.runtime_stack);
return getSchedulerRegRef(base);
}
#if !defined(NATIVE_ERLANG_STACK)
# ifdef HARD_DEBUG
constexpr x86::Mem getInitialSPRef() const {
int base = offsetof(ErtsSchedulerRegisters, aux_regs.d.initial_sp);
return getSchedulerRegRef(base);
}
# endif
constexpr x86::Mem getCPRef() const {
return x86::qword_ptr(E);
}
#endif
constexpr x86::Mem getSchedulerRegRef(int offset,
size_t size = sizeof(UWord)) const {
const int x_reg_offset =
offsetof(ErtsSchedulerRegisters, x_reg_array.d);
/* The entire aux_reg field should be addressable with an 8-bit
* displacement. */
ERTS_CT_ASSERT(x_reg_offset <= 128);
return x86::Mem(registers, offset - x_reg_offset, size);
}
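    /* In other words: `registers` points at x_reg_array rather than the start
     * of ErtsSchedulerRegisters, so a field at byte offset F within the
     * struct is addressed as [registers + (F - x_reg_offset)]. Together with
     * the assert above, this keeps the displacement for the aux_regs fields
     * laid out ahead of x_reg_array within the signed 8-bit range. */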
constexpr x86::Mem getFRef(int index, size_t size = sizeof(UWord)) const {
int base = offsetof(ErtsSchedulerRegisters, f_reg_array.d);
int offset = index * sizeof(FloatDef);
ASSERT(index >= 0 && index <= 1023);
return getSchedulerRegRef(base + offset, size);
}
constexpr x86::Mem getXRef(int index, size_t size = sizeof(UWord)) const {
int base = offsetof(ErtsSchedulerRegisters, x_reg_array.d);
int offset = index * sizeof(Eterm);
ASSERT(index >= 0 && index < ERTS_X_REGS_ALLOCATED);
return getSchedulerRegRef(base + offset, size);
}
constexpr x86::Mem getYRef(int index, size_t size = sizeof(UWord)) const {
ASSERT(index >= 0 && index <= 1023);
#ifdef NATIVE_ERLANG_STACK
return x86::Mem(E, index * sizeof(Eterm), size);
#else
return x86::Mem(E, (index + CP_SIZE) * sizeof(Eterm), size);
#endif
}
constexpr x86::Mem getCARRef(x86::Gp Src,
size_t size = sizeof(UWord)) const {
return x86::Mem(Src, -TAG_PRIMARY_LIST, size);
}
constexpr x86::Mem getCDRRef(x86::Gp Src,
size_t size = sizeof(UWord)) const {
return x86::Mem(Src, -TAG_PRIMARY_LIST + sizeof(Eterm), size);
}
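    /* A cons cell pointer is tagged with TAG_PRIMARY_LIST (0x1), so with the
     * tagged pointer in Src the head lives at [Src - 1] and the tail at
     * [Src - 1 + sizeof(Eterm)], i.e. [Src + 7] on x86-64. */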
void load_x_reg_array(x86::Gp reg) {
/* By definition. */
a.mov(reg, registers);
}
void load_erl_bits_state(x86::Gp reg) {
int offset =
offsetof(ErtsSchedulerRegisters, aux_regs.d.erl_bits_state);
a.lea(reg, getSchedulerRegRef(offset));
}
void emit_assert_redzone_unused() {
#ifdef HARD_DEBUG
const int REDZONE_BYTES = S_REDZONE * sizeof(Eterm);
Label next = a.newLabel();
/* We modify the stack pointer to avoid spilling into a register,
* TMP_MEM, or using the stack. */
a.sub(E, imm(REDZONE_BYTES));
a.cmp(HTOP, E);
a.add(E, imm(REDZONE_BYTES));
a.jbe(next);
a.ud2();
a.bind(next);
#endif
}
/*
* Calls an Erlang function.
*/
template<typename Any>
void erlang_call(Any Target, const x86::Gp &spill) {
#ifdef NATIVE_ERLANG_STACK
/* We use the Erlang stack as the native stack. We can use a
* native `call` instruction. */
emit_assert_erlang_stack();
emit_assert_redzone_unused();
aligned_call(Target);
#else
Label next = a.newLabel();
/* Save the return CP on the stack. */
a.lea(spill, x86::qword_ptr(next));
a.mov(getCPRef(), spill);
a.jmp(Target);
/* Need to align this label in order for it to be recognized as is_CP.
*/
a.align(kAlignCode, 8);
a.bind(next);
#endif
}
/*
     * Calls the given address in a shared fragment, ensuring that the
* redzone is unused and that the return address forms a valid
* CP.
*/
template<typename Any>
void fragment_call(Any Target) {
emit_assert_erlang_stack();
emit_assert_redzone_unused();
#if defined(HARD_DEBUG) && !defined(NATIVE_ERLANG_STACK)
/* Verify that the stack has not grown. */
Label next = a.newLabel();
a.cmp(x86::rsp, getInitialSPRef());
a.short_().je(next);
a.ud2();
a.bind(next);
#endif
aligned_call(Target);
}
/*
* Calls the given function pointer. In a debug build with
* HARD_DEBUG defined, it will be enforced that the redzone is
* unused.
*
     * The return address will NOT be aligned, and thus will not form a
     * valid CP. This means that the called code must not scan the stack
     * in any way: it must not, for example, throw an exception, do a
     * garbage collection, or cause a context switch.
*/
void safe_fragment_call(void (*Target)()) {
emit_assert_erlang_stack();
emit_assert_redzone_unused();
a.call(imm(Target));
}
template<typename FuncPtr>
void aligned_call(FuncPtr(*target)) {
/* Calls to absolute addresses (encoded in the address table) are
* always 6 bytes long. */
aligned_call(imm(target), 6);
}
void aligned_call(Label target) {
/* Relative calls are always 5 bytes long. */
aligned_call(target, 5);
}
template<typename OperandType>
void aligned_call(OperandType target) {
/* Other calls are variable size. While it would be nice to use this
* method for pointer/label calls too, `asmjit` writes relocations into
* the code buffer itself and overwriting them causes all kinds of
* havoc. */
size_t call_offset, call_size;
call_offset = a.offset();
a.call(target);
call_size = a.offset() - call_offset;
a.setOffset(call_offset);
aligned_call(target, call_size);
}
/* Calls the given address, ensuring that the return address forms a valid
* CP. */
template<typename OperandType>
void aligned_call(OperandType target, size_t size) {
/* The return address must be 8-byte aligned to form a valid CP, so
* we'll align according to the size of the call instruction. */
ssize_t next_address = (a.offset() + size);
if (next_address % 8) {
ssize_t nop_count = 8 - next_address % 8;
for (int i = 0; i < nop_count; i++) {
a.nop();
}
}
#ifdef HARD_DEBUG
/* TODO: When frame pointers are in place, assert (at runtime) that the
* destination has a `push rbp; mov rbp, rsp` sequence. */
#endif
a.call(target);
ASSERT((a.offset() % 8) == 0);
}
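    /* For example, a 5-byte relative call that would start at offset ...1
     * gives a next_address of ...6, so two nops are emitted first and the
     * call then ends exactly on an 8-byte boundary, making the pushed return
     * address a valid CP. */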
void runtime_call(x86::Gp func, unsigned args) {
ASSERT(args < 5);
emit_assert_runtime_stack();
#ifdef WIN32
a.sub(x86::rsp, imm(4 * sizeof(UWord)));
a.call(func);
a.add(x86::rsp, imm(4 * sizeof(UWord)));
#else
a.call(func);
#endif
}
template<typename T>
struct function_arity;
template<typename T, typename... Args>
struct function_arity<T(Args...)>
: std::integral_constant<int, sizeof...(Args)> {};
template<int expected_arity, typename T>
void runtime_call(T(*func)) {
static_assert(expected_arity == function_arity<T>());
emit_assert_runtime_stack();
#ifdef WIN32
unsigned pushed;
switch (expected_arity) {
case 6:
case 5:
/* We push ARG6 to keep the stack aligned even when we only have 5
* arguments. It does no harm, and is slightly more compact than
* sub/push/sub. */
a.push(ARG6);
a.push(ARG5);
a.sub(x86::rsp, imm(4 * sizeof(UWord)));
pushed = 6;
break;
default:
a.sub(x86::rsp, imm(4 * sizeof(UWord)));
pushed = 4;
}
#endif
a.call(imm(func));
#ifdef WIN32
a.add(x86::rsp, imm(pushed * sizeof(UWord)));
#endif
}
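    /* The 4 * sizeof(UWord) adjustment reserves the 32-byte "shadow space"
     * that the Microsoft x64 ABI requires the caller to provide. Arities 5
     * and 6 additionally pass their extra arguments on the stack, which is
     * why ARG5/ARG6 are pushed before the shadow space is carved out. */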
template<typename T>
void abs_jmp(T(*addr)) {
a.jmp(imm(addr));
}
/* Explicitly position-independent absolute jump, for use in fragments that
* need to be memcpy'd for performance reasons (e.g. export entries) */
template<typename T>
void pic_jmp(T(*addr)) {
a.mov(ARG6, imm(addr));
a.jmp(ARG6);
}
constexpr x86::Mem getArgRef(const ArgVal &val,
size_t size = sizeof(UWord)) const {
switch (val.getType()) {
case ArgVal::TYPE::l:
return getFRef(val.getValue(), size);
case ArgVal::TYPE::x:
return getXRef(val.getValue(), size);
case ArgVal::TYPE::y:
return getYRef(val.getValue(), size);
default:
ERTS_ASSERT(!"NYI");
return x86::Mem();
}
}
/* Discards a continuation pointer, including the frame pointer if
* applicable. */
void emit_discard_cp() {
emit_assert_erlang_stack();
a.add(x86::rsp, imm(CP_SIZE * sizeof(Eterm)));
}
void emit_assert_runtime_stack() {
#ifdef HARD_DEBUG
Label crash = a.newLabel(), next = a.newLabel();
        /* Are we 16-byte aligned? */
        a.test(E, imm(16 - 1));
        a.jne(crash);
# ifdef NATIVE_ERLANG_STACK
        /* Ensure that we are using the runtime stack. */
        int end_offs, start_offs;
        end_offs = offsetof(ErtsSchedulerRegisters, runtime_stack_end);
        start_offs = offsetof(ErtsSchedulerRegisters, runtime_stack_start);
        a.cmp(E, getSchedulerRegRef(end_offs));
        a.short_().jl(crash);
        a.cmp(E, getSchedulerRegRef(start_offs));
        a.short_().jle(next);
# else
        /* There is no runtime stack range to check; skip the crash pad when
         * the alignment check above succeeded. */
        a.short_().jmp(next);
# endif
a.bind(crash);
a.ud2();
a.bind(next);
#endif
}
void emit_assert_erlang_stack() {
#ifdef HARD_DEBUG
Label crash = a.newLabel(), next = a.newLabel();
/* Are we term-aligned? */
a.test(E, imm(sizeof(Eterm) - 1));
a.jne(crash);
a.cmp(E, x86::qword_ptr(c_p, offsetof(Process, heap)));
a.jl(crash);
a.cmp(E, x86::qword_ptr(c_p, offsetof(Process, hend)));
a.jle(next);
a.bind(crash);
        a.ud2();
a.bind(next);
#endif
}
enum Update : int {
eStack = (1 << 0),
eHeap = (1 << 1),
eReductions = (1 << 2),
eCodeIndex = (1 << 3)
};
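    /* Rough usage sketch (the callee name is made up for illustration):
     *
     *     emit_enter_runtime<Update::eReductions | Update::eHeap>();
     *     a.mov(ARG1, c_p);
     *     runtime_call<1>(some_c_helper);
     *     emit_leave_runtime<Update::eReductions | Update::eHeap>();
     *
     * The Spec flags name exactly the process state the callee may read or
     * write, so everything else can stay cached in registers. */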
template<int Spec = 0>
void emit_enter_runtime() {
emit_assert_erlang_stack();
ERTS_CT_ASSERT((Spec & (Update::eReductions | Update::eStack |
Update::eHeap)) == Spec);
#ifdef NATIVE_ERLANG_STACK
if (!(Spec & Update::eStack)) {
a.mov(E_saved, E);
}
#endif
if ((Spec & (Update::eHeap | Update::eStack)) ==
(Update::eHeap | Update::eStack)) {
/* To update both heap and stack we use sse instructions like gcc
-O3 does. Basically it is this function run through gcc -O3:
struct a { long a; long b; long c; };
void test(long a, long b, long c, struct a *s) {
s->a = a;
s->b = b;
s->c = c;
}
*/
ERTS_CT_ASSERT(offsetof(Process, stop) - offsetof(Process, htop) ==
8);
a.movq(x86::xmm0, HTOP);
a.movq(x86::xmm1, E);
if (Spec & Update::eReductions) {
a.mov(x86::qword_ptr(c_p, offsetof(Process, fcalls)), FCALLS);
}
a.punpcklqdq(x86::xmm0, x86::xmm1);
a.movups(x86::xmmword_ptr(c_p, offsetof(Process, htop)), x86::xmm0);
} else {
if ((Spec & Update::eStack)) {
a.mov(x86::qword_ptr(c_p, offsetof(Process, stop)), E);
}
if (Spec & Update::eHeap) {
a.mov(x86::qword_ptr(c_p, offsetof(Process, htop)), HTOP);
}
if (Spec & Update::eReductions) {
a.mov(x86::qword_ptr(c_p, offsetof(Process, fcalls)), FCALLS);
}
}
#ifdef NATIVE_ERLANG_STACK
a.lea(E, getRuntimeStackRef());
#else
/* Keeping track of stack alignment across shared fragments would be
* too much of a maintenance burden, so we stash and align the stack
* pointer at runtime instead. */
a.mov(getRuntimeStackRef(), x86::rsp);
a.sub(x86::rsp, imm(15));
a.and_(x86::rsp, imm(-16));
#endif
}
template<int Spec = 0>
void emit_leave_runtime() {
emit_assert_runtime_stack();
ERTS_CT_ASSERT((Spec & (Update::eReductions | Update::eStack |
Update::eHeap | Update::eCodeIndex)) == Spec);
#ifdef NATIVE_ERLANG_STACK
if (!(Spec & Update::eStack)) {
a.mov(E, E_saved);
}
#endif
if ((Spec & Update::eStack)) {
a.mov(E, x86::qword_ptr(c_p, offsetof(Process, stop)));
}
if (Spec & Update::eHeap) {
a.mov(HTOP, x86::qword_ptr(c_p, offsetof(Process, htop)));
}
if (Spec & Update::eReductions) {
a.mov(FCALLS, x86::qword_ptr(c_p, offsetof(Process, fcalls)));
}
if (Spec & Update::eCodeIndex) {
/* Updates the local copy of the active code index, retaining
* save_calls if active. */
a.mov(ARG1, imm(&the_active_code_index));
a.mov(ARG1d, x86::dword_ptr(ARG1));
a.cmp(active_code_ix, imm(ERTS_SAVE_CALLS_CODE_IX));
a.cmovne(active_code_ix, ARG1);
}
#if !defined(NATIVE_ERLANG_STACK)
/* Restore the unaligned stack pointer we saved on enter. */
a.mov(x86::rsp, getRuntimeStackRef());
#endif
}
void emit_is_boxed(Label Fail, x86::Gp Src, Distance dist = dLong) {
/* Use the shortest possible instruction depending on the source
* register. */
if (Src == x86::rax || Src == x86::rdi || Src == x86::rsi ||
Src == x86::rcx || Src == x86::rdx) {
a.test(Src.r8(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
} else {
a.test(Src.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_BOXED));
}
if (dist == dShort) {
a.short_().jne(Fail);
} else {
a.jne(Fail);
}
}
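    /* TAG_PRIMARY_BOXED is 0x2 and _TAG_PRIMARY_MASK is 0x3, so the tested
     * constant is 0x1: bit 0 is clear for boxed pointers but set for lists
     * and immediates, which is why a single `test` is enough here. */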
x86::Gp emit_ptr_val(x86::Gp Dst, x86::Gp Src) {
#if !defined(TAG_LITERAL_PTR)
return Src;
#else
if (Dst != Src) {
a.mov(Dst, Src);
}
/* We intentionally skip TAG_PTR_MASK__ here, as we want to use
* plain `emit_boxed_val` when we know the argument can't be a literal,
* such as in bit-syntax matching.
*
* This comes at very little cost as `emit_boxed_val` nearly always has
* a displacement. */
a.and_(Dst, imm(~TAG_LITERAL_PTR));
return Dst;
#endif
}
constexpr x86::Mem emit_boxed_val(x86::Gp Src,
int32_t bytes = 0,
size_t size = sizeof(UWord)) const {
ASSERT(bytes % sizeof(Eterm) == 0);
return x86::Mem(Src, bytes - TAG_PRIMARY_BOXED, size);
}
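    /* E.g. emit_boxed_val(boxed_ptr) addresses the header word of the boxed
     * object (the tagged pointer minus TAG_PRIMARY_BOXED, i.e. [ptr - 2]),
     * and emit_boxed_val(boxed_ptr, sizeof(Eterm)) the word after it. */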
void emit_test_the_non_value(x86::Gp Reg) {
if (THE_NON_VALUE == 0) {
a.test(Reg.r32(), Reg.r32());
} else {
a.cmp(Reg, imm(THE_NON_VALUE));
}
}
public:
void embed_rodata(char *labelName, const char *buff, size_t size);
void embed_bss(char *labelName, size_t size);
void embed_zeros(size_t size);
void setLogger(std::string log) {
FILE *f = fopen(log.data(), "w+");
/* FIXME: Don't crash when loading multiple modules with the same name.
*
* setLogger(nullptr) disables logging. */
if (f) {
setvbuf(f, NULL, _IONBF, 0);
}
setLogger(f);
}
void setLogger(FILE *log) {
logger.setFile(log);
logger.setIndentation(FormatOptions::kIndentationCode, 4);
code.setLogger(&logger);
}
template<typename... Ts>
void comment(const char *format, Ts... args) {
if (logger.file()) {
char buff[1024];
erts_snprintf(buff, sizeof(buff), format, args...);
a.commentf("# %s", buff);
}
}
struct AsmRange {
BeamInstr *start;
BeamInstr *stop;
std::string name;
/* Not used yet */
std::string file;
unsigned line;
};
void update_gdb_jit_info(std::string modulename,
std::vector<AsmRange> &functions);
void embed(void *data, uint32_t size) {
a.embed((char *)data, size);
}
};
class BeamGlobalAssembler : public BeamAssembler {
typedef void (BeamGlobalAssembler::*emitFptr)(void);
typedef void (*fptr)(void);
/* Please keep this in alphabetical order. */
#define BEAM_GLOBAL_FUNCS(_) \
_(arith_compare_shared) \
_(arith_eq_shared) \
_(bif_nif_epilogue) \
_(bif_element_shared) \
_(bs_add_shared) \
_(bs_size_check_shared) \
_(bs_fixed_integer_shared) \
_(bs_get_tail_shared) \
_(call_bif_shared) \
_(call_error_handler_shared) \
_(call_light_bif_shared) \
_(call_nif_early) \
_(call_nif_shared) \
_(catch_end_shared) \
_(dispatch_bif) \
_(dispatch_nif) \
_(dispatch_return) \
_(dispatch_save_calls) \
_(error_action_code) \
_(garbage_collect) \
_(generic_bp_global) \
_(generic_bp_local) \
_(debug_bp) \
_(handle_error_shared_prologue) \
_(handle_error_shared) \
_(handle_element_error) \
_(handle_hd_error) \
_(i_band_body_shared) \
_(i_band_guard_shared) \
_(i_bif_body_shared) \
_(i_bif_guard_shared) \
_(i_bor_body_shared) \
_(i_bor_guard_shared) \
_(i_bnot_body_shared) \
_(i_bnot_guard_shared) \
_(i_bsl_guard_shared) \
_(i_bsl_body_shared) \
_(i_bsr_guard_shared) \
_(i_bsr_body_shared) \
_(i_bxor_body_shared) \
_(i_bxor_guard_shared) \
_(i_func_info_shared) \
_(i_load_nif_shared) \
_(i_length_guard_shared) \
_(i_length_body_shared) \
_(i_loop_rec_shared) \
_(i_new_small_map_lit_shared) \
_(i_select_val_bins_shared) \
_(i_test_yield_shared) \
_(increment_body_shared) \
_(int_div_rem_body_shared) \
_(int_div_rem_guard_shared) \
_(minus_body_shared) \
_(minus_guard_shared) \
_(new_map_shared) \
_(plus_body_shared) \
_(plus_guard_shared) \
_(process_main) \
_(times_body_shared) \
_(times_guard_shared) \
_(update_map_assoc_shared) \
_(update_map_exact_guard_shared) \
_(update_map_exact_body_shared)
/* Labels exported from within process_main */
#define PROCESS_MAIN_LABELS(_) \
_(context_switch) \
_(context_switch_simplified) \
_(do_schedule)
#define DECL_ENUM(NAME) NAME,
enum GlobalLabels : uint32_t {
BEAM_GLOBAL_FUNCS(DECL_ENUM) PROCESS_MAIN_LABELS(DECL_ENUM)
};
#undef DECL_ENUM
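    /* For instance, BEAM_GLOBAL_FUNCS(DECL_ENUM) above expands to
     * `arith_compare_shared, arith_eq_shared, ...`, so every shared fragment
     * gets an enum value here, an emit_<name>() member via DECL_FUNC below,
     * and a get_<name>() accessor via GET_CODE further down. */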
static const std::map<GlobalLabels, emitFptr> emitPtrs;
static const std::map<GlobalLabels, std::string> labelNames;
std::unordered_map<GlobalLabels, Label> labels;
std::unordered_map<GlobalLabels, fptr> ptrs;
#define DECL_FUNC(NAME) void emit_##NAME(void);
BEAM_GLOBAL_FUNCS(DECL_FUNC);
#undef DECL_FUNC
template<typename T>
void emit_bitwise_fallback_body(T(*func_ptr), const ErtsCodeMFA *mfa);
template<typename T>
void emit_bitwise_fallback_guard(T(*func_ptr));
x86::Mem emit_i_length_common(Label fail, int state_size);
void emit_handle_error();
public:
BeamGlobalAssembler();
void (*get(GlobalLabels lbl))(void) {
ASSERT(ptrs[lbl]);
return ptrs[lbl];
}
#define GET_CODE(NAME) \
void (*get_##NAME(void))() { \
return get(NAME); \
}
BEAM_GLOBAL_FUNCS(GET_CODE)
PROCESS_MAIN_LABELS(GET_CODE)
#undef GET_CODE
};
class BeamModuleAssembler : public BeamAssembler {
typedef unsigned BeamLabel;
/* Map of label number to asmjit Label */
typedef std::unordered_map<BeamLabel, Label> LabelMap;
LabelMap labels;
struct patch {
Label where;
int64_t ptr_offs;
int64_t val_offs;
};
struct patch_catch {
struct patch patch;
Label handler;
};
std::vector<struct patch_catch> catches;
/* Map of import entry to patch labels and mfa */
struct patch_import {
std::vector<struct patch> patches;
ErtsCodeMFA mfa;
};
typedef std::unordered_map<unsigned, struct patch_import> ImportMap;
ImportMap imports;
/* Map of fun entry to patch labels */
struct patch_lambda {
std::vector<struct patch> patches;
ErlFunEntry fe;
};
typedef std::unordered_map<unsigned, struct patch_lambda> LambdaMap;
LambdaMap lambdas;
/* Map of literals to patch labels */
struct patch_literal {
std::vector<struct patch> patches;
};
typedef std::unordered_map<unsigned, struct patch_literal> LiteralMap;
LiteralMap literals;
/* All string patches */
std::vector<struct patch> strings;
/* All functions that have been seen so far */
std::vector<BeamLabel> functions;
BeamGlobalAssembler *ga;
/* Used by emit to populate the labelToMFA map */
Label currLabel;
unsigned prev_op = 0;
Label codeHeader;
Label funcInfo;
Label funcYield;
Label genericBPTramp;
Label on_load;
Label floatMax;
Label floatSignMask;
Eterm mod;
/* Save the last PC for an error. */
size_t last_error_offset = 0;
public:
BeamModuleAssembler(BeamGlobalAssembler *ga,
Eterm mod,
unsigned num_labels);
BeamModuleAssembler(BeamGlobalAssembler *ga,
Eterm mod,
unsigned num_labels,
unsigned num_functions);
bool emit(unsigned op, const std::vector<ArgVal> &args);
void *codegen(BeamCodeHeader *in_hdr, BeamCodeHeader **out_hdr);
void *codegen(void);
void codegen(char *buff, size_t len);
BeamInstr *getCode(unsigned label);
void *getCode(Label label) {
return BeamAssembler::getCode(label);
}
byte *getCode(char *labelName) {
return BeamAssembler::getCode(labelName);
}
Label embed_vararg_rodata(const std::vector<ArgVal> &args, int y_offset);
unsigned getCodeSize() {
ASSERT(code.hasBaseAddress());
return code.codeSize();
}
void copyCodeHeader(BeamCodeHeader *hdr);
BeamCodeHeader *getCodeHeader(void);
BeamInstr *getOnLoad(void);
unsigned patchCatches();
void patchLambda(unsigned index, BeamInstr I);
void patchLiteral(unsigned index, Eterm lit);
void patchImport(unsigned index, BeamInstr I);
void patchStrings(byte *string);
void emit_call_bif_export(void *fptr);
private:
/* Helpers */
void emit_gc_test(const ArgVal &Stack,
const ArgVal &Heap,
const ArgVal &Live);
void emit_gc_test_preserve(const ArgVal &Need,
const ArgVal &Live,
x86::Gp term);
x86::Mem emit_setup_export(const ArgVal &Exp);
x86::Gp emit_variable_apply(bool includeI);
x86::Gp emit_fixed_apply(const ArgVal &arity, bool includeI);
x86::Gp emit_call_fun(const ArgVal &Fun);
x86::Gp emit_apply_fun(void);
void emit_is_binary(Label Fail, x86::Gp Src, Label next, Label subbin);
void emit_get_list(const x86::Gp boxed_ptr,
const ArgVal &Hd,
const ArgVal &Tl);
void emit_div_rem(const ArgVal &Fail,
const ArgVal &LHS,
const ArgVal &RHS,
const ErtsCodeMFA *error_mfa);
void emit_setup_guard_bif(const std::vector<ArgVal> &args,
const ArgVal &bif);
void emit_bif_arg_error(std::vector<ArgVal> args, const ErtsCodeMFA *mfa);
void emit_error(int code);
x86::Mem emit_bs_get_integer_prologue(Label next,
Label fail,
int flags,
int size);
int emit_bs_get_field_size(const ArgVal &Size,
int unit,
Label Fail,
const x86::Gp &out,
unsigned max_size = 0);
void emit_bs_get_utf8(const ArgVal &Ctx, const ArgVal &Fail);
void emit_bs_get_utf16(const ArgVal &Ctx,
const ArgVal &Fail,
const ArgVal &Flags);
void emit_handle_error();
void emit_handle_error(const ErtsCodeMFA *exp);
void emit_handle_error(Label I, const ErtsCodeMFA *exp);
void emit_validate(const ArgVal &arity);
void emit_bs_skip_bits(const ArgVal &Fail, const ArgVal &Ctx);
void emit_linear_search(x86::Gp val,
const ArgVal &Fail,
const std::vector<ArgVal> &args);
void emit_check_float(Label next, x86::Xmm value);
void emit_is_small(Label fail, x86::Gp Reg);
void emit_is_both_small(Label fail, x86::Gp A, x86::Gp B);
void emit_validate_unicode(Label next, Label fail, x86::Gp value);
void emit_bif_is_eq_ne_exact_immed(const ArgVal &Src,
const ArgVal &Immed,
const ArgVal &Dst,
Eterm fail_value,
Eterm succ_value);
void emit_proc_lc_unrequire(void);
void emit_proc_lc_require(void);
void emit_nyi(const char *msg);
void emit_nyi(void);
#ifdef DEBUG
void emit_tuple_assertion(const ArgVal &Src, x86::Gp tuple_reg);
#endif
#include "beamasm_protos.h"
void make_move_patch(x86::Gp to,
std::vector<struct patch> &patches,
int64_t offset = 0) {
const int MOV_IMM64_PAYLOAD_OFFSET = 2;
Label lbl = a.newLabel();
a.bind(lbl);
a.long_().mov(to, imm(LLONG_MAX));
patches.push_back({lbl, MOV_IMM64_PAYLOAD_OFFSET, offset});
}
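    /* `a.long_().mov(to, imm(LLONG_MAX))` forces the full 10-byte
     * `REX.W + B8+rd imm64` encoding, so the 64-bit payload starts 2 bytes
     * after the bound label (MOV_IMM64_PAYLOAD_OFFSET). The patchLiteral /
     * patchImport / patchLambda passes are what later replace the LLONG_MAX
     * placeholder with the real value. */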
void make_word_patch(std::vector<struct patch> &patches) {
Label lbl = a.newLabel();
UWord word = LLONG_MAX;
a.bind(lbl);
a.embed(reinterpret_cast<char *>(&word), sizeof(word));
patches.push_back({lbl, 0, 0});
}
template<typename A, typename B>
void mov_arg(A to, B from) {
/* We can't move to or from Y registers when we're on the runtime
* stack, so we'll conservatively disallow all mov_args in the hopes of
* finding such bugs sooner. */
emit_assert_erlang_stack();
mov_arg(to, from, ARG1);
}
template<typename T>
void cmp_arg(T oper, const ArgVal &val) {
cmp_arg(oper, val, ARG1);
}
void cmp_arg(x86::Mem mem, const ArgVal &val, const x86::Gp &spill) {
/* Note that the cast to Sint is necessary to handle negative numbers
* such as NIL. */
if (val.isImmed() && Support::isInt32((Sint)val.getValue())) {
a.cmp(mem, imm(val.getValue()));
} else {
mov_arg(spill, val);
a.cmp(mem, spill);
}
}
void cmp_arg(x86::Gp gp, const ArgVal &val, const x86::Gp &spill) {
if (val.isImmed() && Support::isInt32((Sint)val.getValue())) {
a.cmp(gp, imm(val.getValue()));
} else {
mov_arg(spill, val);
a.cmp(gp, spill);
}
}
/* Note: May clear flags. */
void mov_arg(x86::Gp to, const ArgVal &from, const x86::Gp &spill) {
if (from.isMem()) {
a.mov(to, getArgRef(from));
} else if (from.isLiteral()) {
make_move_patch(to, literals[from.getValue()].patches);
} else {
mov_imm(to, from.getValue());
}
}
void mov_arg(x86::Mem to, const ArgVal &from, const x86::Gp &spill) {
if (from.isImmed()) {
if (Support::isInt32((Sint)from.getValue())) {
a.mov(to, imm(from.getValue()));
} else {
a.mov(spill, imm(from.getValue()));
a.mov(to, spill);
}
} else {
mov_arg(spill, from);
a.mov(to, spill);
}
}
void mov_arg(const ArgVal &to, x86::Gp from, const x86::Gp &spill) {
(void)spill;
a.mov(getArgRef(to), from);
}
void mov_arg(const ArgVal &to, BeamInstr from, const x86::Gp &spill) {
if (Support::isInt32((Sint)from)) {
a.mov(getArgRef(to), imm(from));
} else {
a.mov(spill, imm(from));
mov_arg(to, spill);
}
}
void mov_arg(const ArgVal &to, const ArgVal &from, const x86::Gp &spill) {
if (from.isMem()) {
mov_arg(spill, from);
mov_arg(to, spill);
} else {
mov_arg(getArgRef(to), from);
}
}
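    /* For example, mov_arg(ArgVal(ArgVal::y, 0), ArgVal(ArgVal::x, 1)) copies
     * x(1) into y(0) via a spill register (ARG1 by default), since x86 has no
     * memory-to-memory move. */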
};
void beamasm_update_perf_info(std::string modulename,
std::vector<BeamAssembler::AsmRange> &ranges);