/*
+----------------------------------------------------------------------+
| HipHop for PHP |
+----------------------------------------------------------------------+
| Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) |
+----------------------------------------------------------------------+
| This source file is subject to version 3.01 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available through the world-wide-web at the following url: |
| http://www.php.net/license/3_01.txt |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
*/
/*
 * HHVM's ARM64 backend uses an early-truncation policy.
* That means that:
*
 * A Vreg8  is a W-register holding a zero-extended u8 value.
 * A Vreg16 is a W-register holding a zero-extended u16 value.
 * A Vreg32 is a W-register holding a u32 value.
 * A Vreg64 is an X-register holding a u64 value.
*
 * This allows the backend to omit truncation instructions for
 * sub-32-bit operations. E.g. testb{Vreg8 s0, Vreg8 s1} would
 * otherwise have to truncate s0 and s1 before emitting a tst
 * instruction. With the early-truncation policy the testb{} emitter
 * can rely on the fact that s0 and s1 are already truncated and can
 * emit the tst instruction without preceding uxtb's.
*
 * Conversely, any arithmetic instruction has to sign extend a Vreg8
 * before operating on it. Vasm is light on such instructions; currently
 * only csinc[bw]{} and cmp[bw][i]{} are affected.
*
 * Early truncation also has consequences for the extension/truncation
 * vasm instructions. The following list shows how they are implemented:
*
 *   movzbw: Vreg8  -> Vreg16: mov w0, w0   # nop if s==d
 *   movzbl: Vreg8  -> Vreg32: mov w0, w0   # nop if s==d
 *   movzbq: Vreg8  -> Vreg64: uxtb x0, x0
 *   movzwl: Vreg16 -> Vreg32: mov w0, w0   # nop if s==d
 *   movzwq: Vreg16 -> Vreg64: uxth x0, x0
 *   movzlq: Vreg32 -> Vreg64: uxtw x0, x0
 *   movtqb: Vreg64 -> Vreg8:  uxtb w0, w0
 *   movtql: Vreg64 -> Vreg32: uxtw w0, w0
*
 * Early truncation also implies that an instruction has to truncate
 * after performing its actual operation if it cannot guarantee that
 * the result fits the destination VregN. E.g. the emitter for
 * andbi{Immed imm, Vreg8 s, Vreg8 d} has to truncate the result to
 * guarantee that register d indeed holds a u8 value.
*
 * Note that the early-truncation policy enables aarch64-specific
 * optimizations which are not relevant on other architectures.
 * E.g. x86_64 does not need this policy, as its ISA allows direct
 * sub-register access for Vreg8, Vreg16, Vreg32 and Vreg64
 * (e.g. AL, AX, EAX, RAX).
*
* The early-truncation policy relies on the following
* requirements of the Vreg type-system:
*
* * All VregNs are created for values of up to N bits
* * All conversions between VregNs are done via movz/movt vasm instructions
*/
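/*
 * Illustrative sketch of the policy (register numbers are placeholders,
 * chosen here only for the example): a byte test such as
 * testb{Vreg8 s0, Vreg8 s1, sf} can be emitted directly as
 *
 *   tst w1, w0            # s0 and s1 already hold truncated u8 values
 *
 * with no preceding uxtb's, whereas widening a byte to 64 bits still
 * needs an explicit extension, e.g. movzbq{Vreg8 s, Vreg64 d}:
 *
 *   uxtb x0, x0
 */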
#include "hphp/runtime/vm/jit/vasm-emit.h"
#include "hphp/runtime/vm/jit/abi-arm.h"
#include "hphp/runtime/vm/jit/ir-instruction.h"
#include "hphp/runtime/vm/jit/print.h"
#include "hphp/runtime/vm/jit/service-requests.h"
#include "hphp/runtime/vm/jit/smashable-instr-arm.h"
#include "hphp/runtime/vm/jit/timer.h"
#include "hphp/runtime/vm/jit/vasm-gen.h"
#include "hphp/runtime/vm/jit/vasm.h"
#include "hphp/runtime/vm/jit/vasm-instr.h"
#include "hphp/runtime/vm/jit/vasm-internal.h"
#include "hphp/runtime/vm/jit/vasm-lower.h"
#include "hphp/runtime/vm/jit/vasm-print.h"
#include "hphp/runtime/vm/jit/vasm-reg.h"
#include "hphp/runtime/vm/jit/vasm-unit.h"
#include "hphp/runtime/vm/jit/vasm-util.h"
#include "hphp/runtime/vm/jit/vasm-visit.h"
#include "hphp/vixl/a64/macro-assembler-a64.h"
TRACE_SET_MOD(vasm);
namespace HPHP::jit {
///////////////////////////////////////////////////////////////////////////////
using namespace arm;
using namespace vixl;
namespace arm { struct ImmFolder; }
namespace {
///////////////////////////////////////////////////////////////////////////////
static_assert(folly::kIsLittleEndian,
"Code contains little-endian specific optimizations.");
vixl::Register X(Vreg64 r) {
PhysReg pr(r.asReg());
return x2a(pr);
}
vixl::Register W(Vreg64 r) {
PhysReg pr(r.asReg());
return x2a(pr).W();
}
vixl::Register W(Vreg32 r) {
PhysReg pr(r.asReg());
return x2a(pr).W();
}
vixl::Register W(Vreg16 r) {
PhysReg pr(r.asReg());
return x2a(pr).W();
}
vixl::Register W(Vreg8 r) {
PhysReg pr(r.asReg());
return x2a(pr).W();
}
vixl::FPRegister D(Vreg r) {
return x2f(r);
}
vixl::VRegister V(Vreg r) {
return x2v(r);
}
uint8_t Log2(uint8_t value) {
switch (value) {
case 1:
return 0;
case 2:
return 1;
case 4:
return 2;
case 8:
return 3;
default:
always_assert(false);
}
}
vixl::MemOperand M(Vptr p) {
assertx(p.base.isValid());
if (p.index.isValid()) {
assertx(p.disp == 0);
return MemOperand(X(p.base), X(p.index), LSL, Log2(p.scale));
}
return MemOperand(X(p.base), p.disp);
}
vixl::Condition C(ConditionCode cc) {
return arm::convertCC(cc);
}
/*
* Uses the flags from the Vinstr which defs SF to determine
* whether or not the Vixl assembler should emit code which
* sets the status flags.
*/
vixl::FlagsUpdate UF(Vflags flags) {
return flags ? SetFlags : LeaveFlags;
}
/*
 * There are numerous ARM instructions that don't set status flags, and
 * therefore those flags must be set synthetically in the emitters. This
 * assertion is used in emitters that can only set a subset of the status
 * flags, to verify that the subset covers every flag required by the
 * Vinstr which defs SF. The flags field of the Vinstr describes which
 * bits are required; those are compared against the bits the
 * implementation actually sets.
 */
template<class Inst> void checkSF(const Inst& i, StatusFlags s) {
Vflags required = i.fl;
Vflags set = static_cast<Vflags>(s);
always_assert_flog((required & set) == required,
"should def SF but does not: {}\n",
vinst_names[Vinstr(i).op]);
}
template<class Inst> void checkSF(const Inst& i) {
checkSF(i, StatusFlags::None);
}
/*
* Returns true if the queried flag(s) is in the set of required flags.
*/
bool flagRequired(Vflags flags, StatusFlags flag) {
return (flags & static_cast<Vflags>(flag));
}
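/*
 * A minimal sketch of how an emitter combines these helpers when it has
 * to synthesize flags (see the imul{} emitter below for the real thing):
 *
 *   if (i.fl) {                                 // some flags are required
 *     checkSF(i, StatusFlags::NotC);            // we can set all but C
 *     if (flagRequired(i.fl, StatusFlags::V)) {
 *       // emit the extra instructions needed to compute V
 *     }
 *   }
 */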
///////////////////////////////////////////////////////////////////////////////
struct Vgen {
explicit Vgen(Venv& env)
: env(env)
, assem(*env.cb)
, a(&assem)
, base(a->frontier())
, current(env.current)
, next(env.next)
, jmps(env.jmps)
, jccs(env.jccs)
, catches(env.catches)
{}
~Vgen() {
env.cb->sync(base);
}
static void emitVeneers(Venv& env);
static void handleLiterals(Venv& env);
static void retargetBinds(Venv& env);
static void patch(Venv& env);
static void pad(CodeBlock& cb) {
vixl::MacroAssembler a { cb };
auto const begin = cb.frontier();
while (cb.available() >= 4) a.Brk(1);
assertx(cb.available() == 0);
cb.sync(begin);
}
/////////////////////////////////////////////////////////////////////////////
template<class Inst> void emit(const Inst& i) {
always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
vinst_names[Vinstr(i).op], size_t(current));
}
// intrinsics
void emit(const copy& i);
void emit(const copy2& i);
void emit(const debugtrap& /*i*/) { a->Brk(0); }
void emit(const fallthru& /*i*/);
void emit(const killeffects& /*i*/) {}
void emit(const ldimmb& i);
void emit(const ldimml& i);
void emit(const ldimmq& i);
void emit(const ldimmw& i);
void emit(const ldundefq& /*i*/) {}
void emit(const load& i);
void emit(const store& i);
void emit(const mcprep& i);
// native function abi
void emit(const call& i);
void emit(const callr& i) { a->Blr(X(i.target)); }
void emit(const calls& i);
void emit(const ret& /*i*/) { a->Ret(); }
// stub function abi
void emit(const callstub& i);
void emit(const callfaststub& i);
// php function abi
void emit(const callphp& i) {
emit(call{i.target, i.args});
setCallFuncId(env, a->frontier());
}
void emit(const callphpr& i) {
emit(callr{i.target, i.args});
setCallFuncId(env, a->frontier());
}
void emit(const contenter& i);
void emit(const phpret& i);
// vm entry abi
void emit(const inittc& /*i*/) {}
void emit(const leavetc& i);
// exceptions
void emit(const landingpad& /*i*/) {}
void emit(const nothrow& i);
void emit(const syncpoint& i);
void emit(const unwind& i);
// instructions
void emit(const absdbl& i) { a->Fabs(D(i.d), D(i.s)); }
void emit(const addl& i) { a->Add(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
void emit(const addli& i) { a->Add(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
void emit(const addq& i) { a->Add(X(i.d), X(i.s1), X(i.s0), UF(i.fl));}
void emit(const addqi& i) { a->Add(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
void emit(const addsd& i) { a->Fadd(D(i.d), D(i.s1), D(i.s0)); }
void emit(const andb& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
void emit(const andbi& i) { a->And(W(i.d), W(i.s1), i.s0.ub(), UF(i.fl)); }
void emit(const andw& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
void emit(const andwi& i) { a->And(W(i.d), W(i.s1), i.s0.uw(), UF(i.fl)); }
void emit(const andl& i) { a->And(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
void emit(const andli& i) { a->And(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
void emit(const andq& i) { a->And(X(i.d), X(i.s1), X(i.s0), UF(i.fl)); }
void emit(const andqi& i) { a->And(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
void emit(const andqi64& i) { a->And(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
void emit(const cmovb& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const cmovw& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const cmovl& i) { a->Csel(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const cmovq& i) { a->Csel(X(i.d), X(i.t), X(i.f), C(i.cc)); }
// note: cmp{bw}[i] are emitted only for narrow comparisons and _do not_ sign
// extend their arguments--these instructions are lowered to cmp{lq}[i] if
// the comparison is not narrow or not equality/inequality
void emit(const cmpb& i) { a->Cmp(W(i.s1), W(i.s0)); }
void emit(const cmpbi& i) { a->Cmp(W(i.s1), static_cast<uint8_t>(i.s0.b())); }
void emit(const cmpw& i) { a->Cmp(W(i.s1), W(i.s0)); }
void emit(const cmpwi& i) { a->Cmp(W(i.s1), static_cast<uint16_t>(i.s0.w())); }
void emit(const cmpl& i) { a->Cmp(W(i.s1), W(i.s0)); }
void emit(const cmpli& i) { a->Cmp(W(i.s1), i.s0.l()); }
void emit(const cmpq& i) { a->Cmp(X(i.s1), X(i.s0)); }
void emit(const cmpqi& i) { a->Cmp(X(i.s1), i.s0.q()); }
void emit(const cmpsd& i);
  // TODO(CDE): csinc[bw]{} should a) sign extend and b) set SF for overflow
void emit(const csincb& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const csincw& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const csincl& i) { a->Csinc(W(i.d), W(i.t), W(i.f), C(i.cc)); }
void emit(const csincq& i) { a->Csinc(X(i.d), X(i.t), X(i.f), C(i.cc)); }
void emit(const cvtsi2sd& i) { a->Scvtf(D(i.d), X(i.s)); }
void emit(const decl& i) { a->Sub(W(i.d), W(i.s), 1, UF(i.fl)); }
void emit(const decq& i) { a->Sub(X(i.d), X(i.s), 1, UF(i.fl)); }
void emit(const decqmlock& i);
void emit(const divint& i) { a->Sdiv(X(i.d), X(i.s0), X(i.s1)); }
void emit(const divsd& i) { a->Fdiv(D(i.d), D(i.s1), D(i.s0)); }
void emit(const imul& i);
void emit(const incl& i) { a->Add(W(i.d), W(i.s), 1, UF(i.fl)); }
void emit(const incq& i) { a->Add(X(i.d), X(i.s), 1, UF(i.fl)); }
void emit(const incw& i) { a->Add(W(i.d), W(i.s), 1, UF(i.fl)); }
void emit(const jcc& i);
void emit(const jcci& i);
void emit(const jmp& i);
void emit(const jmpi& i);
void emit(const jmpr& i) { a->Br(X(i.target)); }
void emit(const ldbindretaddr& i);
void emit(const lea& i);
void emit(const leap& i);
void emit(const leav& i);
void emit(const lead& i);
void emit(const loadb& i) { a->Ldrb(W(i.d), M(i.s)); }
void emit(const loadl& i) { a->Ldr(W(i.d), M(i.s)); }
void emit(const loadsd& i) { a->Ldr(D(i.d), M(i.s)); }
void emit(const loadtqb& i) { a->Ldrb(W(i.d), M(i.s)); }
void emit(const loadtql& i) { a->Ldr(W(i.d), M(i.s)); }
void emit(const loadups& i);
void emit(const loadw& i) { a->Ldrh(W(i.d), M(i.s)); }
void emit(const loadzbl& i) { a->Ldrb(W(i.d), M(i.s)); }
void emit(const loadzbq& i) { a->Ldrb(W(i.d), M(i.s)); }
void emit(const loadsbq& i) { a->Ldrsb(X(i.d), M(i.s)); }
void emit(const loadsbl& i) { a->Ldrsb(W(i.d), M(i.s)); }
void emit(const loadzwq& i) { a->Ldrh(W(i.d), M(i.s)); }
void emit(const loadzlq& i) { a->Ldr(W(i.d), M(i.s)); }
void emit(const movb& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
void emit(const movw& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
void emit(const movl& i) { if (i.d != i.s) a->Mov(W(i.d), W(i.s)); }
void emit(const movsbl& i) { a->Sxtb(W(i.d), W(i.s)); }
void emit(const movsbq& i) { a->Sxtb(X(i.d), W(i.s).X()); }
void emit(const movswl& i) { a->Sxth(W(i.d), W(i.s)); }
void emit(const movtqb& i) { a->Uxtb(W(i.d), W(i.s)); }
void emit(const movtqw& i) { a->Uxth(W(i.d), W(i.s)); }
void emit(const movtql& i) { a->Uxtw(W(i.d), W(i.s)); }
void emit(const movzbq& i) { a->Uxtb(X(i.d), W(i.s).X()); }
void emit(const movzwq& i) { a->Uxth(X(i.d), W(i.s).X()); }
void emit(const movzlq& i) { a->Uxtw(X(i.d), W(i.s).X()); }
void emit(const mulsd& i) { a->Fmul(D(i.d), D(i.s1), D(i.s0)); }
void emit(const neg& i) { a->Neg(X(i.d), X(i.s), UF(i.fl)); }
void emit(const nop& /*i*/) { a->Nop(); }
void emit(const notb& i) { a->Mvn(W(i.d), W(i.s)); }
void emit(const not& i) { a->Mvn(X(i.d), X(i.s)); }
void emit(const orbi& i);
void emit(const orq& i);
void emit(const orwi& i);
void emit(const orli& i);
void emit(const orqi& i);
void emit(const pop& i);
void emit(const popp& i);
void emit(const push& i);
void emit(const pushp& i);
void emit(const roundsd& i);
void emit(const sar& i);
void emit(const sarqi& i);
void emit(const setcc& i) { a->Cset(W(i.d), C(i.cc)); }
void emit(const shl& i);
void emit(const shlli& i);
void emit(const shlqi& i);
void emit(const shrli& i);
void emit(const shrqi& i);
void emit(const sqrtsd& i) { a->Fsqrt(D(i.d), D(i.s)); }
void emit(const srem& i);
void emit(const storeb& i) { a->Strb(W(i.s), M(i.m)); }
void emit(const storel& i) { a->Str(W(i.s), M(i.m)); }
void emit(const storesd& i) { emit(store{i.s, i.m}); }
void emit(const storeups& i);
void emit(const storew& i) { a->Strh(W(i.s), M(i.m)); }
void emit(const subl& i) { a->Sub(W(i.d), W(i.s1), W(i.s0), UF(i.fl)); }
void emit(const subli& i) { a->Sub(W(i.d), W(i.s1), i.s0.l(), UF(i.fl)); }
void emit(const subq& i) { a->Sub(X(i.d), X(i.s1), X(i.s0), UF(i.fl)); }
void emit(const subqi& i) { a->Sub(X(i.d), X(i.s1), i.s0.q(), UF(i.fl)); }
void emit(const subsd& i) { a->Fsub(D(i.d), D(i.s1), D(i.s0)); }
  void emit(const testb& i) { a->Tst(W(i.s1), W(i.s0)); }
  void emit(const testbi& i) { a->Tst(W(i.s1), i.s0.ub()); }
  void emit(const testw& i) { a->Tst(W(i.s1), W(i.s0)); }
  void emit(const testwi& i) { a->Tst(W(i.s1), i.s0.uw()); }
void emit(const testl& i) { a->Tst(W(i.s1), W(i.s0)); }
void emit(const testli& i) { a->Tst(W(i.s1), i.s0.l()); }
void emit(const testq& i) { a->Tst(X(i.s1), X(i.s0)); }
void emit(const testqi& i) { a->Tst(X(i.s1), i.s0.q()); }
void emit(const trap& /*i*/);
void emit(const ucomisd& i) { a->Fcmp(D(i.s0), D(i.s1)); }
void emit(const unpcklpd&);
void emit(const xorb& i);
void emit(const xorbi& i);
void emit(const xorw& i);
void emit(const xorwi& i);
void emit(const xorl& i);
void emit(const xorq& i);
void emit(const xorqi& i);
// arm intrinsics
void emit(const prefetch& /*i*/) { /* ignored */ }
void emit(const fcvtzs& i) { a->Fcvtzs(X(i.d), D(i.s)); }
void emit(const mrs& i) { a->Mrs(X(i.r), vixl::SystemRegister(i.s.l())); }
void emit(const msr& i) { a->Msr(vixl::SystemRegister(i.s.l()), X(i.r)); }
void emit(const ubfmli& i) { a->ubfm(W(i.d), W(i.s), i.mr.w(), i.ms.w()); }
void emit_nop() { a->Nop(); }
private:
CodeBlock& frozen() { return env.text.frozen().code; }
static void recordAddressImmediate(Venv& env, TCA addr) {
env.meta.addressImmediates.insert(addr);
}
void recordAddressImmediate() {
env.meta.addressImmediates.insert(env.cb->frontier());
}
private:
Venv& env;
vixl::MacroAssembler assem;
vixl::MacroAssembler* a;
Address base;
const Vlabel current;
const Vlabel next;
jit::vector<Venv::LabelPatch>& jmps;
jit::vector<Venv::LabelPatch>& jccs;
jit::vector<Venv::LabelPatch>& catches;
};
///////////////////////////////////////////////////////////////////////////////
static CodeBlock* getBlock(Venv& env, CodeAddress a) {
for (auto const& area : env.text.areas()) {
if (area.code.contains(a)) {
return &area.code;
}
}
return nullptr;
}
void Vgen::emitVeneers(Venv& env) {
auto& meta = env.meta;
decltype(env.meta.veneers) notEmitted;
for (auto const& veneer : meta.veneers) {
auto cb = getBlock(env, veneer.source);
if (!cb) {
// If we can't find the code block, it must have been emitted by a Vunit
// wrapping this one (bindjmp emits a Vunit within a Vunit).
notEmitted.push_back(veneer);
continue;
}
auto const vaddr = cb->frontier();
FTRACE(1, "emitVeneers: source = {}, target = {}, veneer at {}\n",
veneer.source, veneer.target, vaddr);
// Emit the veneer code: LDR + BR.
meta.veneerAddrs.insert(vaddr);
MacroAssembler av{*cb};
vixl::Label target_data;
meta.addressImmediates.insert(vaddr);
poolLiteral(*cb, meta, (uint64_t)makeTarget32(veneer.target), 32, true);
av.bind(&target_data);
av.Ldr(rAsm_w, &target_data);
av.Br(rAsm);
// Update the veneer source instruction to jump/call the veneer.
auto const realSource = env.text.toDestAddress(veneer.source);
CodeBlock tmpBlock;
tmpBlock.init(realSource, kInstructionSize, "emitVeneers");
MacroAssembler at{tmpBlock};
int64_t offset = vaddr - veneer.source;
auto sourceInst = Instruction::Cast(realSource);
if (sourceInst->Mask(UnconditionalBranchMask) == B) {
always_assert(is_int28(offset));
at.b(offset >> kInstructionSizeLog2);
} else if (sourceInst->Mask(UnconditionalBranchMask) == BL) {
always_assert(is_int28(offset));
at.bl(offset >> kInstructionSizeLog2);
} else if (sourceInst->IsCondBranchImm()) {
auto const cond = static_cast<Condition>(sourceInst->ConditionBranch());
if (is_int21(offset)) {
at.b(offset >> kInstructionSizeLog2, cond);
} else {
// The offset doesn't fit in a conditional jump. Hopefully it still fits
// in an unconditional jump, in which case we add an appendix to the
// veneer.
offset += 2 * kInstructionSize;
always_assert(is_int28(offset));
// Add an appendix to the veneer, and jump to it instead. The full
// veneer in this case looks like:
// VENEER:
// LDR RX, LITERAL_ADDR
// BR RX
// APPENDIX:
// B.CC VENEER
// B NEXT
// And the conditional jump into the veneer is turned into a jump to the
// appendix:
// B APPENDIX
// NEXT:
// Turn the original conditional branch into an unconditional one.
at.b(offset >> kInstructionSizeLog2);
// Emit appendix.
auto const appendix = cb->frontier();
av.b(-2 /* veneer starts 2 instructions before the appendix */, cond);
const int64_t nextOffset = (veneer.source + kInstructionSize) - // NEXT
(vaddr + 3 * kInstructionSize); // addr of "B NEXT"
always_assert(is_int28(nextOffset));
av.b(nextOffset >> kInstructionSizeLog2);
// Replace veneer.source with appendix in the relevant metadata.
meta.smashableLocations.erase(veneer.source);
meta.smashableLocations.insert(appendix);
for (auto& tj : meta.inProgressTailJumps) {
if (tj.toSmash() == veneer.source) tj.adjust(appendix);
}
for (auto& bind : meta.smashableBinds) {
if (bind.smashable.toSmash() == veneer.source) {
bind.smashable.adjust(appendix);
}
}
}
} else {
always_assert_flog(0, "emitVeneers: invalid source instruction at source"
" {} (realSource = {})",
veneer.source, realSource);
}
}
env.meta.veneers.swap(notEmitted);
}
void Vgen::handleLiterals(Venv& env) {
decltype(env.meta.literalsToPool) notEmitted;
for (auto const& pl : env.meta.literalsToPool) {
auto const cb = getBlock(env, pl.patchAddress);
if (!cb) {
// If we can't find the code block it must have been emitted by a Vunit
// wrapping this one. (bindjmp emits a Vunit within a Vunit)
notEmitted.push_back(pl);
continue;
}
// Emit the literal.
auto literalAddress = cb->frontier();
if (pl.width == 32) {
cb->dword(static_cast<uint32_t>(pl.value));
} else if (pl.width == 64) {
if (pl.smashable) {
// Although the region is actually dead, we mark it as live, so that
// the relocator can remove the padding.
align(*cb, &env.meta, Alignment::QuadWordSmashable, AlignContext::Live);
literalAddress = cb->frontier();
}
cb->qword(pl.value);
} else {
not_reached();
}
// Patch the LDR.
auto const patchAddressActual =
Instruction::Cast(env.text.toDestAddress(pl.patchAddress));
assertx(patchAddressActual->IsLoadLiteral());
patchAddressActual->SetImmPCOffsetTarget(
Instruction::Cast(literalAddress),
Instruction::Cast(pl.patchAddress));
}
if (env.meta.fallthru) {
auto const fallthru = *env.meta.fallthru;
auto const cb = getBlock(env, fallthru);
if (!cb) {
always_assert_flog(false,
"Fallthrus shouldn't be used in nested Vunits.");
}
auto const blockEndAddr = cb->frontier();
auto const startAddr = cb->toDestAddress(fallthru);
CodeBlock tmp;
tmp.init(startAddr, kInstructionSize, "Tmp");
// Write the jmp.
Assembler a { tmp };
recordAddressImmediate(env, fallthru);
a.b((blockEndAddr - fallthru) >> kInstructionSizeLog2);
}
env.meta.literalsToPool.swap(notEmitted);
}
void Vgen::retargetBinds(Venv& env) {
}
void Vgen::patch(Venv& env) {
// Patch the 32 bit target of the LDR
auto patch = [&env](TCA instr, TCA target) {
// The LDR loading the address to branch to.
auto ldr = Instruction::Cast(instr);
auto const DEBUG_ONLY br = ldr->NextInstruction();
assertx(ldr->Mask(LoadLiteralMask) == LDR_w_lit &&
br->Mask(UnconditionalBranchToRegisterMask) == BR &&
ldr->Rd() == br->Rn());
// The address the LDR loads.
auto targetAddr = ldr->LiteralAddress();
// Patch the 32 bit target following the LDR and BR
patchTarget32(targetAddr, target);
};
for (auto const& p : env.jmps) {
auto addr = env.text.toDestAddress(p.instr);
auto const target = env.addrs[p.target];
assertx(target);
if (env.meta.smashableLocations.count(p.instr)) {
assertx(possiblySmashableJmp(addr));
// Update `addr' to point to the veneer.
addr = TCA(vixl::Instruction::Cast(addr)->ImmPCOffsetTarget());
}
// Patch the address we are jumping to.
patch(addr, target);
}
for (auto const& p : env.jccs) {
auto addr = env.text.toDestAddress(p.instr);
auto const target = env.addrs[p.target];
assertx(target);
if (env.meta.smashableLocations.count(p.instr)) {
assertx(possiblySmashableJcc(addr));
// Update `addr' to point to the veneer.
addr = TCA(vixl::Instruction::Cast(addr)->ImmPCOffsetTarget());
} else {
assertx(Instruction::Cast(addr)->IsCondBranchImm());
// If the jcc starts with a conditional jump, patch the next instruction
// (which should start with a LDR).
addr += kInstructionSize;
}
patch(addr, target);
}
for (auto const& p : env.leas) {
(void)p;
not_implemented();
}
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const copy& i) {
if (i.s == i.d) return;
if (i.s.isGP() && i.d.isGP()) {
a->Mov(X(i.d), X(i.s));
} else if (i.s.isSIMD() && i.d.isGP()) {
a->Fmov(X(i.d), D(i.s));
} else if (i.s.isGP() && i.d.isSIMD()) {
a->Fmov(D(i.d), X(i.s));
} else {
assertx(i.s.isSIMD() && i.d.isSIMD());
a->mov(V(i.d), V(i.s));
}
}
void Vgen::emit(const copy2& i) {
assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
assertx(d0 != d1);
if (d0 == s1) {
if (d1 == s0) {
a->Eor(X(d0), X(d0), X(s0));
a->Eor(X(s0), X(d0), X(s0));
a->Eor(X(d0), X(d0), X(s0));
} else {
// could do this in a simplify pass
      if (s1 != d1) a->Mov(X(d1), X(s1)); // save s1 first; d1 != s0
      if (s0 != d0) a->Mov(X(d0), X(s0));
}
} else {
// could do this in a simplify pass
    if (s0 != d0) a->Mov(X(d0), X(s0));
    if (s1 != d1) a->Mov(X(d1), X(s1));
}
}
void emitSimdImmInt(vixl::MacroAssembler* a, uint64_t val, Vreg d) {
// Assembler::fmov emits a ldr from a literal pool if IsImmFP64 is false.
  // In that case, emit the raw bits into a GPR first and then move them
  // unmodified into the destination SIMD register.
union { double dval; uint64_t ival; };
ival = val;
if (vixl::Assembler::IsImmFP64(dval)) {
a->Fmov(D(d), dval);
} else if (ival == 0) {
a->Fmov(D(d), vixl::xzr);
} else {
a->Mov(rAsm, ival);
a->Fmov(D(d), rAsm);
}
}
void Vgen::emit(const fallthru& /*i*/) {
always_assert(!env.meta.fallthru);
env.meta.fallthru = a->frontier();
a->nop();
}
#define Y(vasm_opc, simd_w, vr_w, gpr_w, imm) \
void Vgen::emit(const vasm_opc& i) { \
if (i.d.isSIMD()) { \
emitSimdImmInt(a, static_cast<uint##vr_w##_t>(i.s.simd_w()), i.d); \
} else { \
Vreg##vr_w d = i.d; \
a->Mov(gpr_w(d), imm); \
} \
}
Y(ldimmb, ub, 8, W, i.s.ub())
Y(ldimmw, uw, 16, W, i.s.uw())
Y(ldimml, l, 32, W, i.s.l())
Y(ldimmq, q, 64, X, i.s.q())
#undef Y
void Vgen::emit(const load& i) {
if (i.d.isGP()) {
a->Ldr(X(i.d), M(i.s));
} else {
a->Ldr(D(i.d), M(i.s));
}
}
void Vgen::emit(const store& i) {
if (i.s.isGP()) {
if (i.s == arm::rsp()) {
a->Mov(rAsm, X(i.s));
a->Str(rAsm, M(i.d));
} else {
a->Str(X(i.s), M(i.d));
}
} else {
a->Str(D(i.s), M(i.d));
}
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const mcprep& i) {
/*
* Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
* address of the movq) so that we can find the movq from the handler.
*
* We set the low bit for two reasons: the Class* will never be a valid
* Class*, so we'll always miss the inline check before it's smashed, and
   * MethodCache::handleStaticCall can tell it hasn't been smashed yet.
*/
align(*env.cb, &env.meta, Alignment::SmashMovq, AlignContext::Live);
auto const imm = reinterpret_cast<uint64_t>(a->frontier());
emitSmashableMovq(*env.cb, env.meta, (imm << 1) | 1, r64(i.d));
env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
}
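/*
 * A sketch (assumption about the handler side, which lives elsewhere) of
 * how the encoding above can be undone to locate the movq:
 *
 *   if (rawValue & 1) {                    // cache not smashed yet
 *     auto const movAddr = TCA(rawValue >> 1);
 *     // ... smash the movq at movAddr with the real cache entry ...
 *   }
 */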
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const call& i) {
recordAddressImmediate();
a->Mov(rAsm, i.target);
a->Blr(rAsm);
if (i.watch) {
*i.watch = a->frontier();
env.meta.watchpoints.push_back(i.watch);
}
}
void Vgen::emit(const calls& i) {
emitSmashableCall(*env.cb, env.meta, i.target);
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const callstub& i) {
emit(call{i.target, i.args});
}
void Vgen::emit(const callfaststub& i) {
emit(call{i.target, i.args});
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const phpret& i) {
// prefer load-pair instruction
if (!i.noframe) {
a->ldp(X(arm::rvmfp()), X(rlr()), X(i.fp)[AROFF(m_sfp)]);
} else {
a->Ldr(X(rlr()), X(i.fp)[AROFF(m_savedRip)]);
}
emit(ret{});
}
void Vgen::emit(const contenter& i) {
vixl::Label stub, end;
// Jump past the stub below.
recordAddressImmediate();
a->B(&end);
// We call into this stub from the end below. Take that LR and store it in
// m_savedRip. Then jump to the target.
a->bind(&stub);
a->Str(X(rlr()), M(i.fp[AROFF(m_savedRip)]));
a->Br(X(i.target));
// Call to stub above and then unwind.
a->bind(&end);
recordAddressImmediate();
a->Bl(&stub);
emit(unwind{{i.targets[0], i.targets[1]}});
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const leavetc& /*i*/) {
// The LR was preserved on the stack by resumetc. Pop it while preserving
// SP alignment and return.
a->Ldp(rAsm, X(rlr()), MemOperand(sp, 16, PostIndex));
a->Ret();
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const nothrow& /*i*/) {
env.meta.catches.emplace_back(a->frontier(), nullptr);
}
void Vgen::emit(const syncpoint& i) {
FTRACE(5, "IR recordSyncPoint: {} {}\n", a->frontier(), i.fix.show());
env.meta.fixups.emplace_back(a->frontier(), i.fix);
env.record_inline_stack(a->frontier());
}
void Vgen::emit(const unwind& i) {
catches.push_back({a->frontier(), i.targets[1]});
env.record_inline_stack(a->frontier());
emit(jmp{i.targets[0]});
}
///////////////////////////////////////////////////////////////////////////////
/*
* Flags
* SF should be set to MSB of the result
* CF, OF should be set to (1, 1) if the result is truncated, (0, 0) otherwise
* ZF, AF, PF are undefined
*
* In the following implementation,
* N, Z, V are updated according to result
* C is cleared (FIXME)
*/
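/*
 * Worked example of the overflow check below (illustrative only):
 *
 *   0x4000000000000000 * 2:
 *     low 64 bits  (Mul):   0x8000000000000000   (sign bit set)
 *     high 64 bits (Smulh): 0x0000000000000000   (all zeroes)
 *
 * The high half is all zeroes, but its sign disagrees with the low
 * half's sign bit, so the Eor/Tbnz path below classifies the result as
 * overflowed and sets V.
 */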
void Vgen::emit(const imul& i) {
// Do the multiplication
a->Mul(X(i.d), X(i.s0), X(i.s1));
// If we have to set any flags, then always set N and Z since it's cheap.
// Only set V when absolutely necessary. C is not supported.
if (i.fl) {
vixl::Label after;
checkSF(i, StatusFlags::NotC);
if (flagRequired(i.fl, StatusFlags::V)) {
vixl::Label checkSign;
vixl::Label Overflow;
// Do the multiplication for the upper 64 bits of a 128 bit result.
// If the result is not all zeroes or all ones, then we have overflow.
// If the result is all zeroes or all ones, and the sign is the same,
// for both hi and low, then there is no overflow.
a->smulh(rAsm, X(i.s0), X(i.s1));
// If hi is all 0's or 1's, then check the sign, else overflow
// (fallthrough).
recordAddressImmediate();
a->Cbz(rAsm, &checkSign);
a->Cmp(rAsm, -1);
recordAddressImmediate();
a->B(&checkSign, vixl::eq);
// Overflow, so conditionally set N and Z bits and then or in V bit.
a->Bind(&Overflow);
a->Bic(vixl::xzr, X(i.d), vixl::xzr, SetFlags);
a->Mrs(rAsm, NZCV);
a->Orr(rAsm, rAsm, 1<<28);
a->Msr(NZCV, rAsm);
recordAddressImmediate();
a->B(&after);
// Check the signs of hi and lo.
a->Bind(&checkSign);
a->Eor(rAsm, rAsm, X(i.d));
recordAddressImmediate();
a->Tbnz(rAsm, 63, &Overflow);
}
// No Overflow, so conditionally set the N and Z only
a->Bic(vixl::xzr, X(i.d), vixl::xzr, SetFlags);
a->bind(&after);
}
}
void Vgen::emit(const decqmlock& i) {
auto adr = M(i.m);
/* Use VIXL's macroassembler scratch regs. */
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg);
if (RuntimeOption::EvalJitArmLse) {
a->Mov(rVixlScratch0, -1);
a->ldaddal(rVixlScratch0, rVixlScratch0, adr);
a->Sub(rAsm, rVixlScratch0, 1, SetFlags);
} else {
vixl::Label again;
a->bind(&again);
a->ldxr(rAsm, adr);
a->Sub(rAsm, rAsm, 1, SetFlags);
a->stxr(rVixlScratch0, rAsm, adr);
recordAddressImmediate();
a->Cbnz(rVixlScratch0, &again);
}
/* Restore VIXL's scratch regs. */
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1);
}
void Vgen::emit(const jcc& i) {
if (i.targets[1] != i.targets[0]) {
if (next == i.targets[1]) {
return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
}
auto taken = i.targets[1];
jccs.push_back({a->frontier(), taken});
vixl::Label skip, data;
// Emit a "far JCC" sequence for easy patching later. Static relocation
// might be able to simplify this later (see optimizeFarJcc()).
recordAddressImmediate();
a->B(&skip, vixl::InvertCondition(C(i.cc)));
recordAddressImmediate();
poolLiteral(*env.cb, env.meta, (uint64_t)makeTarget32(a->frontier()),
32, false);
    a->bind(&data); // This will be remapped during the handleLiterals phase.
a->Ldr(rAsm_w, &data);
a->Br(rAsm);
a->bind(&skip);
}
emit(jmp{i.targets[0]});
}
void Vgen::emit(const jcci& i) {
vixl::Label skip;
recordAddressImmediate();
a->B(&skip, vixl::InvertCondition(C(i.cc)));
emit(jmpi{i.taken});
a->bind(&skip);
}
void Vgen::emit(const jmp& i) {
if (next == i.target) return;
jmps.push_back({a->frontier(), i.target});
vixl::Label data;
// Emit a "far JMP" sequence for easy patching later. Static relocation
// might be able to simplify this (see optimizeFarJmp()).
recordAddressImmediate();
poolLiteral(*env.cb, env.meta, (uint64_t)a->frontier(), 32, false);
a->bind(&data); // This will be remapped during the handleLiterals phase.
a->Ldr(rAsm_w, &data);
a->Br(rAsm);
}
void Vgen::emit(const jmpi& i) {
vixl::Label data;
// If target can be addressed by pc relative offset (signed 26 bits), emit
// PC relative jump. Else, emit target address into code and load from there.
auto diff = (i.target - a->frontier()) >> vixl::kInstructionSizeLog2;
if (vixl::is_int26(diff)) {
recordAddressImmediate();
a->b(diff);
} else {
// Cannot use simple a->Mov() since such a sequence cannot be
// adjusted while live following a relocation.
recordAddressImmediate();
poolLiteral(*env.cb, env.meta, (uint64_t)i.target, 32, false);
a->bind(&data); // This will be remapped during the handleLiterals phase.
a->Ldr(rAsm_w, &data);
a->Br(rAsm);
}
}
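/*
 * Example (illustrative): since the b instruction encodes a signed 26-bit
 * instruction count, a target within roughly +/-128MB of the frontier gets
 * a single pc-relative branch; anything farther away falls into the
 * pooled-literal LDR/BR sequence above.
 */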
void Vgen::emit(const ldbindretaddr& i) {
auto const addr = a->frontier();
emit(leap{reg::rip[(intptr_t)addr], i.d});
env.ldbindretaddrs.push_back({addr, i.target, i.spOff});
}
void Vgen::emit(const lea& i) {
auto p = i.s;
assertx(p.base.isValid());
if (p.index.isValid()) {
assertx(p.disp == 0);
a->Add(X(i.d), X(p.base), Operand(X(p.index), LSL, Log2(p.scale)));
} else {
a->Add(X(i.d), X(p.base), p.disp);
}
}
void Vgen::emit(const leav& i) {
auto const addr = a->frontier();
emit(leap{reg::rip[(intptr_t)addr], i.d});
env.leas.push_back({addr, i.s});
}
void Vgen::emit(const leap& i) {
vixl::Label imm_data;
vixl::Label after_data;
// Cannot use simple a->Mov() since such a sequence cannot be
// adjusted while live following a relocation.
recordAddressImmediate();
poolLiteral(*env.cb, env.meta, (uint64_t)makeTarget32(i.s.r.disp),
32, false);
a->bind(&imm_data); // This will be remapped during the handleLiterals phase.
a->Ldr(W(i.d), &imm_data);
}
void Vgen::emit(const lead& i) {
recordAddressImmediate();
a->Mov(X(i.d), i.s.get());
}
#define Y(vasm_opc, arm_opc, src_dst, m) \
void Vgen::emit(const vasm_opc& i) { \
assertx(i.m.base.isValid()); \
a->Mov(rAsm, X(i.m.base)); \
if (i.m.index.isValid()) { \
a->Add(rAsm, rAsm, Operand(X(i.m.index), LSL, Log2(i.m.scale))); \
} \
if (i.m.disp != 0) { \
a->Add(rAsm, rAsm, i.m.disp); \
} \
a->arm_opc(V(i.src_dst), MemOperand(rAsm)); \
}
Y(loadups, ld1, d, s)
Y(storeups, st1, s, m)
#undef Y
/*
* Flags
* SF, ZF, PF should be updated according to result
* CF, OF should be cleared
* AF is undefined
*
* In the following implementation,
* N, Z are updated according to result
* C, V are cleared
*/
#define Y(vasm_opc, arm_opc, gpr_w, s0, zr) \
void Vgen::emit(const vasm_opc& i) { \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), s0); \
if (i.fl) { \
a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
} \
}
Y(orbi, Orr, W, i.s0.ub(), wzr);
Y(orwi, Orr, W, i.s0.uw(), wzr);
Y(orli, Orr, W, i.s0.l(), wzr);
Y(orqi, Orr, X, i.s0.q(), xzr);
Y(orq, Orr, X, X(i.s0), xzr);
Y(xorb, Eor, W, W(i.s0), wzr);
Y(xorbi, Eor, W, i.s0.ub(), wzr);
Y(xorw, Eor, W, W(i.s0), wzr);
Y(xorwi, Eor, W, i.s0.uw(), wzr);
Y(xorl, Eor, W, W(i.s0), wzr);
Y(xorq, Eor, X, X(i.s0), xzr);
Y(xorqi, Eor, X, i.s0.q(), xzr);
#undef Y
void Vgen::emit(const pop& i) {
// SP access must be 8 byte aligned. Use rAsm instead.
a->Mov(rAsm, sp);
a->Ldr(X(i.d), MemOperand(rAsm, 8, PostIndex));
a->Mov(sp, rAsm);
}
void Vgen::emit(const push& i) {
// SP access must be 8 byte aligned. Use rAsm instead.
a->Mov(rAsm, sp);
a->Str(X(i.s), MemOperand(rAsm, -8, PreIndex));
a->Mov(sp, rAsm);
}
void Vgen::emit(const roundsd& i) {
switch (i.dir) {
case RoundDirection::nearest: {
a->frintn(D(i.d), D(i.s));
break;
}
case RoundDirection::floor: {
a->frintm(D(i.d), D(i.s));
break;
}
    case RoundDirection::ceil: {
a->frintp(D(i.d), D(i.s));
break;
}
default: {
assertx(i.dir == RoundDirection::truncate);
a->frintz(D(i.d), D(i.s));
}
}
}
void Vgen::emit(const srem& i) {
a->Sdiv(rAsm, X(i.s0), X(i.s1));
a->Msub(X(i.d), rAsm, X(i.s1), X(i.s0));
}
void Vgen::emit(const trap& i) {
env.meta.trapReasons.emplace_back(a->frontier(), i.reason);
a->Brk(1);
}
void Vgen::emit(const unpcklpd& i) {
  // i.d and i.s1 may be the same register; i.s0 must not alias i.d.
if (i.d != i.s1) a->fmov(D(i.d), D(i.s1));
a->fmov(rAsm, D(i.s0));
a->fmov(D(i.d), 1, rAsm);
}
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const cmpsd& i) {
  /*
   * cmpsd does not def SF, so preserve the status flags by reading them
   * into a temp. Use one of the macroassembler scratch regs.
   */
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg);
a->Mrs(rVixlScratch0, NZCV);
a->Fcmp(D(i.s0), D(i.s1));
switch (i.pred) {
case ComparisonPred::eq_ord:
a->Csetm(rAsm, C(jit::CC_E));
break;
case ComparisonPred::ne_unord:
a->Csetm(rAsm, C(jit::CC_NE));
break;
default:
always_assert(false);
}
a->Fmov(D(i.d), rAsm);
/* Copy the flags back to the system register. */
a->Msr(NZCV, rVixlScratch0);
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1);
}
///////////////////////////////////////////////////////////////////////////////
/*
* For the shifts:
*
* C is set through inspection
* N, Z are updated according to result
* V is cleared (FIXME)
* PF, AF are not available
*
* Only set the flags if there are any required flags (i.fl).
* Setting the C flag is particularly expensive, so when setting
* flags check this flag specifically.
*/
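/*
 * Sketch of the C reconstruction used in the macros below (register names
 * are placeholders): for sar{s0 = 3, s1 = 0b1100, d, sf} the last bit
 * shifted out is bit 2 of s1, which is recomputed as
 *
 *   lsr  tmp, s1, #(3 - 1)       // bit 0 of tmp == last bit shifted out
 *   mvn  tmp, tmp                // inverted, per the "match X64" note
 *   bfm  flags, tmp, #35, #0     // place bit 0 of tmp into bit 29 (C)
 *
 * before the combined flags are written back via msr NZCV.
 */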
#define Y(vasm_opc, arm_opc, gpr_w, zr) \
void Vgen::emit(const vasm_opc& i) { \
if (!i.fl) { \
/* Just perform the shift. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
} else { \
checkSF(i, StatusFlags::NotV); \
if (!flagRequired(i.fl, StatusFlags::C)) { \
/* Perform the shift and set N and Z. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
} else { \
/* Use VIXL's macroassembler scratch regs. */ \
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
/* Perform the shift using temp and set N and Z. */ \
a->arm_opc(rVixlScratch0, gpr_w(i.s1), gpr_w(i.s0)); \
a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
/* Read the flags into a temp. */ \
a->Mrs(rAsm, NZCV); \
/* Reshift right leaving the last bit as bit 0. */ \
a->Sub(rVixlScratch1, gpr_w(i.s0), 1); \
a->Lsr(rVixlScratch1, gpr_w(i.s1), rVixlScratch1); \
/* Negate the bits, including bit 0 to match X64. */ \
a->Mvn(rVixlScratch1, rVixlScratch1); \
/* Copy bit zero into bit 29 of the flags. */ \
a->bfm(rAsm, rVixlScratch1, 35, 0); \
/* Copy the flags back to the system register. */ \
a->Msr(NZCV, rAsm); \
/* Copy the result to the destination. */ \
a->Mov(gpr_w(i.d), rVixlScratch0); \
/* Restore VIXL's scratch regs. */ \
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
} \
} \
}
Y(sar, Asr, X, xzr)
#undef Y
#define Y(vasm_opc, arm_opc, gpr_w, sz, zr) \
void Vgen::emit(const vasm_opc& i) { \
if (!i.fl) { \
/* Just perform the shift. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
} else { \
checkSF(i, StatusFlags::NotV); \
if (!flagRequired(i.fl, StatusFlags::C)) { \
/* Perform the shift and set N and Z. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), gpr_w(i.s0)); \
a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
} else { \
/* Use VIXL's macroassembler scratch regs. */ \
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
/* Perform the shift using temp and set N and Z. */ \
a->arm_opc(rVixlScratch0, gpr_w(i.s1), gpr_w(i.s0)); \
a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
/* Read the flags into a temp. */ \
a->Mrs(rAsm, NZCV); \
/* Reshift right leaving the last bit as bit 0. */ \
a->Mov(rVixlScratch1, sz); \
a->Sub(rVixlScratch1, rVixlScratch1, gpr_w(i.s0)); \
a->Lsr(rVixlScratch1, gpr_w(i.s1), rVixlScratch1); \
/* Negate the bits, including bit 0 to match X64. */ \
a->Mvn(rVixlScratch1, rVixlScratch1); \
/* Copy bit zero into bit 29 of the flags. */ \
a->bfm(rAsm, rVixlScratch1, 35, 0); \
/* Copy the flags back to the system register. */ \
a->Msr(NZCV, rAsm); \
/* Copy the result to the destination. */ \
a->Mov(gpr_w(i.d), rVixlScratch0); \
/* Restore VIXL's scratch regs. */ \
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
} \
} \
}
Y(shl, Lsl, X, 64, xzr)
#undef Y
#define Y(vasm_opc, arm_opc, gpr_w, zr) \
void Vgen::emit(const vasm_opc& i) { \
if (!i.fl) { \
/* Just perform the shift. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
} else { \
checkSF(i, StatusFlags::NotV); \
if (!flagRequired(i.fl, StatusFlags::C)) { \
/* Perform the shift and set N and Z. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
} else { \
/* Use VIXL's macroassembler scratch regs. */ \
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
/* Perform the shift using temp and set N and Z. */ \
a->arm_opc(rVixlScratch0, gpr_w(i.s1), i.s0.l()); \
a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
/* Read the flags into a temp. */ \
a->Mrs(rAsm, NZCV); \
/* Reshift right leaving the last bit as bit 0. */ \
a->Lsr(rVixlScratch1, gpr_w(i.s1), i.s0.l() - 1); \
/* Negate the bits, including bit 0 to match X64. */ \
a->Mvn(rVixlScratch1, rVixlScratch1); \
/* Copy bit zero into bit 29 of the flags. */ \
a->bfm(rAsm, rVixlScratch1, 35, 0); \
/* Copy the flags back to the system register. */ \
a->Msr(NZCV, rAsm); \
/* Copy the result to the destination. */ \
a->Mov(gpr_w(i.d), rVixlScratch0); \
/* Restore VIXL's scratch regs. */ \
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
} \
} \
}
Y(sarqi, Asr, X, xzr)
Y(shrli, Lsr, W, wzr)
Y(shrqi, Lsr, X, xzr)
#undef Y
#define Y(vasm_opc, arm_opc, gpr_w, sz, zr) \
void Vgen::emit(const vasm_opc& i) { \
if (!i.fl) { \
/* Just perform the shift. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
} else { \
checkSF(i, StatusFlags::NotV); \
if (!flagRequired(i.fl, StatusFlags::C)) { \
/* Perform the shift and set N and Z. */ \
a->arm_opc(gpr_w(i.d), gpr_w(i.s1), i.s0.l()); \
a->Bic(vixl::zr, gpr_w(i.d), vixl::zr, SetFlags); \
} else { \
/* Use VIXL's macroassembler scratch regs. */ \
a->SetScratchRegisters(vixl::NoReg, vixl::NoReg); \
/* Perform the shift using temp and set N and Z. */ \
a->arm_opc(rVixlScratch0, gpr_w(i.s1), i.s0.l()); \
a->Bic(vixl::zr, rVixlScratch0, vixl::zr, SetFlags); \
/* Read the flags into a temp. */ \
a->Mrs(rAsm, NZCV); \
/* Reshift right leaving the last bit as bit 0. */ \
a->Lsr(rVixlScratch1, gpr_w(i.s1), sz - i.s0.l()); \
/* Negate the bits, including bit 0 to match X64. */ \
a->Mvn(rVixlScratch1, rVixlScratch1); \
/* Copy bit zero into bit 29 of the flags. */ \
a->bfm(rAsm, rVixlScratch1, 35, 0); \
/* Copy the flags back to the system register. */ \
a->Msr(NZCV, rAsm); \
/* Copy the result to the destination. */ \
a->Mov(gpr_w(i.d), rVixlScratch0); \
/* Restore VIXL's scratch regs. */ \
a->SetScratchRegisters(rVixlScratch0, rVixlScratch1); \
} \
} \
}
Y(shlli, Lsl, W, 32, wzr)
Y(shlqi, Lsl, X, 64, xzr)
#undef Y
///////////////////////////////////////////////////////////////////////////////
void Vgen::emit(const popp& i) {
a->Ldp(X(i.d0), X(i.d1), MemOperand(sp, 16, PostIndex));
}
void Vgen::emit(const pushp& i) {
a->Stp(X(i.s1), X(i.s0), MemOperand(sp, -16, PreIndex));
}
///////////////////////////////////////////////////////////////////////////////
template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}
template <typename Inst>
void lower(const VLS& /*env*/, Inst& /*inst*/, Vlabel /*b*/, size_t /*i*/) {}
///////////////////////////////////////////////////////////////////////////////
/*
* TODO: Using load size (ldr[bh]?), apply scaled address if 'disp' is unsigned
*/
void lowerVptr(Vptr& p, Vout& v) {
enum {
BASE = 1,
INDEX = 2,
DISP = 4
};
uint8_t mode = (((p.base.isValid() & 0x1) << 0) |
((p.index.isValid() & 0x1) << 1) |
(((p.disp != 0) & 0x1) << 2));
switch (mode) {
case BASE:
case BASE | INDEX:
// ldr/str allow [base] and [base, index], nothing to lower.
break;
case INDEX:
// Not supported, convert to [base].
if (p.scale > 1) {
auto t = v.makeReg();
v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
p.base = t;
} else {
p.base = p.index;
}
p.index = Vreg{};
p.scale = 1;
break;
case BASE | DISP: {
// ldr/str allow [base, #imm], where #imm is [-256 .. 255].
if (p.disp >= -256 && p.disp <= 255)
break;
// #imm is out of range, convert to [base, index]
auto index = v.makeReg();
v << ldimmq{Immed64(p.disp), index};
p.index = index;
p.scale = 1;
p.disp = 0;
break;
}
case DISP: {
// Not supported, convert to [base].
auto base = v.makeReg();
v << ldimmq{Immed64(p.disp), base};
p.base = base;
p.index = Vreg{};
p.scale = 1;
p.disp = 0;
break;
}
case INDEX | DISP:
// Not supported, convert to [base, #imm] or [base, index].
if (p.scale > 1) {
auto t = v.makeReg();
v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
p.base = t;
} else {
p.base = p.index;
}
if (p.disp >= -256 && p.disp <= 255) {
p.index = Vreg{};
p.scale = 1;
} else {
auto index = v.makeReg();
v << ldimmq{Immed64(p.disp), index};
p.index = index;
p.scale = 1;
p.disp = 0;
}
break;
case BASE | INDEX | DISP: {
// Not supported, convert to [base, index].
auto index = v.makeReg();
if (p.scale > 1) {
auto t = v.makeReg();
v << shlqi{Log2(p.scale), p.index, t, v.makeReg()};
v << addqi{p.disp, t, index, v.makeReg()};
} else {
v << addqi{p.disp, p.index, index, v.makeReg()};
}
p.index = index;
p.scale = 1;
p.disp = 0;
break;
}
}
}
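/*
 * Example (illustrative): a Vptr of the form base + index * 8 + disp with a
 * non-zero disp is not directly encodable by ldr/str, so it is rewritten as
 *
 *   shlqi  3, index, t, sf0            // t = index << 3
 *   addqi  disp, t, newIndex, sf1      // fold disp into the index
 *
 * leaving [base, newIndex] for the memory instruction, whereas a plain
 * base + disp with disp in [-256 .. 255] is left untouched.
 */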
#define Y(vasm_opc, m) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
lowerVptr(i.m, v); \
v << i; \
}); \
}
Y(decqmlock, m)
Y(lea, s)
Y(load, s)
Y(loadb, s)
Y(loadl, s)
Y(loadsd, s)
Y(loadtqb, s)
Y(loadtql, s)
Y(loadups, s)
Y(loadw, s)
Y(loadzbl, s)
Y(loadzbq, s)
Y(loadzlq, s)
Y(store, d)
Y(storeb, m)
Y(storel, m)
Y(storesd, m)
Y(storeups, m)
Y(storew, m)
#undef Y
#define Y(vasm_opc, lower_opc, load_opc, store_opc, arg, m) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
lowerVptr(i.m, v); \
auto r0 = v.makeReg(), r1 = v.makeReg(); \
v << load_opc{i.m, r0}; \
v << lower_opc{arg, r0, r1, i.sf, i.fl}; \
v << store_opc{r1, i.m}; \
}); \
}
Y(addlim, addli, loadl, storel, i.s0, m)
Y(addlm, addl, loadl, storel, i.s0, m)
Y(addwm, addl, loadw, storew, Reg32(i.s0), m)
Y(addqim, addqi, load, store, i.s0, m)
Y(andbim, andbi, loadb, storeb, i.s, m)
Y(subqim, subqi, load, store, i.s0, m)
Y(orbim, orqi, loadb, storeb, i.s0, m)
Y(orqim, orqi, load, store, i.s0, m)
Y(orwim, orqi, loadw, storew, i.s0, m)
Y(orlim, orqi, loadl, storel, i.s0, m)
#undef Y
#define Y(vasm_opc, lower_opc, movs_opc) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
if (!i.fl || (i.fl & static_cast<Vflags>(StatusFlags::NV))) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
auto r0 = v.makeReg(), r1 = v.makeReg(); \
v << movs_opc{i.s0, r0}; \
v << movs_opc{i.s1, r1}; \
v << lower_opc{r0, r1, i.sf, i.fl}; \
}); \
} \
}
Y(cmpb, cmpl, movsbl)
Y(cmpw, cmpl, movswl)
#undef Y
#define Y(vasm_opc, lower_opc, movs_opc) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
if (!i.fl || (i.fl & static_cast<Vflags>(StatusFlags::NV))) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
auto r = v.makeReg(); \
v << movs_opc{i.s1, r}; \
v << lower_opc{i.s0, r, i.sf, i.fl}; \
}); \
} \
}
Y(cmpbi, cmpli, movsbl)
Y(cmpwi, cmpli, movswl)
#undef Y
#define Y(vasm_opc, lower_opc, load_opc) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
lowerVptr(i.s1, v); \
auto r = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
v << load_opc{i.s1, r}; \
v << lower_opc{i.s0, r, i.sf, i.fl}; \
}); \
}
Y(cmpbim, cmpbi, loadb)
Y(cmplim, cmpli, loadl)
Y(cmpbm, cmpb, loadb)
Y(cmpwm, cmpw, loadw)
Y(cmplm, cmpl, loadl)
Y(cmpqim, cmpqi, load)
Y(cmpqm, cmpq, load)
Y(cmpwim, cmpwi, loadw)
Y(testbim, testli, loadb)
Y(testlim, testli, loadl)
Y(testqim, testqi, load)
Y(testbm, testb, loadb)
Y(testwm, testw, loadw)
Y(testlm, testl, loadl)
Y(testqm, testq, load)
Y(testwim, testli, loadw)
#undef Y
void lower(const VLS& e, cvtsi2sdm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
lowerVptr(i.s, v);
auto r = v.makeReg();
v << load{i.s, r};
v << cvtsi2sd{r, i.d};
});
}
#define Y(vasm_opc, lower_opc, load_opc, store_opc, m) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
lowerVptr(i.m, v); \
auto r0 = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
auto r1 = e.allow_vreg() ? v.makeReg() : Vreg(PhysReg(rAsm)); \
v << load_opc{i.m, r0}; \
v << lower_opc{r0, r1, i.sf, i.fl}; \
v << store_opc{r1, i.m}; \
}); \
}
Y(declm, decl, loadl, storel, m)
Y(decqm, decq, load, store, m)
Y(inclm, incl, loadl, storel, m)
Y(incqm, incq, load, store, m)
Y(incwm, incw, loadw, storew, m)
#undef Y
void lower(const VLS& e, cvttsd2siq& i, Vlabel b, size_t idx) {
lower_impl(e.unit, b, idx, [&] (Vout& v) {
// Clear FPSR IOC flag.
auto const tmp1 = v.makeReg();
auto const tmp2 = v.makeReg();
v << mrs{FPSR, tmp1};
v << andqi{~0x01, tmp1, tmp2, v.makeReg()};
v << msr{tmp2, FPSR};
// Load error value.
auto const err = v.makeReg();
v << ldimmq{0x8000000000000000, err};
// Do ARM64's double to signed int64 conversion.
auto const res = v.makeReg();
v << fcvtzs{i.s, res};
// Check if there was a conversion error.
auto const fpsr = v.makeReg();
auto const sf = v.makeReg();
v << mrs{FPSR, fpsr};
v << testqi{1, fpsr, sf};
// Move converted value or error.
v << cmovq{CC_NZ, sf, res, err, i.d};
});
}
void lower(const VLS& e, callm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
lowerVptr(i.target, v);
auto const scratch = v.makeReg();
// Load the target from memory and then call it.
v << load{i.target, scratch};
v << callr{scratch, i.args};
});
}
void lower(const VLS& e, jmpm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
lowerVptr(i.target, v);
auto const scratch = v.makeReg();
v << load{i.target, scratch};
v << jmpr{scratch, i.args};
});
}
///////////////////////////////////////////////////////////////////////////////
void lower(const VLS& e, stublogue& /*i*/, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Push both the LR and FP regardless of i.saveframe to align SP.
v << pushp{rlr(), arm::rvmfp()};
});
}
void lower(const VLS& e, unstublogue& /*i*/, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Pop LR and remove FP from the stack.
v << popp{PhysReg(rAsm), rlr()};
});
}
void lower(const VLS& e, stubret& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Pop LR and (optionally) FP.
if (i.saveframe) {
v << popp{arm::rvmfp(), rlr()};
} else {
v << popp{PhysReg(rAsm), rlr()};
}
v << ret{i.args};
});
}
void lower(const VLS& e, tailcallstub& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Restore LR from native stack and adjust SP.
v << popp{PhysReg(rAsm), rlr()};
// Then directly jump to the target.
v << jmpi{i.target, i.args};
});
}
void lower(const VLS& e, tailcallstubr& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Restore LR from native stack and adjust SP.
v << popp{PhysReg(rAsm), rlr()};
v << jmpr{i.target, i.args};
});
}
void lower(const VLS& e, stubunwind& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Pop the call frame.
v << popp{PhysReg(rAsm), i.d};
});
}
void lower(const VLS& e, stubtophp& /*i*/, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Pop the call frame
v << lea{arm::rsp()[16], arm::rsp()};
});
}
void lower(const VLS& e, loadstubret& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Load the LR to the destination.
v << load{arm::rsp()[AROFF(m_savedRip)], i.d};
});
}
///////////////////////////////////////////////////////////////////////////////
void lower(const VLS& e, phplogue& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
v << store{rlr(), i.fp[AROFF(m_savedRip)]};
});
}
///////////////////////////////////////////////////////////////////////////////
void lower(const VLS& e, resumetc& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
// Call the translation target.
v << callr{i.target, i.args};
// After returning to the translation, jump directly to the exit.
v << jmpi{i.exittc};
});
}
///////////////////////////////////////////////////////////////////////////////
void lower(const VLS& e, popm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto r = v.makeReg();
v << pop{r};
lowerVptr(i.d, v);
v << store{r, i.d};
});
}
void lower(const VLS& e, poppm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto r0 = v.makeReg();
auto r1 = v.makeReg();
v << popp{r0, r1};
lowerVptr(i.d0, v);
lowerVptr(i.d1, v);
v << store{r0, i.d0};
v << store{r1, i.d1};
});
}
void lower(const VLS& e, pushm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto r = v.makeReg();
lowerVptr(i.s, v);
v << load{i.s, r};
v << push{r};
});
}
void lower(const VLS& e, pushpm& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto r0 = v.makeReg();
auto r1 = v.makeReg();
lowerVptr(i.s0, v);
lowerVptr(i.s1, v);
v << load{i.s0, r0};
v << load{i.s1, r1};
v << pushp{r0, r1};
});
}
template<typename movz>
void lower_movz(const VLS& e, movz& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
v << copy{i.s, i.d};
});
}
void lower(const VLS& e, movzbw& i, Vlabel b, size_t z) {
lower_movz(e, i, b, z);
}
void lower(const VLS& e, movzbl& i, Vlabel b, size_t z) {
lower_movz(e, i, b, z);
}
void lower(const VLS& e, movzwl& i, Vlabel b, size_t z) {
lower_movz(e, i, b, z);
}
void lower(const VLS& e, movtdb& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto d = v.makeReg();
v << copy{i.s, d};
v << movtqb{d, i.d};
});
}
void lower(const VLS& e, movtdq& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
v << copy{i.s, i.d};
});
}
#define Y(vasm_opc, lower_opc, load_opc, imm, zr, sz) \
void lower(const VLS& e, vasm_opc& i, Vlabel b, size_t z) { \
lower_impl(e.unit, b, z, [&] (Vout& v) { \
lowerVptr(i.m, v); \
if (imm.sz() == 0u) { \
v << lower_opc{PhysReg(vixl::zr), i.m}; \
} else { \
auto r = v.makeReg(); \
v << load_opc{imm, r}; \
v << lower_opc{r, i.m}; \
} \
}); \
}
Y(storebi, storeb, ldimmb, i.s, wzr, b)
Y(storewi, storew, ldimmw, i.s, wzr, w)
Y(storeli, storel, ldimml, i.s, wzr, l)
// storeqi only supports 32-bit immediates.
Y(storeqi, store, ldimmq, Immed64(i.s.l()), wzr, q)
#undef Y
void lower(const VLS& e, cloadq& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto const scratch = v.makeReg();
lowerVptr(i.t, v);
v << load{i.t, scratch};
v << cmovq{i.cc, i.sf, i.f, scratch, i.d};
});
}
void lower(const VLS& e, loadqp& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto const scratch = v.makeReg();
v << leap{i.s, scratch};
v << load{scratch[0], i.d};
});
}
void lower(const VLS& e, loadqd& i, Vlabel b, size_t z) {
lower_impl(e.unit, b, z, [&] (Vout& v) {
auto const scratch = v.makeReg();
v << lead{i.s.getRaw(), scratch};
v << load{scratch[0], i.d};
});
}
///////////////////////////////////////////////////////////////////////////////
void lowerForARM(Vunit& unit) {
vasm_lower(unit, [&] (const VLS& env, Vinstr& inst, Vlabel b, size_t i) {
switch (inst.op) {
#define O(name, ...) \
case Vinstr::name: \
lower(env, inst.name##_, b, i); \
break;
VASM_OPCODES
#undef O
}
});
}
///////////////////////////////////////////////////////////////////////////////
}
void optimizeARM(Vunit& unit, const Abi& abi, bool regalloc) {
Timer timer(Timer::vasm_optimize);
removeTrivialNops(unit);
optimizePhis(unit);
fuseBranches(unit);
optimizeJmps(unit, false);
assertx(checkWidths(unit));
simplify(unit);
annotateSFUses(unit);
lowerForARM(unit);
simplify(unit);
if (!unit.constToReg.empty()) {
foldImms<arm::ImmFolder>(unit);
}
reuseImmq(unit);
optimizeCopies(unit, abi);
annotateSFUses(unit);
if (unit.needsRegAlloc()) {
removeDeadCode(unit);
if (regalloc) {
splitCriticalEdges(unit);
if (RuntimeOption::EvalUseGraphColor &&
unit.context &&
(unit.context->kind == TransKind::Optimize ||
unit.context->kind == TransKind::OptPrologue)) {
allocateRegistersWithGraphColor(unit, abi);
} else {
allocateRegistersWithXLS(unit, abi);
}
}
}
optimizeExits(unit);
optimizeJmps(unit, true);
}
void emitARM(Vunit& unit, Vtext& text, CGMeta& fixups,
AsmInfo* asmInfo) {
vasm_emit<Vgen>(unit, text, fixups, asmInfo);
}
///////////////////////////////////////////////////////////////////////////////
}