hphp/runtime/vm/jit/vasm-x64.cpp (1,027 lines of code) (raw):

/* +----------------------------------------------------------------------+ | HipHop for PHP | +----------------------------------------------------------------------+ | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com) | +----------------------------------------------------------------------+ | This source file is subject to version 3.01 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | http://www.php.net/license/3_01.txt | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@php.net so we can mail you a copy immediately. | +----------------------------------------------------------------------+ */ #include "hphp/runtime/vm/jit/vasm-emit.h" #include "hphp/runtime/base/runtime-option.h" #include "hphp/runtime/base/tracing.h" #include "hphp/runtime/vm/jit/abi-x64.h" #include "hphp/runtime/vm/jit/block.h" #include "hphp/runtime/vm/jit/code-gen-helpers.h" #include "hphp/runtime/vm/jit/print.h" #include "hphp/runtime/vm/jit/prof-data.h" #include "hphp/runtime/vm/jit/service-requests.h" #include "hphp/runtime/vm/jit/smashable-instr-x64.h" #include "hphp/runtime/vm/jit/target-cache.h" #include "hphp/runtime/vm/jit/timer.h" #include "hphp/runtime/vm/jit/vasm.h" #include "hphp/runtime/vm/jit/vasm-block-counters.h" #include "hphp/runtime/vm/jit/vasm-instr.h" #include "hphp/runtime/vm/jit/vasm-internal.h" #include "hphp/runtime/vm/jit/vasm-lower.h" #include "hphp/runtime/vm/jit/vasm-print.h" #include "hphp/runtime/vm/jit/vasm-prof.h" #include "hphp/runtime/vm/jit/vasm-unit.h" #include "hphp/runtime/vm/jit/vasm-util.h" #include "hphp/runtime/vm/jit/vasm-visit.h" #include <algorithm> #include <tuple> TRACE_SET_MOD(vasm); namespace HPHP::jit { /////////////////////////////////////////////////////////////////////////////// using namespace reg; using namespace x64; 
namespace x64 { struct ImmFolder; }

namespace {
///////////////////////////////////////////////////////////////////////////////

static_assert(folly::kIsLittleEndian,
  "Code contains little-endian specific optimizations.");

/*
 * Vgen: the vasm -> x64 machine-code emitter.
 *
 * One emit() overload exists per vasm opcode that reaches this backend;
 * each emits the corresponding x64 instruction(s) into env.cb via the
 * assembler `a`. Label patches for in-unit jumps/jccs and catch blocks are
 * accumulated in the Venv vectors and resolved later by patch().
 */
template<class X64Asm>
struct Vgen {
  explicit Vgen(Venv& env)
    : env(env)
    , a(*env.cb)
    , current(env.current)
    , next(env.next)
    , jmps(env.jmps)
    , jccs(env.jccs)
    , catches(env.catches)
  {}

  // x64 needs no veneers or literal pools at end-of-emission; these are
  // no-ops required by the generic Venv interface.
  static void emitVeneers(Venv& env) {}
  static void handleLiterals(Venv& env) {}
  static void retargetBinds(Venv& env);
  static void patch(Venv& env);
  static void pad(CodeBlock& cb);

  /////////////////////////////////////////////////////////////////////////////

  // Fallback: reaching this overload means a vasm opcode has no x64
  // lowering/emitter — always a bug.
  template<class Inst> void emit(const Inst& i) {
    always_assert_flog(false, "unimplemented instruction: {} in B{}\n",
                       vinst_names[Vinstr(i).op], size_t(current));
  }

  // intrinsics
  void emit(const prefetch& i) { a.prefetch(i.m.mr()); }
  void emit(const copy& i);
  void emit(const copy2& i);
  void emit(const debugtrap& /*i*/) { a.int3(); }
  void emit(const fallthru&);
  void emit(const killeffects& /*i*/) {}
  void emit(const ldimmb& i);
  void emit(const ldimml& i);
  void emit(const ldimmq& i);
  void emit(const ldundefq& /*i*/) {}
  void emit(const load& i);
  void emit(const store& i);
  void emit(const mcprep& i);

  // native function abi
  void emit(const call& i);
  void emit(const callm& i) { a.prefix(i.target.mr()).call(i.target); }
  void emit(const callr& i) { a.call(i.target); }
  void emit(const calls& i);
  void emit(const ret& /*i*/) { a.ret(); }

  // stub function abi
  void emit(const stubret& i);
  void emit(const callstub& i);
  void emit(const callfaststub& i);
  void emit(const tailcallstub& i);
  void emit(const tailcallstubr& i);

  // php function abi
  void emit(const callphp& i) {
    emit(call{i.target, i.args});
    // Record the return address of the call for func-id mapping.
    setCallFuncId(env, a.frontier());
  }
  void emit(const callphpr& i) {
    emit(callr{i.target, i.args});
    setCallFuncId(env, a.frontier());
  }
  void emit(const phpret& i);
  void emit(const contenter& i);

  // vm entry abi
  void emit(const inittc& /*i*/) {}
  void emit(const leavetc&) { a.ret(); }

  // exceptions
  void emit(const landingpad& /*i*/) {}
  void emit(const nothrow& i);
  void emit(const syncpoint& i);
  void emit(const unwind& i);

  // instructions
  // absdbl: shift the sign bit out and back in to clear it.
  void emit(absdbl i) { unary(i); a.psllq(1, i.d); a.psrlq(1, i.d); }
  void emit(andb i) { commuteSF(i); a.andb(i.s0, i.d); }
  void emit(andbi i) { binary(i); a.andb(i.s0, i.d); }
  void emit(const andbim& i) { a.prefix(i.m.mr()).andb(i.s, i.m); }
  void emit(andw i) { commuteSF(i); a.andw(i.s0, i.d); }
  void emit(andwi i) { binary(i); a.andw(i.s0, i.d); }
  void emit(andl i) { commuteSF(i); a.andl(i.s0, i.d); }
  void emit(andli i) { binary(i); a.andl(i.s0, i.d); }
  void emit(andq i) { commuteSF(i); a.andq(i.s0, i.d); }
  void emit(andqi i);
  void emit(const addwm& i) { a.prefix(i.m.mr()).addw(i.s0, i.m); }
  void emit(addl i) { commuteSF(i); a.addl(i.s0, i.d); }
  void emit(addli i) { binary(i); a.addl(i.s0, i.d); }
  void emit(const addlm& i) { a.prefix(i.m.mr()).addl(i.s0, i.m); }
  void emit(const addlim& i);
  void emit(addq i) { commuteSF(i); a.addq(i.s0, i.d); }
  void emit(addqi i) { binary(i); a.addq(i.s0, i.d); }
  void emit(const addqmr& i);
  void emit(const addqrm& i);
  void emit(const addqim& i);
  void emit(addsd i) { commute(i); a.addsd(i.s0, i.d); }
  void emit(const btrq& i) { binary(i); a.btrq(i.s0, i.d); }
  void emit(const cloadq& i);
  template<class cmov> void emit_cmov(const cmov& i);
  void emit(const cmovb& i) { emit_cmov(i); }
  void emit(const cmovw& i) { emit_cmov(i); }
  void emit(const cmovl& i) { emit_cmov(i); }
  void emit(const cmovq& i) { emit_cmov(i); }
  void emit(const cmpb& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbi& i) { a.cmpb(i.s0, i.s1); }
  void emit(const cmpbim& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
  void emit(const cmpbm& i) { a.prefix(i.s1.mr()).cmpb(i.s0, i.s1); }
  void emit(const cmpw& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwi& i) { a.cmpw(i.s0, i.s1); }
  void emit(const cmpwim& i) { a.prefix(i.s1.mr()).cmpw(i.s0, i.s1); }
  void emit(const cmpwm& i) { a.prefix(i.s1.mr()).cmpw(i.s0, i.s1); }
  void emit(const cmpl& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmpli& i) { a.cmpl(i.s0, i.s1); }
  void emit(const cmplim& i) { a.prefix(i.s1.mr()).cmpl(i.s0, i.s1); }
  void emit(const cmplm& i) { a.prefix(i.s1.mr()).cmpl(i.s0, i.s1); }
  void emit(const cmpq& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqi& i) { a.cmpq(i.s0, i.s1); }
  void emit(const cmpqim& i) { a.prefix(i.s1.mr()).cmpq(i.s0, i.s1); }
  void emit(const cmpqm& i) { a.prefix(i.s1.mr()).cmpq(i.s0, i.s1); }
  void emit(cmpsd i) { noncommute(i); a.cmpsd(i.s0, i.d, i.pred); }
  void emit(const cqo& /*i*/) { a.cqo(); }
  void emit(const cvttsd2siq& i) { a.cvttsd2siq(i.s, i.d); }
  void emit(const cvtsi2sd& i);
  void emit(const cvtsi2sdm& i);
  void emit(decl i) { unary(i); a.decl(i.d); }
  void emit(const declm& i) { a.prefix(i.m.mr()).decl(i.m); }
  void emit(decq i) { unary(i); a.decq(i.d); }
  void emit(const decqm& i) { a.prefix(i.m.mr()).decq(i.m); }
  void emit(const decqmlock& i) { a.prefix(i.m.mr()).decqlock(i.m); }
  void emit(const decqmlocknosf&);
  void emit(divsd i) { noncommute(i); a.divsd(i.s0, i.d); }
  void emit(imul i) { commuteSF(i); a.imul(i.s0, i.d); }
  void emit(const idiv& i) { a.idiv(i.s); }
  void emit(incl i) { unary(i); a.incl(i.d); }
  void emit(const inclm& i) { a.prefix(i.m.mr()).incl(i.m); }
  void emit(incq i) { unary(i); a.incq(i.d); }
  void emit(const incqm& i) { a.prefix(i.m.mr()).incq(i.m); }
  void emit(const incwm& i) { a.prefix(i.m.mr()).incw(i.m); }
  void emit(const jcc& i);
  void emit(const jcci& i);
  void emit(const jmp& i);
  void emit(const jmpr& i) { a.jmp(i.target); }
  void emit(const jmpm& i) { a.prefix(i.target.mr()).jmp(i.target); }
  void emit(const jmpi& i);
  void emit(const ldbindretaddr& i);
  void emit(const lea& i);
  void emit(const leap& i) { a.lea(i.s, i.d); }
  void emit(const leav& i);
  void emit(const lead& i) { a.lea(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadups& i) { a.prefix(i.s.mr()).movups(i.s, i.d); }
  void emit(const loadtqb& i) { a.prefix(i.s.mr()).loadb(i.s, i.d); }
  void emit(const loadb& i) { a.prefix(i.s.mr()).loadb(i.s, i.d); }
  void emit(const loadw& i) { a.prefix(i.s.mr()).loadw(i.s, i.d); }
  void emit(const loadtql& i) { a.prefix(i.s.mr()).loadl(i.s, i.d); }
  void emit(const loadl& i) { a.prefix(i.s.mr()).loadl(i.s, i.d); }
  void emit(const loadqp& i) { a.loadq(i.s, i.d); }
  void emit(const loadqd& i) { a.loadq(rip[(intptr_t)i.s.get()], i.d); }
  void emit(const loadsd& i) { a.prefix(i.s.mr()).movsd(i.s, i.d); }
  void emit(const loadzbl& i) { a.prefix(i.s.mr()).loadzbl(i.s, i.d); }
  // Writing the low 32 bits zeroes the upper half of a 64-bit register,
  // so zero-extending loads to a quad use the 32-bit form.
  void emit(const loadzbq& i) { a.prefix(i.s.mr()).loadzbl(i.s, Reg32(i.d)); }
  void emit(const loadsbq& i) { a.prefix(i.s.mr()).loadsbq(i.s, i.d); }
  void emit(const loadzwq& i) { a.prefix(i.s.mr()).loadzwl(i.s, Reg32(i.d)); }
  void emit(const loadzlq& i) { a.prefix(i.s.mr()).loadl(i.s, Reg32(i.d)); }
  void emit(const movb& i) { a.movb(i.s, i.d); }
  void emit(const movl& i) { a.movl(i.s, i.d); }
  void emit(const movzbw& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzbl& i) { a.movzbl(i.s, i.d); }
  void emit(const movzbq& i) { a.movzbl(i.s, Reg32(i.d)); }
  void emit(const movzwl& i) { a.movzwl(i.s, i.d); }
  void emit(const movzwq& i) { a.movzwl(i.s, Reg32(i.d)); }
  void emit(const movzlq& i) { a.movl(i.s, Reg32(i.d)); }
  void emit(const movsbq& i) { a.movsbq(i.s, i.d); }
  void emit(mulsd i) { commute(i); a.mulsd(i.s0, i.d); }
  void emit(neg i) { unary(i); a.neg(i.d); }
  void emit(const nop& /*i*/) { a.nop(); }
  void emit(not i) { unary(i); a.not(i.d); }
  void emit(notb i) { unary(i); a.notb(i.d); }
  void emit(orbi i) { binary(i); a.orb(i.s0, i.d); }
  void emit(const orbim& i) { a.prefix(i.m.mr()).orb(i.s0, i.m); }
  void emit(const orwim& i) { a.prefix(i.m.mr()).orw(i.s0, i.m); }
  void emit(const orlim& i) { a.prefix(i.m.mr()).orl(i.s0, i.m); }
  void emit(orq i) { commuteSF(i); a.orq(i.s0, i.d); }
  void emit(orwi i) { binary(i); a.orw(i.s0, i.d); }
  void emit(orli i) { binary(i); a.orl(i.s0, i.d); }
  void emit(orqi i) { binary(i); a.orq(i.s0, i.d); }
  void emit(const orqim& i) { a.prefix(i.m.mr()).orq(i.s0, i.m); }
  void emit(const pop& i) { a.pop(i.d); }
  void emit(const popm& i) { a.prefix(i.d.mr()).pop(i.d); }
  void emit(const popf& i) { assertx(i.d == RegSF{0}); a.popf(); }
  void emit(const push& i) { a.push(i.s); }
  void emit(const pushm& i) { a.prefix(i.s.mr()).push(i.s); }
  void emit(const pushf& i) { assertx(i.s == RegSF{0}); a.pushf(); }
  void emit(const roundsd& i) { a.roundsd(i.dir, i.s, i.d); }
  void emit(const sarq& i) { unary(i); a.sarq(i.d); }
  void emit(sarqi i) { binary(i); a.sarq(i.s0, i.d); }
  void emit(const setcc& i) { a.setcc(i.cc, i.d); }
  void emit(shlli i) { binary(i); a.shll(i.s0, i.d); }
  void emit(shlq i) { unary(i); a.shlq(i.d); }
  void emit(shrq i) { unary(i); a.shrq(i.d); }
  void emit(shlqi i) { binary(i); a.shlq(i.s0, i.d); }
  void emit(shrli i) { binary(i); a.shrl(i.s0, i.d); }
  void emit(shrqi i) { binary(i); a.shrq(i.s0, i.d); }
  void emit(const sqrtsd& i) { a.sqrtsd(i.s, i.d); }
  void emit(const storeups& i) { a.prefix(i.m.mr()).movups(i.s, i.m); }
  void emit(const storeb& i) { a.prefix(i.m.mr()).storeb(i.s, i.m); }
  void emit(const storebi& i);
  void emit(const storel& i) { a.prefix(i.m.mr()).storel(i.s, i.m); }
  void emit(const storeli& i) { a.prefix(i.m.mr()).storel(i.s, i.m); }
  void emit(const storeqi& i);
  void emit(const storesd& i) { a.prefix(i.m.mr()).movsd(i.s, i.m); }
  void emit(const storew& i) { a.prefix(i.m.mr()).storew(i.s, i.m); }
  void emit(const storewi& i) { a.prefix(i.m.mr()).storew(i.s, i.m); }
  void emit(subl i) { noncommute(i); a.subl(i.s0, i.d); }
  void emit(subli i) { binary(i); a.subl(i.s0, i.d); }
  void emit(subq i) { noncommute(i); a.subq(i.s0, i.d); }
  void emit(subqi i) { binary(i); a.subq(i.s0, i.d); }
  void emit(const subqim& i);
  void emit(subsd i) { noncommute(i); a.subsd(i.s0, i.d); }
  void emit(const testb& i) { a.testb(i.s0, i.s1); }
  void emit(const testbi& i) { a.testb(i.s0, i.s1); }
  void emit(const testbm& i) { a.prefix(i.s1.mr()).testb(i.s0, i.s1); }
  void emit(const testbim& i) { a.prefix(i.s1.mr()).testb(i.s0, i.s1); }
  void emit(const testw& i) { a.testw(i.s0, i.s1); }
  void emit(const testwi& i);
  void emit(const testwm& i) { a.prefix(i.s1.mr()).testw(i.s0, i.s1); }
  void emit(const testwim& i);
  void emit(const testl& i) { a.testl(i.s0, i.s1); }
  void emit(const testli& i);
  void emit(const testlm& i) { a.prefix(i.s1.mr()).testl(i.s0, i.s1); }
  void emit(const testlim& i);
  void emit(const testq& i) { a.testq(i.s0, i.s1); }
  void emit(const testqi& i);
  void emit(const testqm& i) { a.prefix(i.s1.mr()).testq(i.s0, i.s1); }
  void emit(const testqim& i);
  void emit(const trap& i);
  void emit(const ucomisd& i) { a.ucomisd(i.s0, i.s1); }
  void emit(unpcklpd i) { noncommute(i); a.unpcklpd(i.s0, i.d); }
  void emit(xorb i) { commuteSF(i); a.xorb(i.s0, i.d); }
  void emit(xorbi i) { binary(i); a.xorb(i.s0, i.d); }
  void emit(xorw i) { commuteSF(i); a.xorw(i.s0, i.d); }
  void emit(xorwi i) { binary(i); a.xorw(i.s0, i.d); }
  void emit(xorl i) { commuteSF(i); a.xorl(i.s0, i.d); }
  void emit(xorq i);
  void emit(xorqi i) { binary(i); a.xorq(i.s0, i.d); }
  void emit(const conjure& /*i*/) { always_assert(false); }
  void emit(const conjureuse& /*i*/) { always_assert(false); }
  void emit(const crc32q& i);
  // A two-instruction no-op pair (lea +8 / lea -8) that leaves rax unchanged.
  void emit_nop() {
    emit(lea{rax[8], rax});
    emit(lea{rax[-8], rax});
  }

private:
  // helpers: move s into d if they differ, using the width-appropriate mov.
  void prep(Reg8 s, Reg8 d) { if (s != d) a.movb(s, d); }
  void prep(Reg16 s, Reg16 d) { if (s != d) a.movw(s, d); }
  void prep(Reg32 s, Reg32 d) { if (s != d) a.movl(s, d); }
  void prep(Reg64 s, Reg64 d) { if (s != d) a.movq(s, d); }
  void prep(RegXMM s, RegXMM d) { if (s != d) a.movdqa(s, d); }
  void emit_simd_imm(int64_t, Vreg);
  // unary:  put s into d, then operate on d in place.
  template<class Inst> void unary(Inst& i) { prep(i.s, i.d); }
  // binary: put s1 into d, then apply s0 to d.
  template<class Inst> void binary(Inst& i) { prep(i.s1, i.d); }
  template<class Inst> void commuteSF(Inst&);
  template<class Inst> void commute(Inst&);
  template<class Inst> void noncommute(Inst&);
  CodeBlock& frozen() { return env.text.frozen().code; }

private:
  Venv& env;
  X64Asm a;
const Vlabel current;                        // block currently being emitted
  const Vlabel next;                         // next block in layout order
  jit::vector<Venv::LabelPatch>& jmps;       // in-unit jmps to patch later
  jit::vector<Venv::LabelPatch>& jccs;       // in-unit jccs to patch later
  jit::vector<Venv::LabelPatch>& catches;    // catch-block records
};

///////////////////////////////////////////////////////////////////////////////

/*
 * Prepare a binary op that is not commutative.
 *
 * s0 must be a different register than s1 so we don't clobber it.
 */
template<class X64Asm>
template<class Inst>
void Vgen<X64Asm>::noncommute(Inst& i) {
  assertx(i.s1 == i.d || i.s0 != i.d); // do not clobber s0
  binary(i);
}

/*
 * Prepare a binary op that is commutative.
 *
 * Swap operands if the dest is s0.
 */
template<class X64Asm>
template<class Inst>
void Vgen<X64Asm>::commuteSF(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d, i.sf};
  } else {
    binary(i);
  }
}

// Same as commuteSF, for instructions with no status-flags field.
template<class X64Asm>
template<class Inst>
void Vgen<X64Asm>::commute(Inst& i) {
  if (i.s1 != i.d && i.s0 == i.d) {
    i = Inst{i.s1, i.s0, i.d};
  } else {
    binary(i);
  }
}

///////////////////////////////////////////////////////////////////////////////

/*
 * Returns true iff the status flags necessary to take a j<a> imply that a j<b>
 * will also be taken.
 */
bool ccImplies(ConditionCode a, ConditionCode b) {
  if (a == b) return true;

  switch (a) {
    case CC_None:
    case CC_O: case CC_NO:
    case CC_AE: case CC_BE:
    case CC_NE:
    case CC_S: case CC_NS:
    case CC_P: case CC_NP:
    case CC_GE: case CC_LE:
      return false;

    case CC_B: return b == CC_BE;
    case CC_E: return b == CC_BE || b == CC_LE;
    case CC_A: return b == CC_AE || b == CC_NE;
    case CC_L: return b == CC_LE;
    case CC_G: return b == CC_NE || b == CC_GE;
  }
  always_assert(false);
}

/*
 * When two jccs go to the same destination, the cc of the first is compatible
 * with the cc of the second, and they're within a one-byte offset of each
 * other, retarget the first to jump to the second. This will allow the
 * relocator to shrink the first one, and the extra jmp shouldn't matter since
 * we try to only do this to rarely taken jumps.
 */
template<typename Key, typename Hash>
jit::hash_set<TCA> retargetJumps(
  Venv& env,
  const jit::hash_map<Key, jit::vector<TCA>, Hash>& jccs
) {
  jit::hash_set<TCA> retargeted;
  for (auto& pair : jccs) {
    auto const& jmps = pair.second;
    if (jmps.size() < 2) continue;

    for (size_t i = 0; i < jmps.size(); ++i) {
      DecodedInstruction di(env.text.toDestAddress(jmps[i]), jmps[i]);
      // Don't bother if the jump is already a short jump.
      if (di.size() != 6) continue;

      // Prefer the furthest-away candidate that still fits in a byte.
      for (size_t j = jmps.size() - 1; j > i; --j) {
        auto const delta = jmps[j] - jmps[i] + 2;
        // Backwards jumps are probably not guards, and don't retarget to a
        // dest that's more than a one-byte offset away.
        if (delta < 0 || !deltaFits(delta, sz::byte)) continue;

        DecodedInstruction dj(env.text.toDestAddress(jmps[j]), jmps[j]);
        if (!ccImplies(di.jccCondCode(), dj.jccCondCode())) continue;

        di.setPicAddress(jmps[j]);
        retargeted.insert(jmps[i]);

        // We might've converted a smashable jump to a regular in-unit jump, so
        // remove any smashable alignments.
        auto range = env.meta.alignments.equal_range(jmps[i]);
        while (range.first != range.second) {
          auto iter = range.first;
          ++range.first;

          auto& align = iter->second;
          if (align.first == Alignment::SmashJcc &&
              align.second == AlignContext::Live) {
            env.meta.alignments.erase(iter);
          }
        }
        break;
      }
    }
  }
  return retargeted;
}

namespace {
// Hash functor allowing (SrcKey, fallback-flag) tuples to key a hash_map.
struct SrcKeyBoolTupleHasher {
  size_t operator()(std::tuple<SrcKey, bool> v) const {
    return folly::hash::hash_combine(
      std::get<0>(v).toAtomicInt(),
      std::get<1>(v)
    );
  }
};
}

/*
 * Group smashable JCC binds by (SrcKey, fallback) and retarget compatible
 * ones at each other via retargetJumps(), then drop the retargeted entries
 * from the unit's smashable-branch bookkeeping.
 */
template<class X64Asm>
void Vgen<X64Asm>::retargetBinds(Venv& env) {
  if (RuntimeOption::EvalJitRetargetJumps < 1) return;

  // The target is unique per the SrcKey and the fallback flag.
  jit::hash_map<
    std::pair<SrcKey, bool>,
    jit::vector<TCA>,
    SrcKeyBoolTupleHasher
  > binds;

  for (auto const& b : env.meta.smashableBinds) {
    if (b.smashable.type() == IncomingBranch::Tag::JCC) {
      binds[std::make_pair(b.sk, b.fallback)]
        .emplace_back(b.smashable.toSmash());
    }
  }

  auto const retargeted = retargetJumps(env, std::move(binds));
  if (retargeted.empty()) return;

  // Finally, remove any retargeted jmps from inProgressTailJumps and
  // smashableBinds.
  GrowableVector<IncomingBranch> newTailJumps;
  for (auto& jmp : env.meta.inProgressTailJumps) {
    if (retargeted.count(jmp.toSmash()) == 0) {
      newTailJumps.push_back(jmp);
    }
  }
  env.meta.inProgressTailJumps.swap(newTailJumps);

  decltype(env.meta.smashableBinds) newBinds;
  for (auto& bind : env.meta.smashableBinds) {
    if (retargeted.count(bind.smashable.toSmash()) == 0) {
      newBinds.push_back(bind);
    } else {
      FTRACE(3, "retargetBinds: removed {} from smashableBinds\n",
             bind.smashable.toSmash());
    }
  }
  env.meta.smashableBinds.swap(newBinds);
}

/*
 * Resolve all recorded label patches now that every block's address is
 * known: fix up in-unit jmps and jccs, optionally retarget compatible jccs
 * (opt level >= 2), and fill in rip-relative lea targets.
 */
template<class X64Asm>
void Vgen<X64Asm>::patch(Venv& env) {
  for (auto const& p : env.jmps) {
    assertx(env.addrs[p.target]);
    X64Asm::patchJmp(
      env.text.toDestAddress(p.instr), p.instr, env.addrs[p.target]);
  }

  auto const optLevel = RuntimeOption::EvalJitRetargetJumps;
  jit::hash_map<TCA, jit::vector<TCA>> jccs;
  for (auto const& p : env.jccs) {
    assertx(env.addrs[p.target]);
    X64Asm::patchJcc(
      env.text.toDestAddress(p.instr), p.instr, env.addrs[p.target]);
    if (optLevel >= 2) {
      jccs[env.addrs[p.target]].emplace_back(p.instr);
    }
  }

  if (!jccs.empty()) retargetJumps(env, jccs);

  for (auto const& p : env.leas) {
    assertx(env.vaddrs[p.target]);
    DecodedInstruction di(env.text.toDestAddress(p.instr), p.instr);
    assertx(di.hasPicOffset());
    di.setPicAddress(env.vaddrs[p.target]);
  }
}

// Fill the remainder of the code block with padding bytes.
template<class X64Asm>
void Vgen<X64Asm>::pad(CodeBlock& cb) {
  X64Asm a { cb };
  a.pad();
}

///////////////////////////////////////////////////////////////////////////////

// Register-to-register copy; picks the mov variant from the GP/SIMD
// classes of source and destination. (Body continues on the next line.)
template<class X64Asm>
void Vgen<X64Asm>::emit(const copy& i) {
  if
(i.s == i.d) return;
  if (i.s.isGP()) {
    if (i.d.isGP()) {                 // GP => GP
      a.movq(i.s, i.d);
    } else {                          // GP => XMM
      assertx(i.d.isSIMD());
      // This generates a movq x86 instruction, which zero extends
      // the 64-bit value in srcReg into a 128-bit XMM register
      a.movq_rx(i.s, i.d);
    }
  } else {
    if (i.d.isGP()) {                 // XMM => GP
      a.movq_xr(i.s, i.d);
    } else {                          // XMM => XMM
      assertx(i.d.isSIMD());
      // This copies all 128 bits in XMM,
      // thus avoiding partial register stalls
      a.movdqa(i.s, i.d);
    }
  }
}

/*
 * Parallel two-register copy. Handles overlap between sources and dests,
 * including the full swap case (d0==s1 && d1==s0) via xchgq.
 */
template<class X64Asm>
void Vgen<X64Asm>::emit(const copy2& i) {
  assertx(i.s0.isValid() && i.s1.isValid() && i.d0.isValid() && i.d1.isValid());
  auto s0 = i.s0, s1 = i.s1, d0 = i.d0, d1 = i.d1;
  assertx(d0 != d1);
  if (d0 == s1) {
    if (d1 == s0) {
      a.xchgq(d0, d1);
    } else {
      // could do this in a simplify pass
      if (s1 != d1) a.movq(s1, d1); // save s1 first; d1 != s0
      if (s0 != d0) a.movq(s0, d0);
    }
  } else {
    // could do this in a simplify pass
    if (s0 != d0) a.movq(s0, d0);
    if (s1 != d1) a.movq(s1, d1);
  }
}

// Materialize an immediate into a SIMD register: pxor for zero, otherwise
// load the value from a literal allocated in the data section.
template<class X64Asm>
void Vgen<X64Asm>::emit_simd_imm(int64_t val, Vreg d) {
  if (val == 0) {
    a.pxor(d, d); // does not modify flags
  } else {
    auto addr = alloc_literal(env, val);
    a.movsd(rip[(intptr_t)addr], d);
  }
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimmb& i) {
  // ldimmb is for Vconst::Byte, which is treated as unsigned uint8_t
  auto val = i.s.ub();
  if (i.d.isGP()) {
    Vreg8 d8 = i.d;
    a.movb(static_cast<int8_t>(val), d8);
  } else {
    emit_simd_imm(val, i.d);
  }
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimml& i) {
  // ldimml is for Vconst::Long, which is treated as unsigned uint32_t
  auto val = i.s.l();
  if (i.d.isGP()) {
    Vreg32 d32 = i.d;
    a.movl(val, d32);
  } else {
    emit_simd_imm(uint32_t(val), i.d);
  }
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const ldimmq& i) {
  auto val = i.s.q();
  if (i.d.isGP()) {
    if (val == 0) {
      Vreg32 d32 = i.d;
      a.movl(0, d32); // because emitImmReg tries the xor optimization
    } else {
      a.emitImmReg(i.s, i.d);
    }
  } else {
    emit_simd_imm(val, i.d);
  }
}

// 64-bit load into either a GP (loadq) or SIMD (movsd) register.
template<class X64Asm>
void Vgen<X64Asm>::emit(const load& i) {
  auto mref = i.s.mr();
  a.prefix(mref);
  if (i.d.isGP()) {
    a.loadq(mref, i.d);
  } else {
    assertx(i.d.isSIMD());
    a.movsd(mref, i.d);
  }
}

// 64-bit store from either a GP (storeq) or SIMD (movsd) register.
template<class X64Asm>
void Vgen<X64Asm>::emit(const store& i) {
  auto const mref = i.d.mr();
  a.prefix(mref);
  if (i.s.isGP()) {
    a.storeq(i.s, i.d);
  } else {
    assertx(i.s.isSIMD());
    a.movsd(i.s, i.d);
  }
}

///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const mcprep& i) {
  /*
   * Initially, we set the cache to hold (addr << 1) | 1 (where `addr' is the
   * address of the movq) so that we can find the movq from the handler.
   *
   * We set the low bit for two reasons: the Class* will never be a valid
   * Class*, so we'll always miss the inline check before it's smashed, and
   * MethodCache::handleStaticCall can tell it's not been smashed yet
   */
  auto const mov_addr = emitSmashableMovq(a.code(), env.meta, 0, r64(i.d));
  auto const imm = reinterpret_cast<uint64_t>(mov_addr);
  smashMovq(a.toDestAddress(mov_addr), (imm << 1) | 1);

  env.meta.addressImmediates.insert(reinterpret_cast<TCA>(~imm));
}

///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const call& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.call(i.target);
  } else {
    // can't do a near call; store address in data section.
    // call by loading the address using rip-relative addressing. This
    // assumes the data section is near the current code section. Since
    // this sequence is directly in-line, rip-relative like this is
    // more compact than loading a 64-bit immediate.
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.call(rip[(intptr_t)addr]);
  }
  if (i.watch) {
    // Record the return address for the caller to watch.
    *i.watch = a.frontier();
    env.meta.watchpoints.push_back(i.watch);
  }
}

// Smashable call: emitted so the target can be patched atomically later.
template<class X64Asm>
void Vgen<X64Asm>::emit(const calls& i) {
  emitSmashableCall(a.code(), env.meta, i.target);
}

///////////////////////////////////////////////////////////////////////////////

// Return from a stub: either restore vmfp from the stack or just discard
// the saved slot, then ret.
template<class X64Asm>
void Vgen<X64Asm>::emit(const stubret& i) {
  if (i.saveframe) {
    a.pop(x64::rvmfp());
  } else {
    a.addq(8, reg::rsp);
  }
  a.ret();
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const callstub& i) {
  emit(call{i.target, i.args});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const callfaststub& i) {
  emit(call{i.target, i.args});
}

// Tail-call out of a stub: drop the stub's return address, then jump.
template<class X64Asm>
void Vgen<X64Asm>::emit(const tailcallstub& i) {
  a.addq(8, reg::rsp);
  emit(jmpi{i.target, i.args});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const tailcallstubr& i) {
  a.addq(8, reg::rsp);
  emit(jmpr{i.target, i.args});
}

///////////////////////////////////////////////////////////////////////////////

// PHP-level return: push the saved return address from the ActRec, restore
// the caller's frame pointer unless noframe, then ret.
template<class X64Asm>
void Vgen<X64Asm>::emit(const phpret& i) {
  a.push(i.fp[AROFF(m_savedRip)]);
  if (!i.noframe) {
    a.loadq(i.fp[AROFF(m_sfp)], x64::rvmfp());
  }
  a.ret();
}

/*
 * Continuation enter: call a local stub so the call's return address lands
 * in the ActRec's m_savedRip slot, then jump to the target.
 */
template<class X64Asm>
void Vgen<X64Asm>::emit(const contenter& i) {
  Label Stub, End;
  Reg64 fp = i.fp, target = i.target;
  a.jmp8(End);

  asm_label(a, Stub);
  a.pop(fp[AROFF(m_savedRip)]);
  a.jmp(target);

  asm_label(a, End);
  a.call(Stub); // m_savedRip will point here.
emit(unwind{{i.targets[0], i.targets[1]}});
}

///////////////////////////////////////////////////////////////////////////////

// Mark this point as non-throwing (null catch block).
template<class X64Asm>
void Vgen<X64Asm>::emit(const nothrow& /*i*/) {
  env.meta.catches.emplace_back(a.frontier(), nullptr);
}

// Record a fixup (VM sync point) at the current frontier.
template<class X64Asm>
void Vgen<X64Asm>::emit(const syncpoint& i) {
  FTRACE(5, "IR recordSyncPoint: {} {}\n", a.frontier(), i.fix.show());
  env.meta.fixups.emplace_back(a.frontier(), i.fix);
  env.record_inline_stack(a.frontier());
}

// Record the catch target for this point, then jump to the normal successor.
template<class X64Asm>
void Vgen<X64Asm>::emit(const unwind& i) {
  catches.push_back({a.frontier(), i.targets[1]});
  env.record_inline_stack(a.frontier());
  emit(jmp{i.targets[0]});
}

///////////////////////////////////////////////////////////////////////////////

template<class X64Asm>
void Vgen<X64Asm>::emit(const fallthru&) {
  a.nop();
}

///////////////////////////////////////////////////////////////////////////////

// Narrow a 64-bit and-immediate to 32 bits when the immediate fits; the
// 32-bit op zeroes the upper bits, giving the same result more compactly.
template<class X64Asm>
void Vgen<X64Asm>::emit(andqi i) {
  if (magFits(i.s0.q(), sz::dword)) {
    emit(andli{int32_t(i.s0.q()), Reg32(i.s1), Reg32(i.d), i.sf});
    return;
  }

  binary(i);
  a.andq(i.s0, i.d);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const addlim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).addl(i.s0, mref);
}

template<typename X64Asm>
void Vgen<X64Asm>::emit(const addqmr& i) {
  binary(i);
  auto const mref = i.m.mr();
  a.prefix(mref).addq(mref, i.d);
}

template<typename X64Asm>
void Vgen<X64Asm>::emit(const addqrm& i) {
  auto const mref = i.m.mr();
  a.prefix(mref).addq(i.s1, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const addqim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).addq(i.s0, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const subqim& i) {
  auto mref = i.m.mr();
  a.prefix(mref).subq(i.s0, mref);
}

// Conditional load: d = cc ? [t] : f.
template<class X64Asm>
void Vgen<X64Asm>::emit(const cloadq& i) {
  auto m = i.t;
  always_assert(!m.index.isValid()); // not supported, but could be later.
  if (i.f != i.d) {
    if (i.d == m.base) {
      // We can't move f over d or we'll clobber the Vptr we need to load from.
      // Since cload does the load unconditionally anyway, we can just load and
      // cmov.
      a.prefix(m.mr()).loadq(i.t, i.d);
      a.cmov_reg64_reg64(ccNegate(i.cc), i.f, i.d);
      return;
    }
    a.movq(i.f, i.d);
  }
  a.prefix(m.mr()).cload_reg64_disp_reg64(i.cc, m.base, m.disp, i.d);
}

// add s0 s1 d => mov s1->d; d += s0
// cmov cc s d => if cc { mov s->d }
template<class X64Asm>
template<class cmov>
void Vgen<X64Asm>::emit_cmov(const cmov& i) {
  if (i.f != i.d && i.t == i.d) {
    // negate the condition and swap t/f operands so we dont clobber i.t
    return emit(cmov{ccNegate(i.cc), i.sf, i.t, i.f, i.d});
  } else {
    prep(i.f, i.d);
  }
  a.cmov_reg64_reg64(i.cc, r64(i.t), r64(i.d));
}

// pxor first to break the false dependency cvtsi2sd has on the
// destination's previous contents — TODO confirm that is the intent.
template<class X64Asm>
void Vgen<X64Asm>::emit(const cvtsi2sd& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const cvtsi2sdm& i) {
  a.pxor(i.d, i.d);
  a.cvtsi2sd(i.s, i.d);
}

// Conditional jump: if the fall-through block is the taken target, negate
// the condition so the common path falls through; record a patch for the
// taken edge and then emit the jmp to the not-taken edge.
template<class X64Asm>
void Vgen<X64Asm>::emit(const jcc& i) {
  if (i.targets[1] != i.targets[0]) {
    if (next == i.targets[1]) {
      return emit(jcc{ccNegate(i.cc), i.sf, {i.targets[1], i.targets[0]}});
    }
    auto taken = i.targets[1];
    jccs.push_back({a.frontier(), taken});
    a.jcc(i.cc, a.frontier()); // placeholder target, patched later
  }
  emit(jmp{i.targets[0]});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const jcci& i) {
  a.jcc(i.cc, i.taken);
}

// Unconditional in-unit jump; elided when the target is the next block.
template<class X64Asm>
void Vgen<X64Asm>::emit(const jmp& i) {
  if (next == i.target) return;
  jmps.push_back({a.frontier(), i.target});
  a.jmp(a.frontier()); // placeholder target, patched later
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const jmpi& i) {
  if (a.jmpDeltaFits(i.target)) {
    a.jmp(i.target);
  } else {
    // can't do a near jmp - use rip-relative addressing
    auto addr = alloc_literal(env, (uint64_t)i.target);
    a.jmp(rip[(intptr_t)addr]);
  }
}

// Load the address of this instruction (to be bound later) into i.d.
template<class X64Asm>
void Vgen<X64Asm>::emit(const ldbindretaddr& i) {
  auto const addr = a.frontier();
  emit(leap{reg::rip[(intptr_t)addr], i.d});
  env.ldbindretaddrs.push_back({addr, i.target, i.spOff});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const lea& i) {
  assertx(i.s.seg == Segment::DS);
  // could do this in a simplify pass
  if (i.s.disp == 0 && i.s.base.isValid() && !i.s.index.isValid()) {
    emit(copy{i.s.base, i.d});
  } else {
    a.lea(i.s, i.d);
  }
}

// lea of a Vaddr: emit a rip-relative lea and record it for patching once
// the target address is known.
template<class X64Asm>
void Vgen<X64Asm>::emit(const leav& i) {
  auto const addr = a.frontier();
  emit(leap{reg::rip[(intptr_t)addr], i.d});
  env.leas.push_back({addr, i.s});
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const storebi& i) {
  auto mref = i.m.mr();
  a.prefix(mref).storeb(i.s, mref);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const storeqi& i) {
  auto mref = i.m.mr();
  a.prefix(mref).storeq(i.s, mref);
}

template<class VgenImpl, typename Inst>
bool testimHelper(VgenImpl& env, const Inst& i, uint64_t mask) {
  // If there's only 1 byte of meaningful bits in the mask, we can adjust the
  // pointer offset and use testbim instead.
  int off = 0;
  while (mask > 0xff && !(mask & 0xff)) {
    off++;
    mask >>= 8;
  }

  if (mask > 0xff) return false;

  env.emit(testbim{int8_t(mask), i.s1 + off, i.sf});
  return true;
}

// test with an all-ones immediate is equivalent to test reg,reg.
template<class X64Asm>
void Vgen<X64Asm>::emit(const testwi& i) {
  if (i.s0.w() == -1) {
    return emit(testw{i.s1, i.s1, i.sf});
  }
  a.testw(i.s0, i.s1);
}

template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testwim& i) {
  if (testimHelper(*this, i, i.s0.w())) return;
  a.prefix(i.s1.mr()).testw(i.s0, i.s1);
}

template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testlim& i) {
  if (testimHelper(*this, i, i.s0.l())) return;
  a.prefix(i.s1.mr()).testl(i.s0, i.s1);
}

template<class X64Asm>
void Vgen<X64Asm>::Vgen::emit(const testli& i) {
  if (i.s0.l() == -1) {
    return emit(testl{i.s1, i.s1, i.sf});
  }
  a.testl(i.s0, i.s1);
}

// Narrow a 64-bit test-immediate to the smallest equivalent form.
template<class X64Asm>
void Vgen<X64Asm>::emit(const testqi& i) {
  auto const imm = i.s0.q();
  if (magFits(imm, sz::byte)) {
    a.testb(int8_t(imm), rbyte(i.s1));
  } else if (magFits(imm, sz::dword)) {
    emit(testli{int32_t(imm), Reg32(i.s1), i.sf});
  } else if (imm == -1) {
    emit(testq{i.s1, i.s1, i.sf});
  } else {
    a.testq(i.s0, i.s1);
  }
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const testqim& i) {
  if (testimHelper(*this, i, i.s0.q())) return;
  if (magFits(i.s0.q(), sz::dword)) {
    // For an unsigned 32 bit immediate, we can get the same results
    // by emitting a testlim.
    emit(testlim{int32_t(i.s0.q()), i.s1, i.sf});
  } else {
    a.prefix(i.s1.mr()).testq(i.s0, i.s1);
  }
}

// Record the trap reason at this address, then emit ud2.
template<class X64Asm>
void Vgen<X64Asm>::emit(const trap& i) {
  env.meta.trapReasons.emplace_back(a.frontier(), i.reason);
  a.ud2();
}

template<class X64Asm>
void Vgen<X64Asm>::emit(xorq i) {
  if (i.s0 == i.s1) {
    // 32-bit xor{s, s, d} zeroes the upper bits of `d'.
    return emit(xorl{r32(i.s0), r32(i.s1), r32(i.d), i.sf});
  }
  commuteSF(i);
  a.xorq(i.s0, i.d);
}

template<class X64Asm>
void Vgen<X64Asm>::emit(const crc32q& i) {
  noncommute(i);
  a.crc32q(i.s0, i.d);
}

// Locked decrement that must not clobber status flags: save/restore RFLAGS
// around the decqlock.
template<typename X64Asm>
void Vgen<X64Asm>::emit(const decqmlocknosf& i) {
  a.pushf();
  a.prefix(i.m.mr()).decqlock(i.m);
  a.popf();
}

///////////////////////////////////////////////////////////////////////////////

// Replace instruction i of block b with the sequence produced by `lower`.
template<typename Lower>
void lower_impl(Vunit& unit, Vlabel b, size_t i, Lower lower) {
  vmodify(unit, b, i, [&] (Vout& v) { lower(v); return 1; });
}

// Default: instructions with no x64-specific lowering are left as-is.
template <typename Inst>
void lower(Vunit& /*unit*/, Inst& /*inst*/, Vlabel /*b*/, size_t /*i*/) {}

///////////////////////////////////////////////////////////////////////////////

void lower(Vunit& unit, popp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pop{inst.d0};
    v << pop{inst.d1};
  });
}

void lower(Vunit& unit, poppm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << popm{inst.d0};
    v << popm{inst.d1};
  });
}

void lower(Vunit& unit, pushp& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << push{inst.s0};
    v << push{inst.s1};
  });
}

void lower(Vunit& unit, pushpm& inst, Vlabel b, size_t i) {
  lower_impl(unit, b, i, [&] (Vout& v) {
    v << pushm{inst.s0};
    v <<
pushm{inst.s1}; }); } /////////////////////////////////////////////////////////////////////////////// void lower(Vunit& unit, stublogue& inst, Vlabel b, size_t i) { if (inst.saveframe) { unit.blocks[b].code[i] = push{x64::rvmfp()}; } else { unit.blocks[b].code[i] = lea{reg::rsp[-8], reg::rsp}; } } void lower(Vunit& unit, unstublogue& /*inst*/, Vlabel b, size_t i) { unit.blocks[b].code[i] = lea{reg::rsp[8], reg::rsp}; } void lower(Vunit& unit, stubunwind& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << lea{reg::rsp[8], reg::rsp}; v << pop{inst.d}; }); } void lower(Vunit& unit, stubtophp& /*inst*/, Vlabel b, size_t i) { unit.blocks[b].code[i] = lea{reg::rsp[16], reg::rsp}; } void lower(Vunit& unit, loadstubret& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = load{reg::rsp[8], inst.d}; } void lower(Vunit& unit, phplogue& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = popm{inst.fp[AROFF(m_savedRip)]}; } void lower(Vunit& unit, resumetc& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << callr{inst.target, inst.args}; v << jmpi{inst.exittc}; }); } /////////////////////////////////////////////////////////////////////////////// void lower(Vunit& unit, sar& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << copy{inst.s0, rcx}; v << sarq{inst.s1, inst.d, inst.sf}; }); } void lower(Vunit& unit, shl& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << copy{inst.s0, rcx}; v << shlq{inst.s1, inst.d, inst.sf}; }); } void lower(Vunit& unit, shr& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << copy{inst.s0, rcx}; v << shrq{inst.s1, inst.d, inst.sf}; }); } void lower(Vunit& unit, srem& inst, Vlabel b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << copy{inst.s0, rax}; v << cqo{}; // sign-extend rax => rdx:rax v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx v << copy{rdx, inst.d}; }); } void lower(Vunit& unit, divint& inst, Vlabel 
b, size_t i) { lower_impl(unit, b, i, [&] (Vout& v) { v << copy{inst.s0, rax}; v << cqo{}; // sign-extend rax => rdx:rax v << idiv{inst.s1, v.makeReg()}; // rdx:rax/divisor => quot:rax, rem:rdx v << copy{rax, inst.d}; }); } /////////////////////////////////////////////////////////////////////////////// void lower(Vunit& unit, movtqb& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = copy{inst.s, inst.d}; } void lower(Vunit& unit, movtdb& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = copy{inst.s, inst.d}; } void lower(Vunit& unit, movtdq& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = copy{inst.s, inst.d}; } void lower(Vunit& unit, movtqw& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = copy{inst.s, inst.d}; } void lower(Vunit& unit, movtql& inst, Vlabel b, size_t i) { unit.blocks[b].code[i] = copy{inst.s, inst.d}; } /////////////////////////////////////////////////////////////////////////////// /* * Lower a few abstractions to facilitate straightforward x64 codegen. */ void lowerForX64(Vunit& unit) { vasm_lower(unit, [&](const VLS& /*env*/, Vinstr& inst, Vlabel b, size_t i) { switch (inst.op) { #define O(name, ...) 
\ case Vinstr::name: \ lower(unit, inst.name##_, b, i); \ break; VASM_OPCODES #undef O } }); } /////////////////////////////////////////////////////////////////////////////// } void optimizeX64(Vunit& unit, const Abi& abi, bool regalloc) { Timer timer(Timer::vasm_optimize, unit.log_entry); tracing::Block _{ "vasm-optimize", [&] { return traceProps(unit).add("reg_alloc", regalloc); } }; auto const doPass = [&] (const char* name, auto fun) { rqtrace::EventGuard trace{name}; fun(unit); }; doPass("VOPT_DCE", removeDeadCode); doPass("VOPT_PHI", optimizePhis); doPass("VOPT_BRANCH", fuseBranches); doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u, false); }); assertx(checkWidths(unit)); if (unit.context && unit.context->kind == TransKind::Optimize && RuntimeOption::EvalProfBranchSampleFreq > 0) { // Even when branch profiling is on, we still only want to profile // non-profiling translations of PHP functions. We also require that we // can spill, so that we can generate arbitrary profiling code, and also to // ensure we don't profile unique stubs and such. doPass("VOPT_PROF_BRANCH", profile_branches); } doPass("VOPT_X64", lowerForX64); doPass("VOPT_SIMPLIFY", simplify); doPass("VOPT_X64", lowerForX64); if (!unit.constToReg.empty()) { doPass("VOPT_FOLD_IMM", foldImms<x64::ImmFolder>); } doPass("VOPT_COPY", [&] (Vunit& u) { optimizeCopies(u, abi); }); doPass("VOPT_DCE", removeDeadCode); doPass("VOPT_BRANCH", fuseBranches); if (unit.needsRegAlloc()) { doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u, false); }); doPass("VOPT_DCE", removeDeadCode); if (regalloc) { // vasm-block-counts and register allocation require edges to // be pre-split. 
splitCriticalEdges(unit); doPass("VOPT_BLOCK_WEIGHTS", VasmBlockCounters::profileGuidedUpdate); if (RuntimeOption::EvalUseGraphColor && unit.context && (unit.context->kind == TransKind::Optimize || unit.context->kind == TransKind::OptPrologue)) { rqtrace::EventGuard trace{"VOPT_GRAPH_COLOR"}; allocateRegistersWithGraphColor(unit, abi); } else { rqtrace::EventGuard trace{"VOPT_XLS"}; allocateRegistersWithXLS(unit, abi); } doPass("VOPT_SF_PEEPHOLES", [&] (Vunit& u) { sfPeepholes(u, abi); }); doPass("VOPT_POST_RA_SIMPLIFY", postRASimplify); } } // We can add side-exiting instructions now doPass("VOPT_EXIT", optimizeExits); doPass("VOPT_JMP", [] (Vunit& u) { optimizeJmps(u, true); }); } void emitX64(Vunit& unit, Vtext& text, CGMeta& fixups, AsmInfo* asmInfo) { tracing::Block _{"emit-X64", [&] { return traceProps(unit); }}; #ifdef HAVE_LIBXED if (RuntimeOption::EvalUseXedAssembler) { return vasm_emit<Vgen<XedAssembler>>(unit, text, fixups, asmInfo); } #endif vasm_emit<Vgen<X64Assembler>>(unit, text, fixups, asmInfo); } /////////////////////////////////////////////////////////////////////////////// }