// libredex/IRList.h

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <boost/intrusive/list.hpp>
#include <boost/optional.hpp>
#include <boost/range/sub_range.hpp>
#include <functional>
#include <iosfwd>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "Debug.h"

class DexCallSite;
class DexDebugInstruction;
class DexFieldRef;
class DexInstruction;
class DexMethodHandle;
class DexMethodRef;
struct DexPosition;
class DexString;
class DexType;
class IRCode;
class IRInstruction;
struct MethodItemEntry;

using reg_t = uint32_t;

namespace opcode {
enum Branchingness : uint8_t;
} // namespace opcode

// Marks the two endpoints of a try region in the instruction list.
enum TryEntryType {
  TRY_START = 0,
  TRY_END = 1,
};

std::string show(TryEntryType t);

struct TryEntry {
  TryEntryType type;
  // First catch block associated with this try region; never null (asserted
  // in the constructor).
  MethodItemEntry* catch_start;
  TryEntry(TryEntryType type, MethodItemEntry* catch_start)
      : type(type), catch_start(catch_start) {
    always_assert(catch_start != nullptr);
  }

  bool operator==(const TryEntry& other) const;
};

struct CatchEntry {
  DexType* catch_type;
  MethodItemEntry* next; // always null for catchall

  explicit CatchEntry(DexType* catch_type)
      : catch_type(catch_type), next(nullptr) {}

  bool operator==(const CatchEntry& other) const;
};

/**
 * A SwitchIndices represents the set of int values matching a packed switch
 * case. It could be the only value matching one case. There could also be a
 * set of values matching a switch case.
 */
using SwitchIndices = std::set<int>;

// Predicate used by structural comparisons to decide whether two
// instructions should be considered equal.
using InstructionEquality =
    std::function<bool(const IRInstruction&, const IRInstruction&)>;

/*
 * Multi is where an opcode encodes more than
 * one branch end-point. This is for packed
 * and sparse switch. The index is only relevant
 * for multi-branch encodings. The target is
 * implicit in the flow, where the target is from
 * i.e. what has to be re-written is what is recorded
 * in IRInstruction*.
 */
enum BranchTargetType {
  BRANCH_SIMPLE = 0,
  BRANCH_MULTI = 1,
};

struct BranchTarget {
  // The branch instruction (MIE) this target belongs to.
  MethodItemEntry* src;
  BranchTargetType type;

  // The key that a value must match to take this case in a switch statement.
  // Only meaningful when type == BRANCH_MULTI.
  int32_t case_key;

  BranchTarget() = default;
  explicit BranchTarget(MethodItemEntry* src) : src(src), type(BRANCH_SIMPLE) {}
  BranchTarget(MethodItemEntry* src, int32_t case_key)
      : src(src), type(BRANCH_MULTI), case_key(case_key) {}

  bool operator==(const BranchTarget& other) const;
};

/**
 * A SourceBlock refers to a method and block ID that the following code came
 * from. It also has a float payload at the moment (though that is in flow),
 * which will be used for profiling information.
 */
struct SourceBlock {
  const DexString* src{nullptr};
  // Intrusive singly-linked chain of further SourceBlocks attached to the
  // same position in the IR (see append/get_last_in_chain).
  std::unique_ptr<SourceBlock> next;
  // Large methods exist, but a 32-bit integer is safe.
  // Use the maximum for things that we do not want to emit at this point.
  static constexpr uint32_t kSyntheticId = std::numeric_limits<uint32_t>::max();
  uint32_t id{0};
  // Float has enough precision.
  //
  // An optional-like (val, appear100) pair. "No value" is encoded as NaN:
  // quiet_NaN() compares unequal to itself, so `m_val.val == m_val.val` is
  // false exactly when the value is the kNoneVal sentinel.
  class Val {
    static constexpr float kNoneVal = std::numeric_limits<float>::quiet_NaN();

   public:
    Val() {}
    constexpr Val(float v, float a) noexcept : m_val({v, a}) {}
    // Constructs the "empty" value (both fields NaN).
    static constexpr Val none() { return Val(kNoneVal, kNoneVal); }

    // True iff this Val holds a value (i.e., val is not the NaN sentinel).
    // NOLINTNEXTLINE
    /* implicit */ operator bool() const { return m_val.val == m_val.val; };

    // Two Vals are equal if both hold the same payload, or both are "none"
    // (NaN != NaN, so the second clause detects the empty state on each side).
    bool operator==(const Val& other) const {
      return (m_val.val == other.m_val.val &&
              m_val.appear100 == other.m_val.appear100) ||
             (m_val.val != m_val.val && other.m_val.val != other.m_val.val);
    }

    // To access like an `optional`.
    struct ValPair {
      float val;
      float appear100;
    };

    // Dereference accessors assert that a value is present.
    ValPair& operator*() {
      redex_assert(operator bool());
      return m_val;
    }
    const ValPair& operator*() const {
      redex_assert(operator bool());
      return m_val;
    }
    ValPair* operator->() {
      redex_assert(operator bool());
      return &m_val;
    }
    const ValPair* operator->() const {
      redex_assert(operator bool());
      return &m_val;
    }

   private:
    ValPair m_val;
  };

  // Fixed-size array of Vals; size is set once at construction.
  const uint32_t vals_size{0};
  const std::unique_ptr<Val[]> vals;

  SourceBlock() = default;
  SourceBlock(const DexString* src, size_t id) : src(src), id(id) {}
  SourceBlock(const DexString* src, size_t id, const std::vector<Val>& v)
      : src(src),
        id(id),
        vals_size(v.size()),
        vals(clone_vals(v.data(), v.size())) {}
  // Deep copy: clones both the `next` chain and the vals array.
  SourceBlock(const SourceBlock& other)
      : src(other.src),
        next(other.next == nullptr ? nullptr : new SourceBlock(*other.next)),
        id(other.id),
        vals_size(other.vals_size),
        vals(clone_vals(other.vals.get(), other.vals_size)) {}

  // Returns the i-th value, or boost::none if that slot is empty.
  // NOTE(review): `i` is not range-checked against vals_size here.
  boost::optional<float> get_val(size_t i) const {
    return vals[i] ? boost::optional<float>(vals[i]->val) : boost::none;
  }
  boost::optional<float> get_appear100(size_t i) const {
    return vals[i] ? boost::optional<float>(vals[i]->appear100) : boost::none;
  }

  // Copies `vals_size` Vals into a freshly allocated array.
  static std::unique_ptr<Val[]> clone_vals(const Val* vals, size_t vals_size) {
    auto res = std::make_unique<Val[]>(vals_size);
    for (size_t i = 0; i < vals_size; i++) {
      res[i] = vals[i];
    }
    return res;
  }

  // Invokes fn on every Val in order.
  template <typename Fn>
  void foreach_val(const Fn& fn) const {
    for (size_t i = 0; i < vals_size; i++) {
      auto& val = vals[i];
      fn(val);
    }
  }

  // Like foreach_val, but stops early and returns true as soon as fn returns
  // true; returns false if fn returned false for all Vals.
  template <typename Fn>
  bool foreach_val_early(const Fn& fn) const {
    for (size_t i = 0; i < vals_size; i++) {
      auto& val = vals[i];
      if (fn(val)) {
        return true;
      }
    }
    return false;
  }

  // Structural equality of this node only (does not compare `next` chains).
  bool operator==(const SourceBlock& other) const {
    if (src != other.src || id != other.id || vals_size != other.vals_size) {
      return false;
    }
    for (size_t i = 0; i < vals_size; i++) {
      if (vals[i] != other.vals[i]) {
        return false;
      }
    }
    return true;
  }

  // Appends `sb` (and whatever it links to) at the end of this chain,
  // taking ownership.
  void append(std::unique_ptr<SourceBlock> sb) {
    SourceBlock* last = this;
    while (last->next != nullptr) {
      last = last->next.get();
    }
    last->next = std::move(sb);
  }

  // Returns the final SourceBlock of the `next` chain (possibly `this`).
  SourceBlock* get_last_in_chain() {
    auto* last = this;
    while (last->next != nullptr) {
      last = last->next.get();
    }
    return last;
  }
  const SourceBlock* get_last_in_chain() const {
    auto* last = this;
    while (last->next != nullptr) {
      last = last->next.get();
    }
    return last;
  }

  std::string show(bool quoted_src = false) const;
};
// Keep SourceBlock compact on 64-bit platforms (it is allocated per block).
static_assert(sizeof(void*) != 8 || sizeof(SourceBlock) == 32);

/*
 * MethodItemEntry (and the IRLists that it gets linked into) is a data
 * structure of DEX methods that is easier to modify than DexMethod.
 *
 * For example, when inserting a new instruction into a DexMethod, one needs
 * to recalculate branch offsets, try-catch regions, and debug info. None of
 * that is necessary when inserting into an IRList; it gets done when the
 * IRList gets translated back into a DexMethod by IRCode::sync().
 */

enum MethodItemType {
  // Begins or ends a try region. Points to the first associated catch block
  MFLOW_TRY,

  // Found at the beginning of an exception handler block. Points to the next
  // catch block (in case this one does not match)
  MFLOW_CATCH,

  // The actual instructions
  MFLOW_OPCODE,
  MFLOW_DEX_OPCODE,

  // The target of a goto, if, or switch. Also known as a "label"
  MFLOW_TARGET,

  // These hold information about the following `MFLOW_(DEX_)OPCODE`s
  MFLOW_DEBUG,
  MFLOW_POSITION,

  // This holds information about the source block.
  MFLOW_SOURCE_BLOCK,

  // A no-op
  MFLOW_FALLTHROUGH,
};

std::ostream& operator<<(std::ostream&, const MethodItemType& type);

// One node of an IRList. The active union member is selected by `type`;
// construction and destruction of the union payload are managed manually
// (see the constructors below and the out-of-line destructor).
struct MethodItemEntry {
  // Hook for boost::intrusive::list membership (see IRList below).
  boost::intrusive::list_member_hook<> list_hook_;
  MethodItemType type;

  union {
    TryEntry* tentry{nullptr};
    CatchEntry* centry;
    IRInstruction* insn;
    // dex_insn should only ever be used by the instruction lowering / output
    // code. Do NOT use it in passes!
    DexInstruction* dex_insn;
    BranchTarget* target;
    std::unique_ptr<DexDebugInstruction> dbgop;
    std::unique_ptr<DexPosition> pos;
    std::unique_ptr<SourceBlock> src_block;
  };

  MethodItemEntry(const MethodItemEntry&);
  explicit MethodItemEntry(DexInstruction* dex_insn) {
    this->type = MFLOW_DEX_OPCODE;
    this->dex_insn = dex_insn;
  }
  explicit MethodItemEntry(IRInstruction* insn) {
    this->type = MFLOW_OPCODE;
    this->insn = insn;
  }
  MethodItemEntry(TryEntryType try_type, MethodItemEntry* catch_start)
      : type(MFLOW_TRY), tentry(new TryEntry(try_type, catch_start)) {}
  explicit MethodItemEntry(DexType* catch_type)
      : type(MFLOW_CATCH), centry(new CatchEntry(catch_type)) {}
  explicit MethodItemEntry(BranchTarget* bt) {
    this->type = MFLOW_TARGET;
    this->target = bt;
  }
  explicit MethodItemEntry(std::unique_ptr<DexDebugInstruction> dbgop);
  explicit MethodItemEntry(std::unique_ptr<DexPosition> pos);
  explicit MethodItemEntry(std::unique_ptr<SourceBlock> src_block);

  bool operator==(const MethodItemEntry&) const;

  bool operator!=(const MethodItemEntry& that) const {
    return !(*this == that);
  }

  // Default-constructed entries are no-ops.
  MethodItemEntry() : type(MFLOW_FALLTHROUGH) {}
  ~MethodItemEntry();

  /*
   * This should only ever be used by the instruction lowering step. Do NOT use
   * it in passes!
   */
  void replace_ir_with_dex(DexInstruction* dex_insn);

  // Collect references held by this entry into the given accumulators.
  void gather_strings(std::vector<const DexString*>& lstring) const;
  void gather_types(std::vector<DexType*>& ltype) const;
  void gather_init_classes(std::vector<DexType*>& ltype) const;
  void gather_fields(std::vector<DexFieldRef*>& lfield) const;
  void gather_methods(std::vector<DexMethodRef*>& lmethod) const;
  void gather_callsites(std::vector<DexCallSite*>& lcallsite) const;
  void gather_methodhandles(std::vector<DexMethodHandle*>& lmethodhandle) const;

  opcode::Branchingness branchingness() const;
};

// Clones MethodItemEntry graphs while preserving internal cross-references
// (branch targets, catch chains, position parents).
class MethodItemEntryCloner {
  // We need a map of MethodItemEntry we have created because a branch
  // points to another MethodItemEntry which may have been created or not
  std::unordered_map<const MethodItemEntry*, MethodItemEntry*> m_entry_map;
  // for remapping the parent position pointers
  std::unordered_map<DexPosition*, DexPosition*> m_pos_map;
  std::vector<DexPosition*> m_positions_to_fix;

 public:
  MethodItemEntryCloner();
  MethodItemEntry* clone(const MethodItemEntry* mie);

  /*
   * This should be called after the whole method is already cloned so that
   * m_pos_map has all the positions in the method.
   *
   * Don't change any parent pointers that point to `ignore_pos`. This is used
   * for inlining because the invoke position is the parent but it isn't in the
   * callee. If you don't have any positions to ignore, nullptr is a safe
   * default.
   */
  void fix_parent_positions(const DexPosition* ignore_pos = nullptr);
};

using MethodItemMemberListOption =
    boost::intrusive::member_hook<MethodItemEntry,
                                  boost::intrusive::list_member_hook<>,
                                  &MethodItemEntry::list_hook_>;

// Intrusive doubly-linked list of MethodItemEntry. The list owns its
// entries; disposal helpers delete them.
class IRList {
 private:
  using IntrusiveList =
      boost::intrusive::list<MethodItemEntry, MethodItemMemberListOption>;

  IntrusiveList m_list;
  void remove_branch_targets(IRInstruction* branch_inst);

  // Deleter used with boost::intrusive's *_and_dispose operations.
  static void disposer(MethodItemEntry* mie) { delete mie; }

 public:
  using iterator = IntrusiveList::iterator;
  using const_iterator = IntrusiveList::const_iterator;
  using reverse_iterator = IntrusiveList::reverse_iterator;
  using const_reverse_iterator = IntrusiveList::const_reverse_iterator;
  using difference_type = IntrusiveList::difference_type;

  IRList::iterator main_block();
  IRList::iterator make_if_block(const IRList::iterator& cur,
                                 IRInstruction* insn,
                                 IRList::iterator* false_block);
  IRList::iterator make_if_else_block(const IRList::iterator& cur,
                                      IRInstruction* insn,
                                      IRList::iterator* false_block,
                                      IRList::iterator* true_block);
  IRList::iterator make_switch_block(
      const IRList::iterator& cur,
      IRInstruction* insn,
      IRList::iterator* default_block,
      std::map<SwitchIndices, IRList::iterator>& cases);

  size_t size() const { return m_list.size(); }
  bool empty() const { return m_list.empty(); }

  /*
   * Removes a subset of MFLOW_DEBUG instructions.
   */
  void cleanup_debug();

  /*
   * Removes a subset of MFLOW_DEBUG instructions. valid_regs
   * is an accumulator set of registers used by either DBG_START_LOCAL
   * or DBG_START_LOCAL_EXTENDED. The DBG_END_LOCAL and DBG_RESTART_LOCAL
   * instructions are erased, unless valid_regs contains the registers they
   * use.
   */
  void cleanup_debug(std::unordered_set<reg_t>& valid_regs);

  /* DEPRECATED! Use the version below that passes in the iterator instead,
   * which is O(1) instead of O(n). */
  /* Passes memory ownership of "from" to callee. It will delete it. */
  void replace_opcode(IRInstruction* from, IRInstruction* to);

  /* DEPRECATED! Use the version below that passes in the iterator instead,
   * which is O(1) instead of O(n). */
  /* Passes memory ownership of "from" to callee. It will delete it. */
  void replace_opcode(IRInstruction* to_delete,
                      const std::vector<IRInstruction*>& replacements);

  /* Passes memory ownership of "from" to callee. It will delete it. */
  void replace_opcode(const IRList::iterator& it,
                      const std::vector<IRInstruction*>& replacements);

  /*
   * Does exactly what it says and you SHOULD be afraid. This is mainly useful
   * to appease the compiler in various scenarios of unreachable code.
   */
  void replace_opcode_with_infinite_loop(IRInstruction* from);

  /* Like replace_opcode, but both :from and :to must be branch opcodes.
   * :to will end up jumping to the same destination as :from. */
  void replace_branch(IRInstruction* from, IRInstruction* to);

  /*
   * Find the subrange of load-param instructions. These instructions should
   * always be at the beginning of the method.
   */
  boost::sub_range<IRList> get_param_instructions();

  bool structural_equals(const IRList& other,
                         const InstructionEquality& instruction_equals) const;

  /* Passes memory ownership of "mie" to callee. */
  void push_back(MethodItemEntry& mie) { m_list.push_back(mie); }

  /* Passes memory ownership of "mie" to callee. */
  void push_front(MethodItemEntry& mie) { m_list.push_front(mie); }

  /*
   * Insert after instruction :position.
   *
   * position = nullptr means we insert at the head
   *
   * If position is an instruction that has a move-result-pseudo suffix, we
   * will do the insertion after the move-result-pseudo.
   */
  void insert_after(IRInstruction* position,
                    const std::vector<IRInstruction*>& opcodes);

  IRList::iterator insert_before(const IRList::iterator& position,
                                 MethodItemEntry&);

  IRList::iterator insert_after(const IRList::iterator& position,
                                MethodItemEntry&);

  // Convenience overloads: construct the MethodItemEntry in place from the
  // forwarded arguments; the list takes ownership of the new entry.
  template <class... Args>
  IRList::iterator insert_before(const IRList::iterator& position,
                                 Args&&... args) {
    return insert_before(position,
                         *(new MethodItemEntry(std::forward<Args>(args)...)));
  }

  template <class... Args>
  IRList::iterator insert_after(const IRList::iterator& position,
                                Args&&... args) {
    always_assert(position != this->end());
    return insert_after(position,
                        *(new MethodItemEntry(std::forward<Args>(args)...)));
  }

  /* DEPRECATED! Use the version below that passes in the iterator instead,
   * which is O(1) instead of O(n). */
  /* Memory ownership of "insn" passes to callee, it will delete it. */
  void remove_opcode(IRInstruction* insn);

  /*
   * Remove the instruction that :it points to.
   *
   * If :it points to an instruction that has a move-result-pseudo suffix, we
   * remove both that instruction and the move-result-pseudo that follows.
   */
  void remove_opcode(const IRList::iterator& it);

  /*
   * Returns an estimate of the number of 2-byte code units needed to encode
   * all the instructions.
   */
  size_t sum_opcode_sizes() const;

  /*
   * Returns the number of instructions.
   */
  size_t count_opcodes() const;

  // transfer all of `other` into `this` starting at `pos`
  // memory ownership is also transferred
  void splice(IRList::const_iterator pos, IRList& other) {
    m_list.splice(pos, other.m_list);
  }

  // transfer `other[begin]` to `other[end]` into `this` starting at `pos`
  // memory ownership is also transferred
  void splice_selection(IRList::const_iterator pos,
                        IRList& other,
                        IRList::const_iterator begin,
                        IRList::const_iterator end) {
    m_list.splice(pos, other.m_list, begin, end);
  }

  // Erases and deletes every entry matching `predicate`.
  template <typename Predicate>
  void remove_and_dispose_if(Predicate predicate) {
    m_list.remove_and_dispose_if(predicate, disposer);
  }

  void sanity_check() const;

  IRList::iterator begin() { return m_list.begin(); }
  IRList::iterator end() { return m_list.end(); }
  IRList::const_iterator begin() const { return m_list.begin(); }
  IRList::const_iterator end() const { return m_list.end(); }
  IRList::const_iterator cbegin() const { return m_list.cbegin(); }
  IRList::const_iterator cend() const { return m_list.cend(); }
  IRList::reverse_iterator rbegin() { return m_list.rbegin(); }
  IRList::reverse_iterator rend() { return m_list.rend(); }
  IRList::const_reverse_iterator rbegin() const { return m_list.rbegin(); }
  IRList::const_reverse_iterator rend() const { return m_list.rend(); }

  // Collect references held by all entries into the given accumulators.
  void gather_catch_types(std::vector<DexType*>& ltype) const;
  void gather_strings(std::vector<const DexString*>& lstring) const;
  void gather_types(std::vector<DexType*>& ltype) const;
  void gather_init_classes(std::vector<DexType*>& ltype) const;
  void gather_fields(std::vector<DexFieldRef*>& lfield) const;
  void gather_methods(std::vector<DexMethodRef*>& lmethod) const;
  void gather_callsites(std::vector<DexCallSite*>& lcallsite) const;
  void gather_methodhandles(std::vector<DexMethodHandle*>& lmethodhandle) const;

  // `erase` unlinks without deleting; `erase_and_dispose` also deletes.
  IRList::iterator erase(const IRList::iterator& it) {
    return m_list.erase(it);
  }
  IRList::iterator erase_and_dispose(const IRList::iterator& it) {
    return m_list.erase_and_dispose(it, disposer);
  }
  IRList::iterator insn_erase_and_dispose(const IRList::iterator& it);

  void clear_and_dispose() { m_list.clear_and_dispose(disposer); }
  void insn_clear_and_dispose();

  // O(1) conversion from an entry reference back to its list iterator
  // (possible because the list is intrusive).
  IRList::iterator iterator_to(MethodItemEntry& mie) {
    return m_list.iterator_to(mie);
  }

  IRList::const_iterator iterator_to(const MethodItemEntry& mie) const {
    return m_list.iterator_to(mie);
  }

  friend std::string show(const IRCode*);
};

std::string show(const IRList*);

namespace ir_list {

// Bidirectional iterator over only the MFLOW_OPCODE entries of an IRList;
// all other entry kinds are skipped. `is_const` selects const-ness.
template <bool is_const>
class InstructionIteratorImpl {
  using Iterator = typename std::
      conditional<is_const, IRList::const_iterator, IRList::iterator>::type;
  using Mie = typename std::
      conditional<is_const, const MethodItemEntry, MethodItemEntry>::type;

 private:
  Iterator m_it;
  Iterator m_end;

  /*
   * If m_it doesn't point to an MIE of type MFLOW_OPCODE, increment it until
   * it does. Otherwise do nothing.
   */
  void to_next_instruction() {
    while (m_it != m_end && m_it->type != MFLOW_OPCODE) {
      ++m_it;
    }
  }

  /*
   * If m_it doesn't point to an MIE of type MFLOW_OPCODE, decrement it until
   * it does. Otherwise do nothing.
   */
  void to_prev_instruction() {
    while (m_it->type != MFLOW_OPCODE) {
      --m_it;
    }
  }

 public:
  using difference_type = long;
  using value_type = Mie&;
  using pointer = Mie*;
  using reference = Mie&;
  using iterator_category = std::bidirectional_iterator_tag;

  InstructionIteratorImpl() {}
  InstructionIteratorImpl(Iterator it, Iterator end) : m_it(it), m_end(end) {
    to_next_instruction();
  }

  // Implicit conversion from the non-const to the const iterator flavor.
  InstructionIteratorImpl(const InstructionIteratorImpl<false>& rhs)
      : m_it(rhs.m_it), m_end(rhs.m_end) {}

  InstructionIteratorImpl& operator=(const InstructionIteratorImpl& other) =
      default;

  InstructionIteratorImpl& operator++() {
    ++m_it;
    to_next_instruction();
    return *this;
  }

  InstructionIteratorImpl operator++(int) {
    auto rv = *this;
    ++(*this);
    return rv;
  }

  InstructionIteratorImpl& operator--() {
    --m_it;
    to_prev_instruction();
    return *this;
  }

  InstructionIteratorImpl operator--(int) {
    auto rv = *this;
    --(*this);
    return rv;
  }

  reference operator*() const { return *m_it; }

  pointer operator->() const { return &*m_it; }

  bool operator==(const InstructionIteratorImpl& ii) const {
    return m_it == ii.m_it;
  }

  bool operator!=(const InstructionIteratorImpl& ii) const {
    return !(m_it == ii.m_it);
  }

  // Returns the underlying IRList iterator.
  Iterator unwrap() const { return m_it; }

  // Re-points this iterator at `it`, skipping forward to the next opcode.
  void reset(Iterator it) {
    m_it = it;
    to_next_instruction();
  }

  template <bool kConst>
  friend class InstructionIteratorImpl;
};

// Range adaptor exposing begin()/end() InstructionIteratorImpl over a
// container of MethodItemEntry (e.g. an IRList).
template <bool is_const>
class InstructionIterableImpl {
 protected:
  using Iterator = typename std::
      conditional<is_const, IRList::const_iterator, IRList::iterator>::type;

  Iterator m_begin;
  Iterator m_end;

  // Only callable by ConstInstructionIterable. If this were public, we may try
  // to bind const iterators to non-const members
  template <class T>
  InstructionIterableImpl(const T& mentry_list,
                          // we add this unused parameter so we don't
                          // accidentally resolve to this constructor when we
                          // meant to call the non-const version
                          bool /* unused */)
      : m_begin(mentry_list.begin()), m_end(mentry_list.end()) {}

 public:
  template <class T>
  explicit InstructionIterableImpl(T& mentry_list)
      : m_begin(mentry_list.begin()), m_end(mentry_list.end()) {}

  template <typename T>
  explicit InstructionIterableImpl(T* mentry_list)
      : InstructionIterableImpl(*mentry_list) {}

  InstructionIteratorImpl<is_const> begin() const {
    return InstructionIteratorImpl<is_const>(m_begin, m_end);
  }

  InstructionIteratorImpl<is_const> end() const {
    return InstructionIteratorImpl<is_const>(m_end, m_end);
  }

  // True iff the range contains no MFLOW_OPCODE entries.
  bool empty() const { return begin() == end(); }
};

using InstructionIterator = InstructionIteratorImpl<false>;
using ConstInstructionIterator = InstructionIteratorImpl<true>;

using InstructionIterable = InstructionIterableImpl<false>;

class ConstInstructionIterable : public InstructionIterableImpl<true> {
 public:
  // We extend the Impl so we can add the const versions of the constructors.
  // We can't have the const constructors on the non-const iterables
  template <class T>
  explicit ConstInstructionIterable(const T& mentry_list)
      : InstructionIterableImpl<true>(mentry_list, false) {}

  template <class T>
  explicit ConstInstructionIterable(const T* mentry_list)
      : ConstInstructionIterable(*mentry_list) {}
};

IRInstruction* primary_instruction_of_move_result_pseudo(IRList::iterator it);

IRInstruction* primary_instruction_of_move_result(IRList::iterator it);

IRInstruction* move_result_pseudo_of(IRList::iterator it);

} // namespace ir_list

// Free-function helpers that pick the right iterable flavor based on the
// const-ness of the argument.
template <class T>
inline ir_list::InstructionIterable InstructionIterable(T& t) {
  return ir_list::InstructionIterable(t);
}

template <class T>
inline ir_list::InstructionIterable InstructionIterable(T* t) {
  return ir_list::InstructionIterable(*t);
}

template <class T>
inline ir_list::ConstInstructionIterable InstructionIterable(const T& t) {
  return ir_list::ConstInstructionIterable(t);
}

template <class T>
inline ir_list::ConstInstructionIterable InstructionIterable(const T* t) {
  return ir_list::ConstInstructionIterable(*t);
}