libredex/PointsToSemantics.h (262 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <bitset>
#include <cstdint>
#include <initializer_list>
#include <iosfwd>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/container/flat_map.hpp>
#include <boost/optional.hpp>
#include "ControlFlow.h"
#include "DexClass.h"
#include "PointsToSemanticsUtils.h"
#include "S_Expression.h"
#include "TypeSystem.h"
/*
* The points-to semantics of Dex code defined here can be used for performing
* flow-insensitive, inclusion-based points-to analyses. Each method is
* translated into a system of points-to equations operating on sets of abstract
* object instances. The actual representation of abstract object instances is
* delegated to the particular points-to analysis that ultimately processes
* these equations. This representation may vary depending on the type of
* abstraction implemented in the points-to analysis (context-sensitivity,
* object-sensitivity, etc.). The points-to equations abstract away all
* computational aspects that are not directly related to pointer manipulation
* (like scalar values and arithmetic operations) and is oblivious of
* control-flow dependencies (a sequence of statements is interpreted as all
* possible interleavings of the statements).
*
* Example:
*
* Consider the following class:
*
* public class MyList {
* MyList next;
* Element val;
*
* public MyList(Element v, MyList n) {
* next = n;
* val = v;
* }
*
* public Element nth(int n) {
* MyList x = this;
* for(int i = 0; i < n; ++i) {
* x = x.next;
* }
* return x.val;
* }
* }
*
* The points-to equations of the method `nth` computed by the semantic
* translator are as follows:
*
* LMyList;#nth: (I)LElement; {
* V0 = THIS
* V4 = V0 + V1
* V1 = V4.LMyList;#next
* V2 = V4.LElement;#val
* RETURN V2
* }
*
* Note that all arithmetic computations have been abstracted away. The
* variables are not necessarily numbered sequentially, as irrelevant equations
* may have been optimized away. The list traversal is encoded by the recursive
* definition of variable V4 (where `U` denotes the union of sets of abstract
* object instances). The meaning of this recursive definition is given by the
* least solution of the system of set equations (or equivalently, by the least
* fixpoint of the associated set transformer). The literature on points-to
* analysis is vast, but the fundamental concepts of inclusion-based,
* flow-insensitive analysis go back to Andersen's PhD thesis, which is also an
* excellent introduction to the topic:
*
* L. Andersen. Program Analysis and Specialization for the C Programming
* Language. PhD thesis, DIKU, University of Copenhagen, 1994.
*
* The points-to equations don't keep any connection with the original Dex code
* aside from references to type and method names. This is intentional, as we
* want to be able to modify the points-to equations (for better accuracy and/or
* scalability) without having to worry about consistency with the original
* code.
*/
// Forward declaration.
class PointsToSemantics;
/*
* A points-to variable denotes a set of abstract object instances. It is
* uniquely identified by a positive number.
*/
class PointsToVariable final {
public:
// The default constructor produces the `null` variable to prevent confusion
// with user-defined variables.
PointsToVariable() : m_id(null_var_id()) {}
// This variable has a special meaning: it represents the empty set of
// abstract object instances, i.e., the semantic interpretation of `null`.
static PointsToVariable null_variable() {
return PointsToVariable(null_var_id());
}
// This variable represents the special parameter `this` in instance methods.
static PointsToVariable this_variable() {
return PointsToVariable(this_var_id());
}
sparta::s_expr to_s_expr() const;
static boost::optional<PointsToVariable> from_s_expr(const sparta::s_expr& e);
private:
static constexpr int32_t null_var_id() { return -1; }
static constexpr int32_t this_var_id() { return -2; }
explicit PointsToVariable(size_t id) : m_id(id) {}
// A user-defined variable always has a positive identifier. We use negative
// identifiers for special variables, like the `null` variable.
int32_t m_id;
friend class PointsToMethodSemantics;
friend size_t hash_value(const PointsToVariable&);
friend bool operator==(const PointsToVariable&, const PointsToVariable&);
friend bool operator<(const PointsToVariable&, const PointsToVariable&);
friend std::ostream& operator<<(std::ostream&, const PointsToVariable&);
};
/*
* We want to use points-to variables in various STL containers with minimal
* effort, hence the definition of the following functions.
*/
// To enable the use of boost::hash.
size_t hash_value(const PointsToVariable& v);
bool operator==(const PointsToVariable& v, const PointsToVariable& w);
bool operator<(const PointsToVariable& v, const PointsToVariable& w);
std::ostream& operator<<(std::ostream& o, const PointsToVariable& v);
/*
* Except for the disjunction, points-to operations are similar to their
* counterparts in Dex bytecode. Note that we do not attempt to model exceptions
* precisely. The `PTS_GET_EXCEPTION` operation stands for `move-exception`, but
* assumes that any exception can be caught. This also explains why we have no
* operation corresponding to `throw`. As for the disjuction, it's simply the
* union of points-to variables (V = V1 U V2 U ... U Vn). We also introduce a
* special operation `PTS_GET_CLASS` for java.lang.Object#getClass(), since
* java.lang.Class objects need to be handled specially by the analyzer.
*/
// clang-format off
// is_load is_get is_put is_invoke
#define PTS_OPS \
PTS_OP(PTS_CONST_STRING, true , false, false, false ) \
I PTS_OP(PTS_CONST_CLASS, true , false, false, false ) \
I PTS_OP(PTS_GET_EXCEPTION, true , false, false, false ) \
I PTS_OP(PTS_NEW_OBJECT, true , false, false, false ) \
I PTS_OP(PTS_LOAD_PARAM, true , false, false, false ) \
I PTS_OP(PTS_GET_CLASS, false, false, false, false ) \
I PTS_OP(PTS_CHECK_CAST, false, false, false, false ) \
I PTS_OP(PTS_IGET, false, true , false, false ) \
I PTS_OP(PTS_IGET_SPECIAL, false, true , false, false ) \
I PTS_OP(PTS_SGET, false, true , false, false ) \
I PTS_OP(PTS_IPUT, false, false, true , false ) \
I PTS_OP(PTS_IPUT_SPECIAL, false, false, true , false ) \
I PTS_OP(PTS_SPUT, false, false, true , false ) \
I PTS_OP(PTS_INVOKE_VIRTUAL, false, false, false, true ) \
I PTS_OP(PTS_INVOKE_SUPER, false, false, false, true ) \
I PTS_OP(PTS_INVOKE_DIRECT, false, false, false, true ) \
I PTS_OP(PTS_INVOKE_INTERFACE, false, false, false, true ) \
I PTS_OP(PTS_INVOKE_STATIC, false, false, false, true ) \
I PTS_OP(PTS_RETURN, false, false, false, false ) \
I PTS_OP(PTS_DISJUNCTION, false, false, false, false )
// clang-format on
#define MAX_PTS_OPS (sizeof(unsigned long long) * 8)
#define PTS_OP(OP, ...) OP
#define I ,
enum PointsToOperationKind { PTS_OPS };
#undef I
#undef PTS_OP
/*
* We need a special edge to model the points-to relation between an array and
* its elements. In the future, we could also use special edges to model the
* effect of external libraries or native methods.
*/
enum SpecialPointsToEdge {
PTS_ARRAY_ELEMENT,
};
struct PointsToOperation {
PointsToOperationKind kind;
union {
DexMethodRef* dex_method;
DexFieldRef* dex_field;
const DexString* dex_string;
DexType* dex_type;
size_t parameter;
SpecialPointsToEdge special_edge;
};
PointsToOperation() = default;
explicit PointsToOperation(PointsToOperationKind k) : kind(k) {}
PointsToOperation(PointsToOperationKind k, DexMethodRef* m)
: kind(k), dex_method(m) {}
PointsToOperation(PointsToOperationKind k, DexFieldRef* f)
: kind(k), dex_field(f) {}
PointsToOperation(PointsToOperationKind k, const DexString* s)
: kind(k), dex_string(s) {}
PointsToOperation(PointsToOperationKind k, DexType* t)
: kind(k), dex_type(t) {}
PointsToOperation(PointsToOperationKind k, size_t p)
: kind(k), parameter(p) {}
PointsToOperation(PointsToOperationKind k, SpecialPointsToEdge e)
: kind(k), special_edge(e) {}
bool is_load() const {
#define PTS_OP(OP, is_load, is_get, is_put, is_invoke) \
((is_load) ? (1ULL << (OP)) : 0)
// NOLINTNEXTLINE(bugprone-macro-parentheses)
#define I |
static const std::bitset<MAX_PTS_OPS> load_operations(PTS_OPS);
#undef I
#undef PTS_OP
return load_operations[kind];
}
bool is_get_class() const { return kind == PTS_GET_CLASS; }
bool is_check_cast() const { return kind == PTS_CHECK_CAST; }
bool is_get() const {
#define PTS_OP(OP, is_load, is_get, is_put, is_invoke) \
((is_get) ? (1ULL << (OP)) : 0)
// NOLINTNEXTLINE(bugprone-macro-parentheses)
#define I |
static const std::bitset<MAX_PTS_OPS> get_operations(PTS_OPS);
#undef I
#undef PTS_OP
return get_operations[kind];
}
bool is_sget() const { return kind == PTS_SGET; }
bool is_put() const {
#define PTS_OP(OP, is_load, is_get, is_put, is_invoke) \
((is_put) ? (1ULL << (OP)) : 0)
// NOLINTNEXTLINE(bugprone-macro-parentheses)
#define I |
static const std::bitset<MAX_PTS_OPS> put_operations(PTS_OPS);
#undef I
#undef PTS_OP
return put_operations[kind];
}
bool is_sput() const { return kind == PTS_SPUT; }
bool is_invoke() const {
#define PTS_OP(OP, is_load, is_get, is_put, is_invoke) \
((is_invoke) ? (1ULL << (OP)) : 0)
// NOLINTNEXTLINE(bugprone-macro-parentheses)
#define I |
static const std::bitset<MAX_PTS_OPS> invoke_operations(PTS_OPS);
#undef I
#undef PTS_OP
return invoke_operations[kind];
}
bool is_virtual_call() const { return is_invoke() && !is_static_call(); }
bool is_static_call() const { return kind == PTS_INVOKE_STATIC; }
bool is_return() const { return kind == PTS_RETURN; }
bool is_disjunction() const { return kind == PTS_DISJUNCTION; }
sparta::s_expr to_s_expr() const;
static boost::optional<PointsToOperation> from_s_expr(
const sparta::s_expr& e);
};
/*
* We don't use the term points-to equation here, because strictly speaking,
* some operations are not equational (like a method call with no return value).
*/
class PointsToAction final {
public:
PointsToAction() = default;
const PointsToOperation& operation() const { return m_operation; }
bool has_dest() const { return m_arguments.count(dest_key()) > 0; }
PointsToVariable lhs() const { return get_arg(lhs_key()); }
PointsToVariable rhs() const { return get_arg(rhs_key()); }
PointsToVariable instance() const { return get_arg(instance_key()); }
PointsToVariable dest() const { return get_arg(dest_key()); }
PointsToVariable src() const { return get_arg(src_key()); }
void remove_dest() { m_arguments.erase(dest_key()); }
/*
* Returns the arguments of a method call (indexed by their position in the
* invocation) or the components of a disjunction.
*/
std::vector<std::pair<size_t, PointsToVariable>> get_arguments() const;
/*
* Used to build PTS_CONST_STRING, PTS_CONST_CLASS, PTS_GET_EXCEPTION,
* PTS_NEW_OBJECT and LOAD_PARAM actions.
*/
static PointsToAction load_operation(const PointsToOperation& operation,
PointsToVariable dest);
/*
* Used to build a PTS_GET_CLASS action.
*/
static PointsToAction get_class_operation(PointsToVariable dest,
PointsToVariable src);
/*
* Used to build a PTS_CHECK_CAST action.
*/
static PointsToAction check_cast_operation(DexType* dex_type,
PointsToVariable dest,
PointsToVariable src);
/*
* Used to build PTS_IGET, PTS_IGET_SPECIAL and PTS_SGET actions. There is no
* instance for PTS_SGET.
*/
static PointsToAction get_operation(
const PointsToOperation& operation,
PointsToVariable dest,
boost::optional<PointsToVariable> instance = {});
/*
* Used to build PTS_IPUT, PTS_IPUT_SPECIAL and PTS_SPUT actions. There is no
* lhs for PTS_SPUT.
*/
static PointsToAction put_operation(
const PointsToOperation& operation,
PointsToVariable rhs,
boost::optional<PointsToVariable> lhs = {});
/*
* Used to build PTS_INVOKE_VIRTUAL, PTS_INVOKE_SUPER, PTS_INVOKE_DIRECT,
* PTS_INVOKE_INTERFACE and PTS_INVOKE_STATIC actions. There is no instance
* for PTS_INVOKE_STATIC. The optional dest parameter is used to model the
* return value of the method call if any. The arguments of the method call
* are numbered starting from 0.
*/
static PointsToAction invoke_operation(
const PointsToOperation& operation,
boost::optional<PointsToVariable> dest,
boost::optional<PointsToVariable> instance,
const std::vector<std::pair<int32_t, PointsToVariable>>& args);
/*
* Used to build a PTS_RETURN action.
*/
static PointsToAction return_operation(PointsToVariable src);
/*
* Used to build a disjunction of variables v = v1 + ... + vn.
*/
template <typename InputIterator>
static PointsToAction disjunction(PointsToVariable dest,
InputIterator first,
InputIterator last);
sparta::s_expr to_s_expr() const;
static boost::optional<PointsToAction> from_s_expr(const sparta::s_expr& e);
private:
static constexpr int32_t lhs_key() { return -1; }
static constexpr int32_t rhs_key() { return -2; }
static constexpr int32_t instance_key() { return -3; }
static constexpr int32_t dest_key() { return -4; }
static constexpr int32_t src_key() { return -5; }
PointsToAction(
const PointsToOperation& operation,
std::initializer_list<std::pair<int32_t, PointsToVariable>> arguments)
: m_operation(operation), m_arguments(arguments) {
m_arguments.shrink_to_fit();
}
PointsToAction(
const PointsToOperation& operation,
const std::vector<std::pair<int32_t, PointsToVariable>>& arguments);
PointsToVariable get_arg(int32_t key) const;
PointsToOperation m_operation;
// We use a flat map (i.e., a sorted associative array) to get the convenience
// of a map while maintaining a low memory footprint. The arguments in a
// method call are denoted by positive indexes that correspond to their
// position in the original invocation. Arguments specific to a points-to
// operation (like the left-hand side of an assignment operation) have a
// negative index.
boost::container::flat_map<int32_t, PointsToVariable> m_arguments;
};
std::ostream& operator<<(std::ostream& o, const PointsToAction& a);
/*
* During the analysis we may want to distinguish among methods that don't have
* points-to equations because either the code is unavailable (external
* libraries, native methods), the code doesn't exist (abstract methods) or the
* code exists but has no effect on pointers. Each case may be subject to a
* different semantic intepretation.
*/
enum MethodKind {
PTS_APK, // Regular method defined in the APK
PTS_ABSTRACT, // Abstract method
PTS_NATIVE, // Native method
PTS_STUB, // The set of points-to equations for the method is a stub
};
/*
* The system of points-to actions representing the semantics of a method,
* together with some context information.
*/
class PointsToMethodSemantics {
public:
PointsToMethodSemantics(DexMethodRef* dex_method,
MethodKind kind,
size_t start_var_id,
size_t size_hint);
DexMethodRef* get_method() const { return m_dex_method; }
MethodKind kind() const { return m_kind; }
PointsToVariable get_new_variable() {
return PointsToVariable(m_variable_counter++);
}
const std::vector<PointsToAction>& get_points_to_actions() const {
return m_points_to_actions;
}
void add(const PointsToAction& a) { m_points_to_actions.emplace_back(a); }
/*
* This function attempts to remove points-to equations that have no effect on
* the analysis (e.g., reading a value that is not used in any write operation
* or method call). This helps relieve some of the computational burden on
* the resolution algorithm.
*/
void shrink();
sparta::s_expr to_s_expr() const;
static boost::optional<PointsToMethodSemantics> from_s_expr(
const sparta::s_expr& e);
private:
DexMethodRef* m_dex_method;
MethodKind m_kind;
// The variable counter allows us to generate new variables when we need to
// modify the system of points-to actions (e.g., for inlining method calls).
size_t m_variable_counter;
std::vector<PointsToAction> m_points_to_actions;
friend std::ostream& operator<<(std::ostream&,
const PointsToMethodSemantics&);
};
std::ostream& operator<<(std::ostream& o, const PointsToMethodSemantics& s);
/*
* This represents the points-to semantics of all methods inside a given scope.
*
* IMPORTANT: the procedure used to generate the points-to sematics assumes that
* invoke-* instructions are in denormalized form, i.e., wide arguments are
* explicitly represented by a pair of consecutive registers. The generation of
* the points-to semantics doesn't modify the IR and hence, can be used anywhere
* in Redex.
*/
class PointsToSemantics final {
public:
using iterator = std::unordered_map<DexMethodRef*,
PointsToMethodSemantics>::const_iterator;
PointsToSemantics() = delete;
PointsToSemantics(const PointsToSemantics& other) = delete;
PointsToSemantics& operator=(const PointsToSemantics& other) = delete;
/*
* The constructor generates points-to actions for all methods in the given
* scope. The generation is performed in parallel using a pool of threads. If
* the flag `generate_stubs` is set to true, all methods in the scope are
* interpreted as stubs.
*/
explicit PointsToSemantics(const Scope& scope, bool generate_stubs = false);
/*
* The stubs are stored in the specified text file as S-expressions. In case
* of a collision between a method in the APK and a stub, the stub is
* discarded.
*/
void load_stubs(const std::string& file_name);
iterator begin() { return m_method_semantics.begin(); }
iterator end() { return m_method_semantics.end(); }
const TypeSystem& get_type_system() { return m_type_system; }
boost::optional<PointsToMethodSemantics*> get_method_semantics(
DexMethodRef* dex_method);
private:
MethodKind default_method_kind() const;
void initialize_entry(DexMethod* dex_method);
void generate_points_to_actions(DexMethod* dex_method);
bool m_generate_stubs;
TypeSystem m_type_system;
PointsToSemanticsUtils m_utils;
std::unordered_map<DexMethodRef*, PointsToMethodSemantics> m_method_semantics;
friend std::ostream& operator<<(std::ostream&, const PointsToSemantics&);
};
std::ostream& operator<<(std::ostream& o, const PointsToSemantics& s);