tools/generate_taint_models/get_globals.py (224 lines of code) (raw):

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

"""Model generator that emits `TaintSink[Global]` models for global assignments.

Every Python file under a root directory (and, optionally, every `.pyi` stub
under a stub root) is parsed, and a model is produced for each module-level or
class-level assignment target, as well as for each function decorated with a
"cached property"-style decorator.
"""

import ast
import glob
import logging
import os
from typing import Callable, Iterable, Optional, Set, Tuple, Union

from typing_extensions import Final

from .model import AssignmentModel, FunctionDefinitionModel, Model
from .model_generator import ModelGenerator, qualifier
from .module_loader import find_all_paths, load_module


LOG: logging.Logger = logging.getLogger(__name__)

FunctionDefinition = Union[ast.FunctionDef, ast.AsyncFunctionDef]


class GlobalModelGenerator(ModelGenerator[Model]):
    """Generate taint-sink models for globals found under `root`.

    Args:
        root: Directory whose Python files are scanned.
        stub_root: Optional directory that is searched recursively for `.pyi`
            stubs, which are scanned in the same way.
        blacklisted_globals: Fully qualified global names for which no model
            is generated.
        blacklisted_global_directories: Path prefixes, relative to `root`;
            files whose relative path starts with one of them are skipped.
    """

    def __init__(
        self,
        root: str,
        stub_root: Optional[str] = None,
        blacklisted_globals: Optional[Set[str]] = None,
        blacklisted_global_directories: Optional[Set[str]] = None,
    ) -> None:
        self.root: str = root
        self.stub_root: Final[Optional[str]] = stub_root
        self.blacklisted_globals: Set[str] = blacklisted_globals or set()
        self.blacklisted_global_directories: Set[str] = (
            blacklisted_global_directories or set()
        )

    # flake8 suggests to reduce the complexity of the function, hence the noqa line
    def _globals(self, root: str, path: str) -> Iterable[Model]:  # noqa: C901
        """Return the models for every global found in the file at `path`.

        Args:
            root: Directory that `path` is qualified against.
            path: File to load and scan.

        Returns:
            A set of `AssignmentModel`s (one per registered global) and
            `FunctionDefinitionModel`s (one per cached property). The set is
            empty when the module cannot be loaded.
        """
        module = load_module(path)
        if not module:
            return set()

        global_names: Set[str] = set()
        # The parent of the property needs to be stored as well, as we only store the
        # module qualifier.
        cached_properties: Set[Tuple[Optional[str], FunctionDefinition]] = set()
        visitor = _GlobalNameVisitor(global_names)

        def visit_statement(statement: ast.stmt) -> None:
            if isinstance(statement, ast.Assign):
                if not _is_skipped_value(statement.value):
                    for target in statement.targets:
                        visitor.visit(target)
            elif isinstance(statement, ast.AugAssign):
                visitor.visit(statement.target)
            elif isinstance(statement, ast.AnnAssign):
                value = statement.value
                # Don't attempt to register statements of the form `x: int`.
                if value is not None and not _is_skipped_value(value):
                    visitor.visit(statement.target)
            elif isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef)):
                if any(
                    _is_cached_property_decorator(decorator)
                    for decorator in statement.decorator_list
                ):
                    cached_properties.add((visitor.parent, statement))
            elif (
                isinstance(statement, ast.ClassDef)
                # Ensure that we don't visit nested classes for now.
                and visitor.parent is None
                and not _declares_fields_at_class_level(statement)
            ):
                visitor.parent = statement.name
                visitor.blacklist = _self_assigned_attributes(statement)
                for class_statement in statement.body:
                    visit_statement(class_statement)
                visitor.parent = None
                visitor.blacklist = None

        for statement in module.body:
            visit_statement(statement)

        module_qualifier = qualifier(root, path)
        models: Set[Model] = set()

        for target in global_names:
            if target == "__all__":
                continue
            qualified_target = f"{module_qualifier}.{target}"
            if qualified_target in self.blacklisted_globals:
                continue
            try:
                models.add(
                    AssignmentModel(
                        annotation="TaintSink[Global]", target=qualified_target
                    )
                )
            except ValueError:
                # Best effort: targets the model rejects are skipped.
                pass

        for parent, function_definition in cached_properties:
            is_class_property = any(
                _is_class_property_decorator(decorator)
                for decorator in function_definition.decorator_list
            )
            if is_class_property:
                returns = "TaintSink[Global, Via[cached_class_property]]"
            else:
                returns = "TaintSink[Global, Via[cached_property]]"

            if parent is not None:
                function_qualifier = f"{module_qualifier}.{parent}"
            else:
                function_qualifier = module_qualifier

            try:
                models.add(
                    FunctionDefinitionModel(
                        qualifier=function_qualifier,
                        definition=function_definition,
                        returns=returns,
                    )
                )
            except ValueError:
                # Best effort: definitions the model rejects are skipped.
                pass

        return models

    def gather_functions_to_model(self) -> Iterable[Callable[..., object]]:
        """Return no functions; this generator works from files, not callables."""
        return []

    def compute_models(
        self, functions_to_model: Iterable[Callable[..., object]]
    ) -> Iterable[Model]:
        """Collect the global models of every file under `root` and `stub_root`.

        Args:
            functions_to_model: Unused by this generator.

        Returns:
            The union of the models produced for each scanned file.
        """
        sinks: Set[Model] = set()
        # `str.startswith` accepts a tuple of prefixes (and is False for an
        # empty tuple), so the prefixes are materialized once for all files.
        skipped_prefixes = tuple(self.blacklisted_global_directories)

        for path in find_all_paths(self.root):
            relative_path = os.path.relpath(path, self.root)
            if relative_path.startswith(skipped_prefixes):
                LOG.info("Skipping %s", relative_path)
            else:
                # In-place update avoids copying the accumulated set per file.
                sinks.update(self._globals(self.root, path))

        stub_root = self.stub_root
        if stub_root is not None:
            stub_root = os.path.abspath(stub_root)
            for path in glob.glob(stub_root + "/**/*.pyi", recursive=True):
                sinks.update(self._globals(stub_root, path))

        return sinks


class _GlobalNameVisitor(ast.NodeVisitor):
    """Record the plain names found in assignment targets.

    Names are added to the set passed to the constructor. While `parent` is
    set, names are registered as `<parent>.__class__.<name>`, and any name
    listed in `blacklist` is ignored.
    """

    def __init__(self, names: Set[str]) -> None:
        self.names: Set[str] = names
        self.blacklist: Optional[Set[str]] = None
        self.parent: Optional[str] = None

    def visit_Name(self, name: ast.Name) -> None:
        blacklist = self.blacklist
        if blacklist is not None and name.id in blacklist:
            return
        parent = self.parent
        if parent is not None:
            self.names.add(f"{parent}.__class__.{name.id}")
        else:
            self.names.add(name.id)

    # Ensure that we stop recursing when we're in a complex assign, such as
    # a.b = ... or a[b] = ... .
    def visit_Attribute(self, attribute: ast.Attribute) -> None:
        return

    def visit_Subscript(self, subscript: ast.Subscript) -> None:
        return


def _terminal_name(expression: ast.expr) -> Optional[str]:
    """Return `x` for a bare name `x`, `b` for an attribute `a.b`, else None."""
    if isinstance(expression, ast.Name):
        return expression.id
    if isinstance(expression, ast.Attribute):
        return expression.attr
    return None


def _is_skipped_value(value: ast.expr) -> bool:
    """Tell whether an assignment with this right-hand side is not a global."""
    # namedtuples get preprocessed out by Pyre, and shouldn't be added
    # as globals.
    if isinstance(value, ast.Call) and _terminal_name(value.func) == "namedtuple":
        return True
    # Omit pure aliases of the form `x = alias`.
    if isinstance(value, (ast.Name, ast.Attribute)):
        return True
    # x = lambda: _ can safely be avoided, as the models confuse our taint
    # analysis.
    return isinstance(value, ast.Lambda)


def _is_dataclass_decorator(expression: ast.expr) -> bool:
    """Match `@dataclass`, `@dataclass(args)` and `@dataclasses.dataclass(args)`."""
    if isinstance(expression, ast.Call):
        return _is_dataclass_decorator(expression.func)
    if isinstance(expression, ast.Name):
        return expression.id == "dataclass"
    if isinstance(expression, ast.Attribute):
        base = expression.value
        if isinstance(base, ast.Name) and base.id == "dataclasses":
            return expression.attr == "dataclass"
    return False


def _declares_fields_at_class_level(class_definition: ast.ClassDef) -> bool:
    """Tell whether class-level assignments of this class must be skipped."""
    # TypedDicts use top-level attribute declarations to declare attributes.
    if any(_terminal_name(base) == "TypedDict" for base in class_definition.bases):
        return True
    # Skip visiting dataclasses, as they use class variables to generate
    # instance variables.
    return any(
        _is_dataclass_decorator(decorator)
        for decorator in class_definition.decorator_list
    )


def _self_assigned_attributes(class_definition: ast.ClassDef) -> Set[str]:
    """Return the names assigned as `self.<name>` directly in a method body.

    Only top-level statements of each method are inspected. Both regular and
    `async` methods are considered, matching `FunctionDefinition`.
    """
    attributes: Set[str] = set()
    for statement in class_definition.body:
        if not isinstance(statement, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        for assignment in statement.body:
            if isinstance(assignment, ast.Assign):
                targets = assignment.targets
            elif isinstance(assignment, ast.AnnAssign):
                targets = [assignment.target]
            else:
                continue
            for target in targets:
                attribute = _get_self_attribute(target)
                if attribute is not None:
                    attributes.add(attribute)
    return attributes


def _get_self_attribute(target: ast.expr) -> Optional[str]:
    """Return `name` when `target` is exactly `self.name`, else None."""
    if isinstance(target, ast.Attribute):
        value = target.value
        if isinstance(value, ast.Name) and value.id == "self":
            return target.attr
    return None


def _is_cached_property_decorator(decorator: ast.expr) -> bool:
    """Match a name or attribute containing both "cached" and "property"."""
    name = _terminal_name(decorator)
    if name is None:
        return False
    return "cached" in name and "property" in name


def _is_class_property_decorator(decorator: ast.expr) -> bool:
    """Match a name or attribute containing both "class" and "property"."""
    name = _terminal_name(decorator)
    if name is None:
        return False
    return "class" in name and "property" in name