uberpoet/dotreader.py

# Copyright (c) 2018 Uber Technologies, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import import itertools import logging import os import tempfile from collections import defaultdict from pprint import pprint from typing import Dict, List, Set # noqa: F401 from .moduletree import ModuleNode from .util import makedir class DotFileReader(object): """ This class reads a dot file from a `buck query "deps(target)" --dot > file.gv` output and translates it into a `ModuleNode` dependency graph for the `BuckProjectGenerator` class to consume. The entry point is `DotFileReader().read_dot_file(path)` Why make our own dot file parser vs using the two other python libaries available? They're really slow. Probably because of some extra stuff we don't do. """ def __init__(self, modules_filter=None): # type: (List[str]) -> None """ Initalizes object. :param modules_filter: If a graph edge contains a string inside the list as a substring, it is ignored. Keep it empty to not filter out anything. If not specified, a default list of test names are used. """ if modules_filter is None: # Why filter out modules with these names? # We are trying to simulate a non-test app build and ignore non-code targets such as asset # catalogs and schemes. Each module currently gets a static amount of code created for it, # so non-code modules will add code to the total when they shouldn't if we didn't filter them out. modules_filter = ['test', 'scheme', 'assetcatalog', 'resources', 'fixture', 'needle'] self.modules_filter = modules_filter def read_dot_file(self, path, root_node_name, is_debug=False): # type: (str, str, bool) -> (ModuleNode, List[ModuleNode]) """ Reads a BUCK dependency dump in a dot/gv file at `path` and returns a `ModuleNode` graph root and list of nodes to generate a mock app from it. :param path: Path to the dot file :param root_node_name: The name of the root application node in the dependency graph :param is_debug: Enable this to dump some intermediate objects to help with debugging :return: The a tuple of the root node of the tree and a list of all nodes in the tree """ with open(path, 'r') as f: text = f.read() raw_edges = self.extract_edges(text) ident_names = self.identical_names(raw_edges) edges = self.clean_edge_names(raw_edges) dep_map = self.make_dep_map_from_edges(edges) # A dep_map is really an outgoing edge map # Debug dumps of dot reader state for debugging if is_debug: incoming_map = self.incoming_edge_map_from_dep_map(dep_map) anon_edge = self.anonymize_edge_names(edges, root_node_name) self.debug_dump([edges, raw_edges, anon_edge], [dep_map, ident_names, incoming_map]) if ident_names: logging.error("Found identical buck target names in dot file: %s", path) logging.error(str(ident_names)) raise ValueError("Dot file contains buck target names that are identical, but have different paths") else: root, nodes = self.mod_graph_from_dep_map(dep_map, root_node_name) logging.debug("%s %s total nodes: %d", root, root.deps, len(nodes)) return root, nodes def extract_edges(self, text): # type: (str)-> List[List[str]] """ Converts dot file text with buck targets as edges into a simpler [(string,string)] list. Also filters out unwanted target types based on the names in self.modules_filter. Dot files are basically lines of `"string" -> "string";` that represent a list of edges in a graph. """ def edge(f_line): return f_line[:-1].split('->') def name(f_part): return str(f_part.strip().replace('"', '')) def fil(f_line): lower = line.lower() bad_word = False for k in self.modules_filter: if k in lower: bad_word = True break return '->' in f_line and not bad_word return [[name(part) for part in edge(line)] for line in text.splitlines() if fil(line)] @staticmethod def extract_buck_target(text): # type: (str) -> str """ Extracts the target name from a buck target path. Ex: //a/b/c:target -> target If the target is invalid, just returns the original text. """ parts = text.split(":") if len(parts) != 2: return text return parts[1] def clean_edge_names(self, edges): # type: (List[List[str]]) -> List[List[str]] """Makes edge names only be their buck target name""" return [[self.extract_buck_target(text) for text in pair] for pair in edges] @staticmethod def anonymize_edge_names(edges, main_app_module_name): # type: (List[List[str]], str) -> List[List[str]] """ Makes edge names anonymous so you can send them to third parties without revealing the names of your modules. """ # make lib_index an object to avoid a scope quirk of python with inner functions # https://www.codesdope.com/blog/article/nested-function-scope-of-variable-closures-in-pyth/ lib_index = itertools.count() name_dict = {main_app_module_name: "DotReaderMainModule"} def name(orig): if orig not in name_dict: name_dict[orig] = 'DotReaderLib' + str(next(lib_index)) return name_dict[orig] return [[name(left), name(right)] for left, right in edges] @staticmethod def make_dep_map_from_edges(edges): # type: (List[List[str]]) -> Dict[str, List[str]] """Converts a raw [(origin,destination)] edge list into a {origin:[destinations]} outgoing edge map.""" dep_map = defaultdict(list) for pair in edges: origin, destination = pair dep_map[origin] += [destination] return dict(dep_map) @staticmethod def incoming_edge_map_from_dep_map(dep_map): # type: (Dict[str, List[str]]) -> Dict[str, List[str]] """Converts a outgoing edge map into it's inverse, a {destination:[origins]} incoming edge map.""" incoming = defaultdict(list) for node, outgoing in dep_map.iteritems(): for out in outgoing: incoming[out] += [node] # Roots wont show up in the incoming list in the above for loop roots = set(dep_map.keys()) - set(incoming.keys()) for root in roots: incoming[root] = [] return dict(incoming) @staticmethod def reachability_set(dep_map, root_node_name): # type: (Dict[str, List[str]], str) -> Set[str] """ WARNING: Doesn't work currently. It's a TODO to fix this. Returns a set of all nodes reachable from root_node_name """ seen = set() consume_list = [root_node_name] while len(consume_list) > 0: item = consume_list.pop(0) seen.add(item) consume_list += dep_map[item] return seen def find_roots_in_dep_map(self, dep_map): # type: (Dict[str, List[str]]) -> List[str] """ Finds the roots in the DAG represented by a outgoing edge map. If it returns empty, then you have cycles and thus don't have a DAG. """ incoming = self.incoming_edge_map_from_dep_map(dep_map) # A node with no incoming edges and some outgoing edges is a root in a DAG # Nodes with no edges are not really part of a graph, so we ignore them return [node for node, incoming_edges in incoming.iteritems() if not incoming_edges and dep_map[node]] def identical_names(self, edges): # type: (List[List[str]]) -> Dict[str,int] """Returns how many times a buck target name occurs in a edge list, filtering out unique (count == 1) names""" dep_map = self.make_dep_map_from_edges(edges) name_count = {} for k in dep_map.keys(): name = self.extract_buck_target(k) name_count[name] = name_count.get(name, 0) + 1 name_count = {key: value for key, value in name_count.iteritems() if value > 1} return name_count def biggest_root_name(self, dep_map): # type: (Dict[str, List[str]]) -> str """ WARNING: Doesn't work currently. It's a TODO to fix this. Finds the root with the most reachable nodes under it inside a DAG. The biggest root is probably the app tree. With this you don't have to pass in the root node name to self.read_dot_file(...) """ roots = self.find_roots_in_dep_map(dep_map) root_name = None if len(roots) == 1: root_name = roots[0] elif len(roots) == 0: raise ValueError("Cyclic dependency graph given (len(roots) == 0), aborting") else: max_size = -1 for r in roots: size = len(self.reachability_set(dep_map, r)) if size > max_size: root_name, max_size = r, size return root_name @staticmethod def mod_graph_from_dep_map(dep_map, root_node_name): # type: (Dict[str, List[str]],str) -> (ModuleNode, List[ModuleNode]) """ Converts an outgoing edge map (`dep_map`) into a ModuleNode graph that you can generate a mock app from. You have to provide the root node / application node name (`root_node_name`) for the graph. """ def make_mod(name): return ModuleNode(name, ModuleNode.LIBRARY) mod_map = {name: make_mod(name) for name in dep_map.keys()} # TODO Fix mod_map[self.biggest_root(dep_map)]. If this works you don't # have to pass the app node name any more. # app_node = mod_map[self.biggest_root_name(dep_map)] app_node = mod_map[root_node_name] app_node.node_type = ModuleNode.APP for name, deps in dep_map.iteritems(): new_deps = [] for dep_name in deps: mod = mod_map.get(dep_name, None) if not mod: mod = make_mod(dep_name) mod_map[dep_name] = mod new_deps.append(mod) mod_map[name].deps = new_deps return app_node, mod_map.values() @staticmethod def write_struct(struct, path): # type: (object, str) -> None """Writes generic python objects to disk (not deserializable)""" with open(path, 'w') as f: pprint(struct, f) @staticmethod def write_edges(edges, path): # type: (List[List[str]], str) -> None """"Writes edge lists to disk""" with open(path, 'w') as f: for e in edges: f.write('"{}" -> "{}";\n'.format(e[0], e[1])) def debug_dump(self, edges, structs): # type: (List[List[List[str]]], List[object]) -> None """ Dumps various intermediate objects to files to help debugging. `edges` will dump dot files of a list of edge pairs. Type: [[(String,String)]] `structs` will dump a pretty print of python objects. Type: [PythonObject] """ dump_path = os.path.join(tempfile.gettempdir(), 'ub_dot_graph_dump') logging.info('Dumping dotreader.py debug structures to %s', dump_path) makedir(dump_path) for i, edge in enumerate(edges): path = os.path.join(dump_path, 'edges{}.gv'.format(i)) self.write_edges(edge, path) for i, struct in enumerate(structs): path = os.path.join(dump_path, 'struct{}.py'.format(i)) self.write_struct(struct, path)

uberpoet/dotreader.py (143 lines of code) (raw):