sources/cqljson/cqljson.py (404 lines of code) (raw):

#!/usr/bin/env python3 # Copyright (c) Meta Platforms, Inc. and affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # cqljson.py -> converts CQL JSON format into various useful outputs # # The CQL JSON format is documented here: # https://cgsql.dev/cql-guide/ch13 # and here: # https://cgsql.dev/json-diagram # # NB: This code should be considered SAMPLE code, not production code. # Which is to say you can reasonably expect that the specifics of the diagrams # and the database produced here are likely to change at whim. If you need # a particular output, you are enouraged to FORK this sample into something # stable. The JSON format itself is the contract and it evolves in a backwards # compatible way. This script is likely to change to make different pretty # pictures at various times. import json import sys def usage(): print( ( "Usage:\n" "\n" "--table_diagram input.json [universe]> tables.dot\n" " creates a .dot file for a table diagram\n" "\n" "--region_diagram input.json > regions.dot\n" " creates a .dot file for a region diagram\n" "\n" "--erd input.json [universe] > erd.dot\n" " creates a .dot file for an ER diagram\n" "\n" "--sql input.json > inputdb.sql\n" " creates a .sql file for a database with the schema info\n" "\n" "The [universe] arguments can be:\n" " * nothing in which case all tables are the used\n" " * a list of targets which are processed in order\n" " * a target is:\n" " * a table name\n" " * a table name followed by +fks (e.g. foo+fks)\n" " * a table name followed by +refs (e.g. foo+refs)\n" " * a table name followed by +graph (e.g. foo+graph)\n" " * +fks: means the table and all its FK tables, transitively\n" " * +refs: means the table and all that refer to it via FK, transitively\n" " * +graphs: means the table and any table linked to it either way, transitively\n" " * a target may be prefixed with '-' indicating that pattern should be removed\n" "\n" "To create a CQL JSON file you can start with any CQL, but\n" "probably a file with most or all of your schema is the best\n" "choice." " cql --in your_file.sql --rt json_schema --cg output.json\n" "\n" "To process a .dot file use a command like:\n" " dot x.dot -Tpng -o x.png\n" ) ) def add_colinfo(colinfo, col, data): if col not in colinfo: colinfo[col] = data else: colinfo[col] = colinfo[col] + "," + data def compute_pk(t): pk = {} for pkcol in t["primaryKey"]: pk[pkcol] = 1 return pk def compute_colinfo(t): colinfo = {} pk = {} for pkcol in t["primaryKey"]: pk[pkcol] = 1 add_colinfo(colinfo, pkcol, "PK") ifk = 0 for fktup in enumerate(t["foreignKeys"]): ifk = ifk + 1 fkname = f"FK{ifk}" fk = fktup[1] for fkcol in fk["columns"]: add_colinfo(colinfo, fkcol, fkname) iuk = 0 for uktup in enumerate(t["uniqueKeys"]): iuk = iuk + 1 ukname = f"UK{iuk}" uk = uktup[1] for ukcol in uk["columns"]: add_colinfo(colinfo, ukcol, ukname) return colinfo # First we look up all the tables and make a dictionary so we can find them by name, # then we walk the list of table arguments, and for each table: # * make a dictionary that contains the PK columns # * make a dictionary that contains the FK columns # * emit an HTML "label" # * the first row is the table name in bold # * we walk the columns twice # * the first time we emit just the pk columns # * then we emit a spacer row "---" # * the second time we emit all the non-PK columns # * for each emitted column we emit the name, type, and PK/FK status # * if the column is not null the type is bold # * the PK and FK dictionaries are used to emit the PK/FK info # * we walk the foreign keys and emit a link foo -> bar for each referenced table def emit_erd(data, universe, tables): print("digraph parse {") print("rankdir=LR;") for t_name in universe: t = tables[t_name] pk = compute_pk(t) colinfo = compute_colinfo(t) fkports = {} for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] portcol = fk["columns"][0] print(f"{t_name}:{portcol} -> {reftable}") fkports[portcol] = 1 print(f"{t_name} [") print(" shape=plaintext") print(" label=<") print(" <table border='1' cellborder='0'>") print(f" <tr><td><b>{t_name}</b></td></tr>") for not_pk_pass in range(0, 2): for ctup in enumerate(t["columns"]): c = ctup[1] c_name = c["name"] c_type = c["type"] c_kind = "&lt;" + c["kind"] + "&gt;" if "kind" in c else "" c_notnull = c["isNotNull"] == 1 nntext1 = "<b>" if c_notnull else "" nntext2 = "</b>" if c_notnull else "?" fkport = f" port='{c_name}'" if c_name in fkports else "" if not_pk_pass != (c_name in pk): print("<tr>") print(f"<td align='left'>{c_name}</td>") print(f"<td align='left'>{nntext1}{c_type}{c_kind}{nntext2}</td>") if c_name in colinfo: print(f"<td align='left'{fkport}>{colinfo[c_name]}</td>") else: print("<td></td>") print("</tr>") if not_pk_pass == 0: print("<tr><td align='left'>---------</td></tr>") print(" </table>") print(">];") print("}") # Here we emit a digraph that has all the tables # any table that is connected to any other table is included in the main part # * this part is connected to "root" # any tables that are connected to nothing are linked to "orphans" # * this keeps them from clogging the main diagram # we compute the "connected" set by walking all foreign keys for all tables # * any table that is the source or target of an FK is marked "connected" # # The main diagram begins by walking all tables # * for each table we emit a text-only shape # * for each table we emit a link that follows its foreign keys # * if there are no foreign keys we emit a link to "root" or "orphans" # * any not connected table gets a link to "orphans" # def emit_table_diagram(data, universe, tables): connected = {} for t_name in universe: t = tables[t_name] for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] connected[t_name] = 1 connected[reftable] = 1 print("digraph parse {") print("rankdir=LR;") need_orphans = False need_root = False for t_name in universe: t = tables[t_name] print(f'{t_name} [label = "{t_name}" shape=plaintext]') if len(t["foreignKeys"]) == 0: if t_name in connected: need_root = True print(f"{t_name} -> root") else: need_orphans = True print(f"{t_name} -> orphans") for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] print(f"{t_name} -> {reftable}") if need_orphans and need_root: print("{rank=same orphans root}") print("}") # Here we emit a digraph that has all the tables # This works exactly the same as "tables" except we follow # the dependent region link instead of the foreign key links # everything else is the same even "orphans" etc. def emit_region_diagram(data): connected = {} print("digraph parse {") print("rankdir=LR;") for tup in enumerate(data["regions"]): r = tup[1] r_name = r["name"] print(f'{r_name} [label = "{r_name}" shape=plaintext]') for rdep in enumerate(r["using"]): rparent = rdep[1] print(f"{r_name} -> {rparent}") print("}") # This generates the schema we need for our sql output # Note that this is pretty normalized, so if you want counts # and so forth you have to do them the usual way for # normalized tables. It's done this way for simplicity # and because data volumes are expected to be low and # also if you really needed any denorms, you can make them # yourself very easily starting from this. All of this # is pretty much the relational version of what's in the JSON # for tables, columns, PKs, FKs, and regions. But not all # the fields are present (e.g. column sensitivity is absent). # This could easily be made more complete but the essential # metadata is here; at least enough to ask important questions # about usage and to study the metadata to find possible # consolidations and stuff like that. def emit_schema(): print( ( "create table tables(\n" " t_name text primary key,\n" " region text not null,\n" " deleted bool not null,\n" " create_version int not null,\n" " delete_version int not null,\n" " recreate bool not null,\n" " recreate_group text not null);\n" "\n" "create table pks(\n" " t_name text not null,\n" " c_name text not null);\n" "\n" "create table columns(\n" " t_name text not null,\n" " c_name text not null,\n" " c_type text not null,\n" " c_kind text not null,\n" " c_notnull bool not null,\n" " primary key (t_name, c_name));\n" "\n" "create table regions(\n" " r_name text primary key);\n" "\n" "create table region_deps(\n" " rchild text not null,\n" " rparent text not null);\n" "\n" "create table fks(\n" " fk_name text not null,\n" " src_table text not null,\n" " ref_table text not null,\n" " src_col text not null,\n" " ref_col text not null);\n" "\n" "create table proc_deps(\n" " p_name text not null,\n" " t_name text not null);\n" "\n" "create table views(\n" " v_name text primary key,\n" " region text not null,\n" " deleted bool not null,\n" " create_version int not null,\n" " delete_version int not null);\n" "\n" "create table proc_view_deps(\n" " p_name text not null,\n" " v_name text not null);\n" "\n" ) ) # For any chunk of JSON that has the "dependencies" sub-block # (see CQL JSON docs) we emit the table dependency info # by following the "usesTables" data. Note that per docs # this entry is not optional! def emit_tabledep(section): for src in section: pname = src["name"] usesTables = src["usesTables"] for tdep in usesTables: print(f"insert into proc_deps values('{pname}', '{tdep}');") for vdep in src.get("usesViews", []): print(f"insert into proc_view_deps values('{pname}', '{vdep}');") # This walks the various JSON chunks and emits them into the equivalent table: # * first we walk the tables, this populates: # * one row in the tables table (t_name, region) # * one row per column in the columns table (t_name, c_name, c_type, c_notnull) # * one row per primary key column in the pks table (t_name, pkcol) # * we enumerate the FKs, giving each a name like fk1, fk2, etc. (fk{ifk}) # * emit one row per FK per column (fk{ifk}, t_name, reftable, fkcol, fkref) # * next walk the regions # * emit one row per region in the region table # * emit one row per dependency to region_deps table (r_name, rparent) # * we use emit_tabledep for each chunk of procedures that has dependencies # * this is "queries", "inserts", "updates", "deletes", "general", and "generalInserts" # * see the CQL JSON docs for the meaning of each of these sections # * these all have the "dependencies" block in their JSON def emit_sql(data): for tup in enumerate(data["tables"]): t = tup[1] t_name = t["name"] region = t.get("region", "None") deleted = 1 if t["isDeleted"] else 0 recreated = 1 if t["isRecreated"] else 0 createVersion = t.get("addedVersion", 0) deleteVersion = t.get("deletedVersion", -1) groupName = t.get("recreateGroupName", "") print( f"insert into tables values('{t_name}', '{region}', {deleted}, {createVersion}, {deleteVersion}, {recreated}, '{groupName}');" ) for ctup in enumerate(t["columns"]): c = ctup[1] c_name = c["name"] c_type = c["type"] c_kind = c.get("kind", "") c_notnull = c["isNotNull"] print( f"insert into columns values ('{t_name}', '{c_name}', '{c_type}', '{c_kind}', {c_notnull});" ) for pkcol in t["primaryKey"]: print(f"insert into pks values('{t_name}', '{pkcol}');") ifk = 0 for fktup in enumerate(t["foreignKeys"]): ifk = ifk + 1 fk = fktup[1] fkcols = fk["columns"] fkrefs = fk["referenceColumns"] reftable = fk["referenceTable"] ccols = len(fkcols) for icol in range(0, ccols): fkcol = fkcols[icol] fkref = fkrefs[icol] print( f"insert into fks values('fk{ifk}', '{t_name}', '{reftable}', '{fkcol}', '{fkref}');" ) for tup in enumerate(data["regions"]): r = tup[1] r_name = r["name"] print(f"insert into regions values('{r_name}');") for rdep in enumerate(r["using"]): rparent = rdep[1] print(f"insert into region_deps values('{r_name}', '{rparent}');") emit_tabledep(data["queries"]) emit_tabledep(data["deletes"]) emit_tabledep(data["inserts"]) emit_tabledep(data["generalInserts"]) emit_tabledep(data["updates"]) emit_tabledep(data["general"]) for tup in enumerate(data["views"]): v = tup[1] v_name = v["name"] region = v.get("region", "None") deleted = 1 if v["isDeleted"] else 0 createVersion = v.get("addedVersion", 0) deleteVersion = v.get("deletedVersion", -1) print( f"insert into views values('{v_name}', '{region}', {deleted}, {createVersion}, {deleteVersion});" ) def get_fks(targets, tables, data, arg): # skips deleted tables if arg not in tables: return # skips tables already visited if arg in targets: return t = tables[arg] targets[arg] = 1 for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] get_fks(targets, tables, data, reftable) def get_refs(targets, tables, data, refmap, arg): # skips deleted tables if arg not in tables: return # skips tables already visited if arg in targets: return targets[arg] = 1 if arg in refmap: for srctable in refmap[arg]: get_refs(targets, tables, data, refmap, srctable) def get_graph(targets, tables, data, refmap, arg): # skips deleted tables if arg not in tables: return # skips tables already visited if arg in targets: return t = tables[arg] targets[arg] = 1 if arg in refmap: for srctable in refmap[arg]: get_graph(targets, tables, data, refmap, srctable) for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] get_graph(targets, tables, data, refmap, reftable) def compute_refmap(data): refmap = {} for tup in enumerate(data["tables"]): t = tup[1] t_name = t["name"] for fktup in enumerate(t["foreignKeys"]): fk = fktup[1] reftable = fk["referenceTable"] # now make the reverse index if reftable not in refmap: refmap[reftable] = {} refmap[reftable][t_name] = 1 return refmap def get_targets(tables, data, refmap, arg): targets = {} if arg.endswith("+fks"): arg = arg[0:-4] if arg not in tables: print(f"'{arg}' not a valid table") exit(1) get_fks(targets, tables, data, arg) elif arg.endswith("+refs"): arg = arg[0:-5] if arg not in tables: print(f"'{arg}' not a valid table") exit(1) get_refs(targets, tables, data, refmap, arg) elif arg.endswith("+graph"): arg = arg[0:-6] if arg not in tables: print(f"'{arg}' not a valid table") exit(1) get_graph(targets, tables, data, refmap, arg) elif arg in tables: targets[arg] = 1 else: print(f"'{arg}' not a valid table") exit(1) return targets def get_universe(tables, data): universe = {} # if no args, the default universe is "everything" if len(sys.argv) == 3: for t in tables: universe[t] = 1 else: refmap = compute_refmap(data) for i in range(3, len(sys.argv)): arg = sys.argv[i] subtract = False if arg[0] == "-": subtract = True arg = arg[1:] targets = get_targets(tables, data, refmap, arg) if not subtract: for t_name in targets: universe[t_name] = 1 else: for t_name in targets: if t_name in universe: del universe[t_name] return universe def dispatch_option(): jfile = sys.argv[2] with open(jfile) as json_file: data = json.load(json_file) tables = {} for tup in enumerate(data["tables"]): t = tup[1] if not t["isDeleted"]: t_name = t["name"] tables[t_name] = t universe = get_universe(tables, data) if sys.argv[1] == "--table_diagram": emit_table_diagram(data, universe, tables) elif sys.argv[1] == "--region_diagram": emit_region_diagram(data) elif sys.argv[1] == "--erd": emit_erd(data, universe, tables) elif sys.argv[1] == "--sql": emit_schema() emit_sql(data) else: usage() def main(): # here we are just going to decode the arguments if len(sys.argv) < 3: usage() else: dispatch_option() if __name__ == "__main__": main()