fx_profile.py (181 lines of code) (raw):
#!/usr/bin/env python
from collections import Counter, defaultdict
from functools import partial
from torch.cuda import synchronize
from typing import Any, Dict, Callable, Optional
import argparse
import gc
import logging
import os
import pandas as pd
import re
import time
import warnings
os.environ["FX_PATCH_GETITEM"] = "1" # make BERT fx.symbolic_trace
from torchbenchmark import list_models
from torch.fx import symbolic_trace, Node, GraphModule
from torch.fx.interpreter import Interpreter
import torch
# These do not fx.symbolic_trace()
SKIP = {"attention_is_all_you_need_pytorch", "demucs", "dlrm", "maml",
"yolov3", "tacotron2", "moco", "Super_SloMo"}
class ProfileStats(object):
@staticmethod
def _norm(cnt: Counter):
""" Normalize to unit length """
total = sum(cnt.values())
return Counter({k: v / total for k, v in cnt.items()})
def __init__(self, get_name: Optional[Callable]):
super(ProfileStats, self).__init__()
self.times: Dict[str, float] = Counter()
self.counts: Dict[str, int] = Counter()
self.get_name = get_name
def record(self, node: Node, sec: float):
""" Record timings of a single call """
name = self.get_name(node)
self.times[name] += sec
self.counts[name] += 1
def summary(self, n=5):
most_common = self._norm(self.times).most_common(n - 1)
return " ".join([f"{k}:{v:.0%}" for k, v in most_common] +
[f"other:{1.0 - sum(v for k, v in most_common):.0%}"])
class ProfileAggregate(ProfileStats):
def __init__(self, name: str):
super(ProfileAggregate, self).__init__(None)
self.df = pd.DataFrame()
self.name = name
def update(self, other: ProfileStats, name):
""" Merge stats from a finished benchmark run into this """
nt = self._norm(other.times).most_common(None)
self.times.update(nt)
self.counts.update(self._norm(other.counts))
self.df = self.df.append(pd.DataFrame(
[[t for n, t in nt]],
index=[name],
columns=[n for n, t in nt],
))
def save(self):
df = self.df.fillna(0.0).transpose()
df.insert(0, "AVERAGE", df.mean(axis=1))
df.sort_values("AVERAGE", ascending=False, inplace=True)
df.to_csv(f"{self.name}.csv")
print(f"wrote {self.name}.csv")
PROFILES = [
ProfileAggregate("operators"),
ProfileAggregate("successors1"),
ProfileAggregate("successors2"),
ProfileAggregate("predecessors1"),
ProfileAggregate("predecessors2"),
]
class FXProfiler(Interpreter):
def __init__(self, module: GraphModule):
super(FXProfiler, self).__init__(module)
self.profile_stats = [
ProfileStats(self.get_name),
ProfileStats(partial(self.succ_name, depth=2)),
ProfileStats(partial(self.succ_name, depth=3)),
ProfileStats(partial(self.pred_name, depth=2)),
ProfileStats(partial(self.pred_name, depth=3)),
]
self.successors = defaultdict(list)
self.predecessors = defaultdict(list)
for node in self.module.graph.nodes:
def visit(other_node):
self.successors[other_node].append(node)
self.predecessors[node].append(other_node)
torch.fx.map_arg((node.args, node.kwargs), visit)
def run_node(self, n: Node) -> Any:
""" Timing wrapper around executing an FX Node """
start = time.perf_counter()
result = super().run_node(n)
synchronize()
sec = time.perf_counter() - start
for prof in self.profile_stats:
prof.record(n, sec)
return result
_op_node_to_name = {
"call_function": lambda i, t: t.__name__,
"call_method": lambda i, t: t,
"call_module": lambda i, t: type(i.fetch_attr(t)).__name__,
"get_attr": lambda i, t: "get_attr",
"output": lambda i, t: "output",
"placeholder": lambda i, t: "placeholder",
}
def get_name(self, n: Node) -> Callable:
""" Coverts a Node to a string name """
return self._op_node_to_name[n.op](self, n.target).lower()
def pred_name(self, node: Node, depth: int) -> Callable:
""" A string name that includes names of predecessor nodes """
if depth <= 1:
return self.get_name(node)
pred_str = ','.join(self.pred_name(x, depth - 1) for x in self.predecessors[node])
return f"{self.get_name(node)}({pred_str})"
def succ_name(self, node: Node, depth: int) -> Callable:
""" A string name that includes names of successor nodes """
s = self.successors[node]
if depth <= 1 or len(s) == 0:
return self.get_name(node)
elif len(s) > 1:
succ_str = "MANY"
else:
succ_str = self.succ_name(s[0], depth - 1)
return f"{self.get_name(node)}->{succ_str}"
def profile(device, name, model, example_inputs, args):
model = torch.fx.symbolic_trace(model)
prof = FXProfiler(model)
for _ in range(args.warmup):
model(*example_inputs)
for _ in range(args.repeat):
synchronize()
prof.run(*example_inputs)
for aggregate, stats in zip(PROFILES, prof.profile_stats):
print(f"{device:4} {name:20} {aggregate.name:13} {stats.summary()}")
aggregate.update(stats, name=name)
return model
def short_name(name, limit=20):
""" Truncate a model name to limit chars"""
return name if len(name) <= limit else f"{name[:limit - 3].rstrip('_')}..."
def iter_models(args):
for benchmark_cls in list_models():
if (not re.search("|".join(args.filter), benchmark_cls.name, re.I) or
re.search("|".join(args.exclude), benchmark_cls.name, re.I) or
benchmark_cls.name in SKIP):
continue
try:
benchmark = benchmark_cls(test="eval", device=args.device, jit=False)
model, example_inputs = benchmark.get_module()
model.eval()
gc.collect()
yield short_name(benchmark.name), model, example_inputs
except NotImplementedError:
pass
def noop():
pass
def main(args=None):
parser = argparse.ArgumentParser()
parser.add_argument("--filter", "-k", action="append",
help="filter benchmarks")
parser.add_argument("--exclude", "-x", action="append",
help="filter benchmarks")
parser.add_argument("--device", "-d", help="cpu or cuda")
parser.add_argument("--warmup", type=int, default=1,
help="warmup runs to do")
parser.add_argument("--repeat", "-n", type=int, default=10,
help="number of timing runs")
parser.add_argument("--threads", "-p", type=int,
help="number threads")
parser.add_argument("--cpu-fusion", action="store_true",
help="enable can_fuse_on_cpu")
parser.add_argument("--no-skip", "-a", action="store_true",
help="run models that don't fx cleanly")
args = parser.parse_args(args)
# defaults
args.device = args.device or "cpu"
args.filter = args.filter or [r"."]
args.exclude = args.exclude or [r"^$"]
if args.device == "cpu":
global synchronize
synchronize = noop
if args.no_skip:
SKIP.clear()
if args.cpu_fusion:
torch._C._jit_override_can_fuse_on_cpu(True)
if args.threads:
torch.set_num_threads(args.threads)
for name, model, example_inputs in iter_models(args):
profile(args.device, name, model, example_inputs, args)
for prof in PROFILES:
prof.save()
if __name__ == '__main__':
logging.basicConfig(level=logging.WARNING)
warnings.filterwarnings("ignore")
main()