in train/compute/python/lib/pytorch/build_executor.py [0:0]
def _run_ncu(self):
    """Profile the queued input configs under NVIDIA Nsight Compute (ncu).

    Serializes the op/run configuration to JSON, hands it to the
    ``run_batch`` benchmark module through a shared-memory segment, and
    runs that module as a child process under ``ncu``, streaming the
    combined stdout/stderr to this process's stdout. Finally writes a
    JSON summary record (ncu log file, command line, config, and the
    start/end run-id range) to ``self.out_stream``.

    Side effects: creates and removes a shared-memory segment, spawns a
    subprocess, and produces ``ncu_log_file`` on disk (via ncu).
    """
    # Default ncu binary location; overridable via the NCU_BIN env var.
    NCU_BIN = "/usr/local/NVIDIA-Nsight-Compute-2021.2/ncu"
    ncu_bin = os.getenv("NCU_BIN")
    if not ncu_bin:
        ncu_bin = NCU_BIN

    # Only profile kernels inside the param_bench "measure" NVTX range.
    param_bench_range = "param_bench@measure"
    start_input_id = self.input_config_queue[0]["id"]
    out_file_prefix = self.run_options["out_file_prefix"]
    timestamp = int(datetime.timestamp(datetime.now()))
    ncu_log_file = (
        f"{out_file_prefix}_{os.getpid()}_{timestamp}_ncu.log"
    )
    # ":" is not filesystem-safe everywhere (e.g. may appear in the prefix).
    ncu_log_file = ncu_log_file.replace(":", "-")
    ncu_extra_args = self.run_options["ncu_args"]
    ncu_options = (
        f"--log-file {ncu_log_file} --csv --app-replay-buffer file --nvtx "
        f"--nvtx-include {param_bench_range} --target-processes all"
    )
    if ncu_extra_args:
        ncu_options += f" {ncu_extra_args}"

    # Build the op_info / run_options payload consumed by run_batch.
    op_info = create_op_info()
    op_info["build_iterator"] = self.op_config.info.get("build_iterator", None)
    op_info["input_iterator"] = self.op_config.info.get("input_iterator", None)
    op_info["build_data_generator"] = self.op_config.info.get(
        "build_data_generator", None
    )
    op_info["input_data_generator"] = self.op_config.info.get(
        "input_data_generator", None
    )
    op_info["config"][0]["build"] = self.build_input_config["build"]
    op_info["config"][0]["input"] = self.input_config_queue
    run_options = get_benchmark_options()
    run_options["device"] = self.run_options["device"]
    run_options["pass_type"] = self.run_options["pass_type"].value
    # ncu replays/profiles kernels itself; one warmup + one iteration suffices.
    run_options["warmup"] = 1
    run_options["iteration"] = 1
    config = {
        "op_name": self.op_config.name,
        "config_build_id": self.config_build_id,
        "op_info": op_info,
        "run_options": run_options,
    }
    config_str = json.dumps(config)

    # Encode once and size the segment in BYTES: for non-ASCII content the
    # UTF-8 byte length exceeds len(config_str), and sizing by character
    # count would make the buffer assignment below raise ValueError.
    config_bytes = config_str.encode("utf-8")

    # NOTE: Python shared_memory has a resource-tracker bug
    # (https://bugs.python.org/issue39959, fixed by
    # https://github.com/python/cpython/pull/20136). Workaround for OTHER
    # processes attaching to this segment:
    # unregister(shm._name, "shared_memory") from resource_tracker.
    shm = shared_memory.SharedMemory(create=True, size=len(config_bytes))
    try:
        # The OS may round the segment up (e.g. to a page size), so copy
        # into a slice of exactly the payload length.
        shm.buf[: len(config_bytes)] = config_bytes
        logger.debug(f"shared memory buffer: {shm.name}")
        benchmark_cmd = f"python -m param_bench.train.compute.python.pytorch.run_batch -s {shm.name}"
        if logger.getEffectiveLevel() == logging.DEBUG:
            benchmark_cmd += " -v"
        cmd = [ncu_bin] + ncu_options.split(" ") + benchmark_cmd.split(" ")
        cmd_str = " ".join(cmd)
        logger.info(f"running: {cmd_str}")
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
        ) as proc:
            # Stream ncu/benchmark output line-by-line as it arrives.
            for line in proc.stdout:
                if line.strip():
                    print(line, end="")
    finally:
        # Always release the shared memory, even if ncu launch/run fails;
        # otherwise the segment (and its tracker entry) would leak.
        shm.close()
        shm.unlink()

    end_input_id = self.input_config_queue[-1]["id"]
    print(
        json.dumps(
            {
                "ncu_file": ncu_log_file,
                "ncu_cmd_str": cmd_str,
                "config": config,
                "start_run_id": f"{self.config_build_id}:{start_input_id}",
                "end_run_id": f"{self.config_build_id}:{end_input_id}",
            }
        ),
        file=self.out_stream,
    )
    logger.info(f"ncu result: {ncu_log_file}")