def _run_ncu()

in train/compute/python/lib/pytorch/build_executor.py [0:0]


    def _run_ncu(self):
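        # Default path to the Nsight Compute CLI; can be overridden via the
        # NCU_BIN environment variable.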
        NCU_BIN = "/usr/local/NVIDIA-Nsight-Compute-2021.2/ncu"
        ncu_bin = os.getenv("NCU_BIN")
        if not ncu_bin:
            ncu_bin = NCU_BIN

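        # Restrict profiling to the param_bench@measure NVTX range and write
        # CSV results to a per-process, timestamped log file.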
        param_bench_range = "param_bench@measure"
        start_input_id = self.input_config_queue[0]["id"]
        out_file_prefix = self.run_options["out_file_prefix"]
        timestamp = int(datetime.timestamp(datetime.now()))
        ncu_log_file = f"{out_file_prefix}_{os.getpid()}_{timestamp}_ncu.log"
        ncu_log_file = ncu_log_file.replace(":", "-")
        ncu_extra_args = self.run_options["ncu_args"]
        ncu_options = (
            f"--log-file {ncu_log_file} --csv --app-replay-buffer file --nvtx "
            f"--nvtx-include {param_bench_range} --target-processes all"
        )
        if ncu_extra_args:
            ncu_options += f" {ncu_extra_args}"

        # Describe the operator's iterators and data generators so the child
        # benchmark process can rebuild the op.
        op_info = create_op_info()
        op_info["build_iterator"] = self.op_config.info.get("build_iterator")
        op_info["input_iterator"] = self.op_config.info.get("input_iterator")
        op_info["build_data_generator"] = self.op_config.info.get(
            "build_data_generator"
        )
        op_info["input_data_generator"] = self.op_config.info.get(
            "input_data_generator"
        )

        op_info["config"][0]["build"] = self.build_input_config["build"]
        op_info["config"][0]["input"] = self.input_config_queue
        run_options = get_benchmark_options()
        run_options["device"] = self.run_options["device"]
        run_options["pass_type"] = self.run_options["pass_type"].value
        run_options["warmup"] = 1
        run_options["iteration"] = 1
        config = {
            "op_name": self.op_config.name,
            "config_build_id": self.config_build_id,
            "op_info": op_info,
            "run_options": run_options,
        }
        config_str = json.dumps(config)

        """
        BUG: Python shared memory bug workaround.
        Shared memory has a bug to proper track and release memory, see
        https://bugs.python.org/issue39959
        Fixed PR: https://github.com/python/cpython/pull/20136
        Workaround: unregister(shm._name, "shared_memory") from resource_tracker
        in other processes which access this shm.
        """
        config_bytes = config_str.encode("utf-8")
        # Size the segment by encoded byte length; some platforms round the
        # size up, so write only the bytes we actually have.
        shm = shared_memory.SharedMemory(create=True, size=len(config_bytes))
        shm.buf[: len(config_bytes)] = config_bytes
        logger.debug(f"shared memory buffer: {shm.name}")
        benchmark_cmd = f"python -m param_bench.train.compute.python.pytorch.run_batch -s {shm.name}"
        if logger.getEffectiveLevel() == logging.DEBUG:
            benchmark_cmd += " -v"
        cmd = [ncu_bin] + ncu_options.split(" ") + benchmark_cmd.split(" ")
        cmd_str = " ".join(cmd)
        logger.info(f"running: {cmd_str}")
        with subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=1,
            universal_newlines=True,
        ) as proc:
            for line in proc.stdout:
                if line.strip():
                    print(line, end="")
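        # The profiled run has finished; release and remove the shared memory segment.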
        shm.close()
        shm.unlink()
        end_input_id = self.input_config_queue[-1]["id"]
        print(
            json.dumps(
                {
                    "ncu_file": ncu_log_file,
                    "ncu_cmd_str": cmd_str,
                    "config": config,
                    "start_run_id": f"{self.config_build_id}:{start_input_id}",
                    "end_run_id": f"{self.config_build_id}:{end_input_id}",
                }
            ),
            file=self.out_stream,
        )
        logger.info(f"ncu result: {ncu_log_file}")