void EventProfilerController::profilerLoop()

in libkineto/src/EventProfilerController.cpp [273:421]


// Run loop of the event profiler for the device handled by profiler_.
//
// Returns without entering the loop when enableForDevice() rejects the
// current config or when the profiler cannot be switched to continuous mode.
// Otherwise each iteration:
//   1. sends a heartbeat and picks up base / on-demand config changes,
//   2. reconfigures the profiler if anything changed,
//   3. sleeps until the next sample time and collects a sample,
//   4. emits regular / on-demand reports whose period has elapsed,
//   5. switches to the next counter set when multiplexing is due.
// Whenever a step finishes after the next sample time has already passed,
// the lateness is reported and the whole schedule is restarted.
//
// The loop ends when stopRunloop_ becomes true or when
// profiler_->configure() throws a std::exception.
void EventProfilerController::profilerLoop() {
  // We limit the number of profilers that can exist per GPU
  auto config = configLoader_.getConfigCopy();
  if (!enableForDevice(*config)) {
    VLOG(0) << "Not starting EventProfiler - profilers for GPU "
            << profiler_->device() << " exceeds profilers per GPU limit ("
            << config->maxEventProfilersPerGpu() << ")";
    return;
  }

  if (!profiler_->setContinuousMode()) {
    VLOG(0) << "Continuous mode not supported for GPU "
            << profiler_->device() << ". Not starting Event Profiler.";
    return;
  }

  VLOG(0) << "Starting Event Profiler for GPU " << profiler_->device();
  setThreadName("CUPTI Event Profiler");

  // Deadlines for the periodic activities. They are (re)initialized in the
  // "restart" section below before being compared against the current time.
  time_point<system_clock> next_sample_time;
  time_point<system_clock> next_report_time;
  time_point<system_clock> next_on_demand_report_time;
  time_point<system_clock> next_multiplex_time;
  // Currently applied on-demand config, or null when none is in effect.
  std::unique_ptr<Config> on_demand_config = nullptr;
  // reconfigure: the profiler must be configured again before sampling.
  // restart: deadlines must be recomputed and a throw-away sample collected.
  // Both start out true so the first iteration performs the initial setup.
  bool reconfigure = true;
  bool restart = true;
  // Counters used only to number the reports in the log output.
  int report_count = 0;
  int on_demand_report_count = 0;
  while (!stopRunloop_) {
    heartbeatMonitor_.profilerHeartbeat();
    if (configLoader_.hasNewConfig(*config)) {
      config = configLoader_.getConfigCopy();
      VLOG(0) << "Base config changed";
      report_count = 0;
      reconfigure = true;
    }

    auto now = system_clock::now();
    // The on-demand window is over once we are past start time + duration;
    // drop the on-demand config and reconfigure without it.
    if (on_demand_config &&
        now > (on_demand_config->eventProfilerOnDemandStartTime() +
               on_demand_config->eventProfilerOnDemandDuration())) {
      on_demand_config = nullptr;
      LOG(INFO) << "On-demand profiling complete";
      reconfigure = true;
    }

    // A pending on-demand config is only taken over while the profiler
    // reports no active on-demand session. newOnDemandConfig_ is accessed
    // under mutex_.
    if (!profiler_->isOnDemandActive()) {
      std::lock_guard<std::mutex> lock(mutex_);
      if (newOnDemandConfig_) {
        VLOG(0) << "Received on-demand config, reconfiguring";
        on_demand_config = std::move(newOnDemandConfig_);
        reconfigure = true;
        on_demand_report_count = 0;
      }
    }

    if (reconfigure) {
      try {
        profiler_->configure(*config, on_demand_config.get());
      } catch (const std::exception& ex) {
        LOG(ERROR) << "Encountered error while configuring event profiler: "
            << ex.what();
        // Exit profiling entirely when encountering an error here
        // as it indicates a serious problem or bug.
        break;
      }
      configureHeartbeatMonitor(
          heartbeatMonitor_, *config, on_demand_config.get());
      reconfigure = false;
      // A new configuration always restarts the sampling schedule.
      restart = true;
    }

    if (restart) {
      // Re-read the clock so that all deadlines are based on the same,
      // current point in time.
      now = system_clock::now();
      next_sample_time = now + profiler_->samplePeriod();
      next_report_time = now + profiler_->reportPeriod();
      if (profiler_->isOnDemandActive()) {
        next_on_demand_report_time = now + profiler_->onDemandReportPeriod();
      }
      next_multiplex_time = now + profiler_->multiplexPeriod();
      // Collect an initial sample and throw it away
      // The next sample is the first valid one
      profiler_->collectSample();
      profiler_->clearSamples();
      restart = false;
    }

    // Sleep until the sample deadline. The loop re-checks the clock after
    // every sleep in case the thread was woken up before the deadline.
    auto start_sleep = now;
    while (now < next_sample_time) {
      /* sleep override */
      std::this_thread::sleep_for(next_sample_time - now);
      now = system_clock::now();
    }
    int sleep_time = duration_cast<milliseconds>(now - start_sleep).count();

    auto start_sample = now;
    profiler_->collectSample();
    now = system_clock::now();
    int sample_time = duration_cast<milliseconds>(now - start_sample).count();

    // Advance the deadline by one period. If we are already past the new
    // deadline, sleeping plus sampling overran a full sample period: report
    // it and restart the schedule instead of reporting.
    next_sample_time += profiler_->samplePeriod();
    if (now > next_sample_time) {
      reportLateSample(sleep_time, sample_time, 0, 0);
      restart = true;
      continue;
    }

    auto start_report = now;
    if (now > next_report_time) {
      // Increment in a separate statement: a counter bumped inside the
      // streamed expression only advances when that expression is evaluated,
      // which would tie the report numbering to the logging verbosity.
      VLOG(1) << "Report #" << report_count;
      ++report_count;
      profiler_->reportSamples();
      next_report_time += profiler_->reportPeriod();
    }
    if (profiler_->isOnDemandActive() && now > next_on_demand_report_time) {
      // Same reasoning as for report_count above.
      VLOG(1) << "OnDemand Report #" << on_demand_report_count;
      ++on_demand_report_count;
      profiler_->reportOnDemandSamples();
      next_on_demand_report_time += profiler_->onDemandReportPeriod();
    }
    profiler_->eraseReportedSamples();
    now = system_clock::now();
    int report_time = duration_cast<milliseconds>(now - start_report).count();

    // Reporting ran past the next sample deadline: report and restart.
    if (now > next_sample_time) {
      reportLateSample(sleep_time, sample_time, report_time, 0);
      restart = true;
      continue;
    }

    auto start_multiplex = now;
    if (profiler_->multiplexEnabled() && now > next_multiplex_time) {
      profiler_->enableNextCounterSet();
      next_multiplex_time += profiler_->multiplexPeriod();
    }
    now = system_clock::now();
    int multiplex_time =
        duration_cast<milliseconds>(now - start_multiplex).count();

    // Switching counter sets ran past the next sample deadline. No continue
    // is needed here since this is the end of the iteration; the execution
    // time is still logged below.
    if (now > next_sample_time) {
      reportLateSample(sleep_time, sample_time, report_time, multiplex_time);
      restart = true;
    }

    // Time spent from the start of sample collection to the end of the
    // iteration (sleeping excluded).
    VLOG(0) << "Runloop execution time: "
            << duration_cast<milliseconds>(now - start_sample).count() << "ms";
  }

  VLOG(0) << "Device " << profiler_->device()
          << ": Exited event profiling loop";
}