int RoctracerActivityApi::processActivities()

in libkineto/src/RoctracerActivityApi.cpp [61:255]


// Converts everything captured so far into GenericTraceActivity objects and
// passes each one to `logger`.
//
// Activities are emitted in this order: plain HIP API calls (rows_),
// malloc/free calls (mallocRows_), memcpy calls (copyRows_), kernel launch
// calls (kernelRows_), and finally the records stored in gpuTraceBuffers_.
//
// While walking kernelRows_ this also fills kernelLaunches_, kernelNames_,
// strings_ and reverseStrings_, which are then used to decorate the
// ACTIVITY_DOMAIN_HCC_OPS records that share the launch's correlation id.
//
// Returns the number of activities handed to `logger`.
int RoctracerActivityApi::processActivities(
    ActivityLogger& logger) {
  // Find offset to map from monotonic clock to system clock.
  // This will break time-ordering of events but is status quo.
  //
  // The realtime clock is sampled on both sides of the monotonic read and the
  // two samples are averaged, so the offset is centred on the monotonic read.
  timespec t0, t1, t00;
  clock_gettime(CLOCK_REALTIME, &t0);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  clock_gettime(CLOCK_REALTIME, &t00);

  const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1);

  int count = 0;

  // Builds the part of a CPU-side HIP API activity that is common to every
  // row type: shifted timestamps (divided by 1000, i.e. ns -> us going by the
  // *_ns naming used for these values), ids, the op name, and the start of
  // the CPU -> GPU flow link keyed by the row id.
  const auto makeRuntimeActivity = [toffset](const auto& item) {
    GenericTraceActivity a;
    a.startTime = (item.begin + toffset) / 1000;
    a.endTime = (item.end + toffset) / 1000;
    a.id = item.id;
    a.device = item.pid;
    a.resource = item.tid;
    a.activityType = ActivityType::CUDA_RUNTIME;
    a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0));
    a.flow.id = item.id;
    a.flow.type = kLinkAsyncCpuGpu;
    a.flow.start = true;
    return a;
  };

  // Basic Api calls
  for (auto& item : rows_) {
    GenericTraceActivity a = makeRuntimeActivity(item);

    logger.handleGenericActivity(a);
    ++count;
  }

  // Malloc/Free calls
  for (auto& item : mallocRows_) {
    GenericTraceActivity a = makeRuntimeActivity(item);

    a.addMetadata("ptr", item.ptr);
    // Only hipMalloc rows get a "size" entry.
    if (item.cid == HIP_API_ID_hipMalloc) {
      a.addMetadata("size", item.size);
    }

    logger.handleGenericActivity(a);
    ++count;
  }

  // HipMemcpy calls
  for (auto& item : copyRows_) {
    GenericTraceActivity a = makeRuntimeActivity(item);

    a.addMetadata("src", item.src);
    a.addMetadata("dst", item.dst);
    a.addMetadata("size", item.size);
    a.addMetadata("kind", item.kind);
    // Only the stream-taking variants get a "stream" entry.
    if ((item.cid == HIP_API_ID_hipMemcpyAsync) || (item.cid == HIP_API_ID_hipMemcpyWithStream)) {
      a.addMetadata("stream", fmt::format("{}", reinterpret_cast<void*>(item.stream)));
    }

    logger.handleGenericActivity(a);
    ++count;
  }

  // Kernel Launch Api calls
  for (auto& item : kernelRows_) {
    GenericTraceActivity a = makeRuntimeActivity(item);

    // Resolve the kernel name once. It is used as launch metadata here and,
    // via kernelNames_, as the display name of the matching async op below.
    // The function address takes precedence over the function handle.
    const bool hasKernel = (item.functionAddr != nullptr) || (item.function != nullptr);
    std::string name;
    if (item.functionAddr != nullptr) {
      name = demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream));
    }
    else if (item.function != nullptr) {
      name = demangle(hipKernelNameRef(item.function));
    }

    if (hasKernel) {
      a.addMetadataQuoted("kernel", name);
    }
    a.addMetadata("grid dim", fmt::format("[{}, {}, {}]", item.gridX, item.gridY, item.gridZ));
    a.addMetadata("block dim", fmt::format("[{}, {}, {}]", item.workgroupX, item.workgroupY, item.workgroupZ));
    a.addMetadata("shared size", item.groupSegmentSize);
    a.addMetadata("stream", fmt::format("{}", reinterpret_cast<void*>(item.stream)));

    // Stash launches to tie to the async ops
    kernelLaunches_[a.id] = a;

    // Stash kernel names to tie to the async ops. Names are interned:
    // id 0 means "not seen yet", so a new id is allocated on first use.
    if (!name.empty()) {
      auto& string_id = reverseStrings_[name];
      if (string_id == 0) {
        string_id = nextStringId_++;
        strings_[string_id] = name;
      }
      kernelNames_[item.id] = string_id;
    }

    logger.handleGenericActivity(a);
    ++count;
  }

  // Async Ops
  for (auto& buffer : *gpuTraceBuffers_) {
    const auto* record = reinterpret_cast<const roctracer_record_t*>(buffer.data);
    const auto* end_record = reinterpret_cast<const roctracer_record_t*>(buffer.data + buffer.validSize);

    while (record < end_record) {
      if ((record->domain == ACTIVITY_DOMAIN_HIP_API) && (loggedIds_.contains(record->op))) {
        // A fresh activity per record, so nothing carries over from the
        // previous record in this buffer.
        GenericTraceActivity a;
        const char *name = roctracer_op_string(record->domain, record->op, record->kind);
        a.device = record->process_id;
        a.resource = record->thread_id;

        a.startTime = (record->begin_ns + toffset) / 1000;
        a.endTime = (record->end_ns + toffset) / 1000;
        a.id = record->correlation_id;

        a.activityType = ActivityType::CUDA_RUNTIME;
        a.activityName = std::string(name);
        a.flow.id = record->correlation_id;
        a.flow.type = kLinkAsyncCpuGpu;
        a.flow.start = true;

        logger.handleGenericActivity(a);
        ++count;
      }
      else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
        GenericTraceActivity a;

        // Overlay launch metadata for kernels
        auto kit = kernelLaunches_.find(record->correlation_id);
        if (kit != kernelLaunches_.end()) {
          a = kit->second;
        }

        const char *name = roctracer_op_string(record->domain, record->op, record->kind);
        a.device = record->device_id;
        a.resource = record->queue_id;

        a.startTime = (record->begin_ns + toffset) / 1000;
        a.endTime = (record->end_ns + toffset) / 1000;
        a.id = record->correlation_id;

        a.activityType = ActivityType::CONCURRENT_KERNEL;
        a.activityName = std::string(name);
        a.flow.id = record->correlation_id;
        a.flow.type = kLinkAsyncCpuGpu;
        // The stashed launch was marked as the flow start; this op is the
        // other end of that link, so the flag must be cleared explicitly.
        a.flow.start = false;

        // Prefer the demangled kernel name recorded at launch time.
        auto it = kernelNames_.find(record->correlation_id);
        if (it != kernelNames_.end()) {
          a.activityName = strings_[it->second];
        }

        logger.handleGenericActivity(a);
        ++count;
      }

      roctracer_next_record(record, &record);
    }
  }
  return count;
}