in libkineto/src/RoctracerActivityApi.cpp [61:255]
// Converts everything collected so far into GenericTraceActivity objects and
// hands each one to `logger`.
//
// Five sources are drained, in this order:
//   1. rows_        - basic HIP API calls
//   2. mallocRows_  - malloc/free calls ("ptr", plus "size" for hipMalloc)
//   3. copyRows_    - memcpy calls ("src", "dst", "size", "kind", and "stream"
//                     for the async / with-stream variants)
//   4. kernelRows_  - kernel launch calls (kernel name, grid/block dims, shared
//                     size, stream); each launch is also stashed in
//                     kernelLaunches_ and kernelNames_ so the matching async
//                     record can be annotated later
//   5. gpuTraceBuffers_ - roctracer records: HIP API records whose op is in
//                     loggedIds_, and all HCC ops records. Any other record is
//                     skipped.
//
// Side effects: updates kernelLaunches_, kernelNames_, reverseStrings_,
// strings_ and nextStringId_.
//
// Returns the number of activities passed to logger.handleGenericActivity().
int RoctracerActivityApi::processActivities(
    ActivityLogger& logger) {
  // Find offset to map from monotonic clock to system clock.
  // This will break time-ordering of events but is status quo.
  // The realtime clock is sampled on both sides of the monotonic sample and the
  // two readings are averaged (each halved before adding, which also avoids
  // overflowing the sum).
  timespec t0, t1, t00;
  clock_gettime(CLOCK_REALTIME, &t0);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  clock_gettime(CLOCK_REALTIME, &t00);
  const timestamp_t toffset = (timespec_to_ns(t0) >> 1) + (timespec_to_ns(t00) >> 1) - timespec_to_ns(t1);

  // Shift a tracer timestamp by the clock offset and scale it down by 1000
  // (nanoseconds to microseconds, given the *_ns naming of the inputs).
  const auto toTraceTime = [toffset](auto ns) { return (ns + toffset) / 1000; };

  // Builds the part of a runtime (CPU side) activity that is common to every
  // kind of API row: timing, ids, name and the start of a CPU->GPU flow.
  const auto makeApiActivity = [&toTraceTime](const auto& item) {
    GenericTraceActivity a;
    a.startTime = toTraceTime(item.begin);
    a.endTime = toTraceTime(item.end);
    a.id = item.id;
    a.device = item.pid;
    a.resource = item.tid;
    a.activityType = ActivityType::CUDA_RUNTIME;
    a.activityName = std::string(roctracer_op_string(ACTIVITY_DOMAIN_HIP_API, item.cid, 0));
    a.flow.id = item.id;
    a.flow.type = kLinkAsyncCpuGpu;
    a.flow.start = true;
    return a;
  };

  int count = 0;

  // Basic Api calls
  for (const auto& item : rows_) {
    GenericTraceActivity a = makeApiActivity(item);
    logger.handleGenericActivity(a);
    ++count;
  }

  // Malloc/Free calls
  for (const auto& item : mallocRows_) {
    GenericTraceActivity a = makeApiActivity(item);
    a.addMetadata("ptr", item.ptr);
    // Only hipMalloc rows get a "size" entry.
    if (item.cid == HIP_API_ID_hipMalloc) {
      a.addMetadata("size", item.size);
    }
    logger.handleGenericActivity(a);
    ++count;
  }

  // HipMemcpy calls
  for (const auto& item : copyRows_) {
    GenericTraceActivity a = makeApiActivity(item);
    a.addMetadata("src", item.src);
    a.addMetadata("dst", item.dst);
    a.addMetadata("size", item.size);
    a.addMetadata("kind", item.kind);
    // Only the async / with-stream variants get a "stream" entry.
    if ((item.cid == HIP_API_ID_hipMemcpyAsync) || (item.cid == HIP_API_ID_hipMemcpyWithStream)) {
      a.addMetadata("stream", fmt::format("{}", reinterpret_cast<void*>(item.stream)));
    }
    logger.handleGenericActivity(a);
    ++count;
  }

  // Kernel Launch Api calls
  for (const auto& item : kernelRows_) {
    GenericTraceActivity a = makeApiActivity(item);

    // Resolve and demangle the kernel name once; it is used both as metadata
    // on the launch and as the display name of the matching async op.
    std::string kernelName;
    bool hasKernelRef = false;
    if (item.functionAddr != nullptr) {
      kernelName = demangle(hipKernelNameRefByPtr(item.functionAddr, item.stream));
      hasKernelRef = true;
    }
    else if (item.function != nullptr) {
      kernelName = demangle(hipKernelNameRef(item.function));
      hasKernelRef = true;
    }

    if (hasKernelRef) {
      a.addMetadataQuoted("kernel", kernelName);
    }
    a.addMetadata("grid dim", fmt::format("[{}, {}, {}]", item.gridX, item.gridY, item.gridZ));
    a.addMetadata("block dim", fmt::format("[{}, {}, {}]", item.workgroupX, item.workgroupY, item.workgroupZ));
    a.addMetadata("shared size", item.groupSegmentSize);
    a.addMetadata("stream", fmt::format("{}", reinterpret_cast<void*>(item.stream)));

    // Stash launches to tie to the async ops
    kernelLaunches_[a.id] = a;

    // Stash kernel names to tie to the async ops.
    // Names are interned: id 0 means "not seen yet" (operator[] default-inserts 0).
    if (!kernelName.empty()) {
      auto& stringId = reverseStrings_[kernelName];
      if (stringId == 0) {
        stringId = nextStringId_++;
        strings_[stringId] = kernelName;
      }
      kernelNames_[item.id] = stringId;
    }

    logger.handleGenericActivity(a);
    ++count;
  }

  // Async Ops
  for (auto& buffer : *gpuTraceBuffers_) {
    const roctracer_record_t* record =
        reinterpret_cast<const roctracer_record_t*>(buffer.data);
    const roctracer_record_t* const endRecord =
        reinterpret_cast<const roctracer_record_t*>(buffer.data + buffer.validSize);

    while (record < endRecord) {
      if ((record->domain == ACTIVITY_DOMAIN_HIP_API) && (loggedIds_.contains(record->op))) {
        // A fresh activity per record, so metadata and flow state from an
        // earlier record can never leak into this one.
        GenericTraceActivity a;
        a.device = record->process_id;
        a.resource = record->thread_id;
        a.startTime = toTraceTime(record->begin_ns);
        a.endTime = toTraceTime(record->end_ns);
        a.id = record->correlation_id;
        a.activityType = ActivityType::CUDA_RUNTIME;
        a.activityName = std::string(roctracer_op_string(record->domain, record->op, record->kind));
        // The correlation id links this API call to its GPU op.
        a.flow.id = record->correlation_id;
        a.flow.type = kLinkAsyncCpuGpu;
        a.flow.start = true;
        logger.handleGenericActivity(a);
        ++count;
      }
      else if (record->domain == ACTIVITY_DOMAIN_HCC_OPS) {
        GenericTraceActivity a;
        // Overlay launch metadata for kernels
        const auto launch = kernelLaunches_.find(record->correlation_id);
        if (launch != kernelLaunches_.end()) {
          a = launch->second;
        }
        a.device = record->device_id;
        a.resource = record->queue_id;
        a.startTime = toTraceTime(record->begin_ns);
        a.endTime = toTraceTime(record->end_ns);
        a.id = record->correlation_id;
        a.activityType = ActivityType::CONCURRENT_KERNEL;
        a.activityName = std::string(roctracer_op_string(record->domain, record->op, record->kind));
        a.flow.id = record->correlation_id;
        a.flow.type = kLinkAsyncCpuGpu;
        // The GPU op is the receiving end of the flow. This must be reset
        // explicitly because the stashed launch copied above has start == true.
        a.flow.start = false;
        // Prefer the demangled kernel name recorded at launch time, if any.
        const auto kernelName = kernelNames_.find(record->correlation_id);
        if (kernelName != kernelNames_.end()) {
          a.activityName = strings_[kernelName->second];
        }
        logger.handleGenericActivity(a);
        ++count;
      }
      roctracer_next_record(record, &record);
    }
  }
  return count;
}