@@ -300,10 +300,27 @@ void CuptiActivityProfiler::logGpuVersions() {
300
300
#endif
301
301
}
302
302
303
+ namespace {
304
+
305
+ const std::unordered_set<std::string>& getLoggerMedataAllowList () {
306
+ static const std::unordered_set<std::string> kLoggerMedataAllowList {
307
+ " with_stack" , " with_modules" , " record_shapes" , " profile_memory" };
308
+ return kLoggerMedataAllowList ;
309
+ }
310
+
311
+ } // namespace
312
+
303
313
void CuptiActivityProfiler::processTraceInternal (ActivityLogger& logger) {
304
314
LOG (INFO) << " Processing " << traceBuffers_->cpu .size () << " CPU buffers" ;
305
315
VLOG (0 ) << " Profile time range: " << captureWindowStartTime_ << " - "
306
316
<< captureWindowEndTime_;
317
+
318
+ // Pass metadata within the trace to the logger observer.
319
+ for (const auto & pair : metadata_) {
320
+ if (getLoggerMedataAllowList ().count (pair.first ) > 0 ) {
321
+ LOGGER_OBSERVER_ADD_METADATA (pair.first , pair.second );
322
+ }
323
+ }
307
324
for (auto & pair : versionMetadata_) {
308
325
addMetadata (pair.first , pair.second );
309
326
}
@@ -319,6 +336,9 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
319
336
device_properties.push_back (props);
320
337
}
321
338
}
339
+ for (const auto & [key, value] : session->getMetadata ()) {
340
+ addMetadata (key, value);
341
+ }
322
342
}
323
343
logger.handleTraceStart (
324
344
metadata_, fmt::format (" {}" , fmt::join (device_properties, " ," )));
@@ -429,7 +449,7 @@ void CuptiActivityProfiler::processCpuTrace(
429
449
return ;
430
450
}
431
451
setCpuActivityPresent (true );
432
-
452
+ bool warn_once = false ;
433
453
CpuGpuSpanPair& span_pair =
434
454
recordTraceSpan (cpuTrace.span , cpuTrace.gpuOpCount );
435
455
TraceSpan& cpu_span = span_pair.first ;
@@ -442,18 +462,23 @@ void CuptiActivityProfiler::processCpuTrace(
442
462
const std::unique_ptr<GenericTraceActivity>>::value,
443
463
" handleActivity is unsafe and relies on the caller to maintain not "
444
464
" only lifetime but also address stability." );
445
- if (act->type () == ActivityType::USER_ANNOTATION &&
446
- act->duration () <= 0 ) {
465
+ if (act->duration () < 0 ) {
447
466
act->endTime = captureWindowEndTime_;
448
467
act->addMetadata (" finished" , " false" );
449
- } else {
450
- act->addMetadata (" finished" , " true" );
451
468
}
452
469
logger.handleActivity (*act);
453
470
}
454
471
clientActivityTraceMap_[act->correlationId ()] = &span_pair;
455
472
activityMap_[act->correlationId ()] = act.get ();
456
-
473
+ if (act->deviceId () == 0 ) {
474
+ if (!warn_once) {
475
+ LOG (WARNING)
476
+ << " CPU activity with pid 0 detected. This is likely due to the python stack"
477
+ " tracer not being able to determine the pid for an event. Overriding pid to main thread pid" ;
478
+ }
479
+ act->setDevice (processId ());
480
+ warn_once = true ;
481
+ }
457
482
recordThreadInfo (act->resourceId (), act->getThreadId (), act->deviceId ());
458
483
}
459
484
logger.handleTraceSpan (cpu_span);
@@ -564,8 +589,8 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) {
564
589
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
565
590
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
566
591
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 ||
567
- // Support cudaEventRecord and cudaEventSynchronize, revisit if others are
568
- // needed
592
+ // Support cudaEventRecord and cudaEventSynchronize, revisit if others
593
+ // are needed
569
594
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 ||
570
595
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 ||
571
596
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) {
@@ -1322,20 +1347,21 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
1322
1347
|| cupti_.stopCollection
1323
1348
#endif // HAS_CUPTI || HAS_ROCTRACER
1324
1349
) {
1325
- // Update runloop state first to prevent further updates to shared state
1350
+ // Update runloop state first to prevent further updates to shared
1351
+ // state
1326
1352
LOG (INFO) << " Tracing complete." ;
1327
1353
VLOG_IF (1 , currentIter >= 0 )
1328
1354
<< " This state change was invoked by application's step() call" ;
1329
1355
1330
1356
// currentIter >= 0 means this is called from the step() api of
1331
- // the profile in pytorch main thread, it should be executed in another
1332
- // thread in case pytorch main thread is blocked
1357
+ // the profile in pytorch main thread, it should be executed in
1358
+ // another thread in case pytorch main thread is blocked
1333
1359
if (currentIter >= 0 ) {
1334
1360
// if collectTraceThread_ is already running, there's no need to
1335
1361
// execute collectTrace twice.
1336
- // Do not call collectTrace when profilerThread_ is collecting Trace.
1337
- // Otherwise, libkineto::api().client()->stop will be called twice,
1338
- // which leads to an unrecoverable ::c10:Error at
1362
+ // Do not call collectTrace when profilerThread_ is collecting
1363
+ // Trace. Otherwise, libkineto::api().client()->stop will be called
1364
+ // twice, which leads to an unrecoverable ::c10:Error at
1339
1365
// disableProfiler
1340
1366
if (!collectTraceThread_ && !getCollectTraceState ()) {
1341
1367
std::lock_guard<std::recursive_mutex> guard (mutex_);
@@ -1404,13 +1430,33 @@ void CuptiActivityProfiler::finalizeTrace(
1404
1430
iterationCountMap_.clear ();
1405
1431
}
1406
1432
1433
+ // Thread & stream info
1434
+ for (const auto & pair : resourceInfo_) {
1435
+ const auto & resource = pair.second ;
1436
+ logger.handleResourceInfo (resource, captureWindowStartTime_);
1437
+ }
1438
+
1439
+ bool use_default_device_info = true ;
1440
+ for (auto & session : sessions_) {
1441
+ auto device_info = session->getDeviceInfo ();
1442
+ if (device_info != nullptr ) {
1443
+ use_default_device_info = false ;
1444
+ logger.handleDeviceInfo (*device_info, captureWindowStartTime_);
1445
+ }
1446
+
1447
+ auto resource_infos = session->getResourceInfos ();
1448
+ for (const auto & resource_info : resource_infos) {
1449
+ logger.handleResourceInfo (resource_info, captureWindowStartTime_);
1450
+ }
1451
+ }
1452
+
1407
1453
// Process names
1408
1454
int32_t pid = processId ();
1409
1455
string process_name = processName (pid);
1410
1456
if (!process_name.empty ()) {
1411
1457
logger.handleDeviceInfo (
1412
1458
{pid, pid, process_name, " CPU" }, captureWindowStartTime_);
1413
- if (!cpuOnly_) {
1459
+ if (!cpuOnly_ && use_default_device_info ) {
1414
1460
// Usually, GPU events use device id as pid (0-7).
1415
1461
// In some cases, CPU sockets are numbered starting from 0.
1416
1462
// In the worst case, 8 CPU sockets + 8 GPUs, so the max GPU ID is 15.
@@ -1428,24 +1474,6 @@ void CuptiActivityProfiler::finalizeTrace(
1428
1474
}
1429
1475
}
1430
1476
1431
- // Thread & stream info
1432
- for (auto pair : resourceInfo_) {
1433
- const auto & resource = pair.second ;
1434
- logger.handleResourceInfo (resource, captureWindowStartTime_);
1435
- }
1436
-
1437
- for (auto & session : sessions_) {
1438
- auto device_info = session->getDeviceInfo ();
1439
- if (device_info != nullptr ) {
1440
- logger.handleDeviceInfo (*device_info, captureWindowStartTime_);
1441
- }
1442
-
1443
- auto resource_infos = session->getResourceInfos ();
1444
- for (auto resource_info : resource_infos) {
1445
- logger.handleResourceInfo (resource_info, captureWindowStartTime_);
1446
- }
1447
- }
1448
-
1449
1477
for (const auto & iterations : traceSpans_) {
1450
1478
for (const auto & span_pair : iterations.second ) {
1451
1479
const TraceSpan& gpu_span = span_pair.second ;
0 commit comments