Skip to content

Commit aebebe7

Browse files
authored
Merge branch 'main' into zejun/kineto_for_pti_0.11
2 parents c98cb20 + 7cb6ac6 commit aebebe7

14 files changed

+147
-53
lines changed

libkineto/CMakeLists.txt

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,18 @@ endif()
141141
if (LIBKINETO_NOCUPTI AND LIBKINETO_NOROCTRACER AND LIBKINETO_NOXPUPTI)
142142
get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS)
143143
message(INFO " CUPTI unavailable or disabled - not building GPU profilers")
144-
elseif(NOT LIBKINETO_NOROCTRACER)
145-
get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_SRCS)
144+
else()
145+
if(NOT LIBKINETO_NOROCTRACER)
146+
get_filelist("get_libkineto_roctracer_srcs(with_api=False)" LIBKINETO_roc_SRCS)
146147
message(INFO " Building with roctracer")
147-
elseif(DEFINED LIBKINETO_NOXPUPTI AND NOT LIBKINETO_NOXPUPTI)
148-
get_filelist("get_libkineto_xpupti_srcs(with_api=False)" LIBKINETO_SRCS)
148+
elseif(NOT LIBKINETO_NOCUPTI)
149+
get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_cuda_SRCS)
150+
endif()
151+
if(DEFINED LIBKINETO_NOXPUPTI AND NOT LIBKINETO_NOXPUPTI)
152+
get_filelist("get_libkineto_xpupti_srcs(with_api=False)" LIBKINETO_xpu_SRCS)
149153
message(INFO " Building with xpupti")
150-
else()
151-
get_filelist("get_libkineto_cupti_srcs(with_api=False)" LIBKINETO_SRCS)
154+
endif()
155+
set(LIBKINETO_SRCS ${LIBKINETO_roc_SRCS} ${LIBKINETO_xpu_SRCS} ${LIBKINETO_cuda_SRCS})
152156
endif()
153157
get_filelist("get_libkineto_public_headers()" LIBKINETO_PUBLIC_HEADERS)
154158
get_filelist("get_libkineto_api_srcs()" LIBKINETO_API_SRCS)

libkineto/include/GenericTraceActivity.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ class GenericTraceActivity : public ITraceActivity {
4747
return resource;
4848
}
4949

50+
void setDevice(int32_t newDevice) {
51+
device = newDevice;
52+
}
53+
5054
int32_t getThreadId() const override {
5155
return threadId;
5256
}

libkineto/include/IActivityProfiler.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,10 @@ class IActivityProfilerSession {
130130
return "";
131131
}
132132

133+
virtual std::unordered_map<std::string, std::string> getMetadata() {
134+
return {};
135+
}
136+
133137
protected:
134138
TraceStatus status_ = TraceStatus::READY;
135139
};

libkineto/include/ThreadUtil.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#pragma once
1010

11+
#include <sys/stat.h>
1112
#include <cstdint>
1213
#include <string>
1314
#include <utility>
@@ -20,6 +21,7 @@ int32_t threadId();
2021
bool setThreadName(const std::string& name);
2122
std::string getThreadName();
2223

24+
int32_t pidNamespace(ino_t& ns);
2325
int32_t processId(bool cache = true);
2426
std::string processName(int32_t pid);
2527

libkineto/sample_programs/build.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,7 @@ g++ \
1818
-lpthread \
1919
-lcuda \
2020
-lcudart \
21+
-lcupti \
22+
-lnvperf_host \
2123
/usr/local/lib/libkineto.a \
2224
kplay_cu.o

libkineto/sample_programs/kineto_playground.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <cuda_runtime.h>
910
#include <iostream>
1011
#include <string>
1112

@@ -14,6 +15,20 @@
1415
// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
1516
#include "kineto_playground.cuh"
1617

18+
#define CHECK_CUDA(call) \
19+
do { \
20+
cudaError_t status = call; \
21+
if (status != cudaSuccess) { \
22+
fprintf( \
23+
stderr, \
24+
"CUDA Error at %s:%d: %s\n", \
25+
__FILE__, \
26+
__LINE__, \
27+
cudaGetErrorString(status)); \
28+
exit(1); \
29+
} \
30+
} while (0)
31+
1732
using namespace kineto;
1833

1934
static const std::string kFileName = "/tmp/kineto_playground_trace.json";
@@ -23,6 +38,7 @@ int main() {
2338
warmup();
2439

2540
// Kineto config
41+
libkineto_init(false, true);
2642

2743
// Empty types set defaults to all types
2844
std::set<libkineto::ActivityType> types;
@@ -38,6 +54,7 @@ int main() {
3854
profiler.startTrace();
3955
std::cout << "Start playground" << std::endl;
4056
playground();
57+
CHECK_CUDA(cudaDeviceSynchronize());
4158

4259
std::cout << "Stop Trace" << std::endl;
4360
auto trace = profiler.stopTrace();

libkineto/sample_programs/kineto_playground.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ __global__ void square(float* A, int N) {
7676

7777
void playground(void) {
7878
// Add your experimental CUDA implementation here.
79-
basicMemcpyFromDevice();
79+
basicMemcpyToDevice();
8080
compute();
8181
basicMemcpyFromDevice();
8282
}

libkineto/src/AbstractConfig.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,8 @@ int64_t AbstractConfig::toInt64(const string& val) const {
116116
}
117117

118118
bool AbstractConfig::toBool(string& val) const {
119-
const std::array<string, 8> bool_vals{
120-
"n", "y", "no", "yes", "f", "t", "false", "true"};
119+
const std::array<string, 10> bool_vals{
120+
"n", "y", "no", "yes", "f", "t", "false", "true", "0", "1"};
121121
const string lower_val = toLower(val);
122122
for (int i = 0; i < bool_vals.size(); i++) {
123123
if (lower_val == bool_vals[i]) {

libkineto/src/ApproximateClock.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
#include <functional>
1919
#include <type_traits>
2020

21-
namespace libkineto {
22-
2321
#if defined(__i386__) || defined(__x86_64__) || defined(__amd64__)
2422
#define KINETO_RDTSC
2523
#if defined(_MSC_VER)
@@ -42,6 +40,8 @@ namespace libkineto {
4240
#define KINETO_UNUSED __attribute__((__unused__))
4341
#endif //_MSC_VER
4442

43+
namespace libkineto {
44+
4545
using time_t = int64_t;
4646
using steady_clock_t = std::conditional_t<
4747
std::chrono::high_resolution_clock::is_steady,

libkineto/src/CuptiActivityProfiler.cpp

Lines changed: 61 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -300,10 +300,27 @@ void CuptiActivityProfiler::logGpuVersions() {
300300
#endif
301301
}
302302

303+
namespace {
304+
305+
const std::unordered_set<std::string>& getLoggerMedataAllowList() {
306+
static const std::unordered_set<std::string> kLoggerMedataAllowList{
307+
"with_stack", "with_modules", "record_shapes", "profile_memory"};
308+
return kLoggerMedataAllowList;
309+
}
310+
311+
} // namespace
312+
303313
void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
304314
LOG(INFO) << "Processing " << traceBuffers_->cpu.size() << " CPU buffers";
305315
VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - "
306316
<< captureWindowEndTime_;
317+
318+
// Pass metadata within the trace to the logger observer.
319+
for (const auto& pair : metadata_) {
320+
if (getLoggerMedataAllowList().count(pair.first) > 0) {
321+
LOGGER_OBSERVER_ADD_METADATA(pair.first, pair.second);
322+
}
323+
}
307324
for (auto& pair : versionMetadata_) {
308325
addMetadata(pair.first, pair.second);
309326
}
@@ -319,6 +336,9 @@ void CuptiActivityProfiler::processTraceInternal(ActivityLogger& logger) {
319336
device_properties.push_back(props);
320337
}
321338
}
339+
for (const auto& [key, value] : session->getMetadata()) {
340+
addMetadata(key, value);
341+
}
322342
}
323343
logger.handleTraceStart(
324344
metadata_, fmt::format("{}", fmt::join(device_properties, ",")));
@@ -429,7 +449,7 @@ void CuptiActivityProfiler::processCpuTrace(
429449
return;
430450
}
431451
setCpuActivityPresent(true);
432-
452+
bool warn_once = false;
433453
CpuGpuSpanPair& span_pair =
434454
recordTraceSpan(cpuTrace.span, cpuTrace.gpuOpCount);
435455
TraceSpan& cpu_span = span_pair.first;
@@ -442,18 +462,23 @@ void CuptiActivityProfiler::processCpuTrace(
442462
const std::unique_ptr<GenericTraceActivity>>::value,
443463
"handleActivity is unsafe and relies on the caller to maintain not "
444464
"only lifetime but also address stability.");
445-
if (act->type() == ActivityType::USER_ANNOTATION &&
446-
act->duration() <= 0) {
465+
if (act->duration() < 0) {
447466
act->endTime = captureWindowEndTime_;
448467
act->addMetadata("finished", "false");
449-
} else {
450-
act->addMetadata("finished", "true");
451468
}
452469
logger.handleActivity(*act);
453470
}
454471
clientActivityTraceMap_[act->correlationId()] = &span_pair;
455472
activityMap_[act->correlationId()] = act.get();
456-
473+
if (act->deviceId() == 0) {
474+
if (!warn_once) {
475+
LOG(WARNING)
476+
<< "CPU activity with pid 0 detected. This is likely due to the python stack"
477+
" tracer not being able to determine the pid for an event. Overriding pid to main thread pid";
478+
}
479+
act->setDevice(processId());
480+
warn_once = true;
481+
}
457482
recordThreadInfo(act->resourceId(), act->getThreadId(), act->deviceId());
458483
}
459484
logger.handleTraceSpan(cpu_span);
@@ -564,8 +589,8 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) {
564589
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
565590
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
566591
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 ||
567-
// Support cudaEventRecord and cudaEventSynchronize, revisit if others are
568-
// needed
592+
// Support cudaEventRecord and cudaEventSynchronize, revisit if others
593+
// are needed
569594
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 ||
570595
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 ||
571596
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) {
@@ -1322,20 +1347,21 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
13221347
|| cupti_.stopCollection
13231348
#endif // HAS_CUPTI || HAS_ROCTRACER
13241349
) {
1325-
// Update runloop state first to prevent further updates to shared state
1350+
// Update runloop state first to prevent further updates to shared
1351+
// state
13261352
LOG(INFO) << "Tracing complete.";
13271353
VLOG_IF(1, currentIter >= 0)
13281354
<< "This state change was invoked by application's step() call";
13291355

13301356
// currentIter >= 0 means this is called from the step() api of
1331-
// the profile in pytorch main thread, it should be executed in another
1332-
// thread in case pytorch main thread is blocked
1357+
// the profile in pytorch main thread, it should be executed in
1358+
// another thread in case pytorch main thread is blocked
13331359
if (currentIter >= 0) {
13341360
// if collectTraceThread_ is already running, there's no need to
13351361
// execute collectTrace twice.
1336-
// Do not call collectTrace when profilerThread_ is collecting Trace.
1337-
// Otherwise, libkineto::api().client()->stop will be called twice,
1338-
// which leads to an unrecoverable ::c10:Error at
1362+
// Do not call collectTrace when profilerThread_ is collecting
1363+
// Trace. Otherwise, libkineto::api().client()->stop will be called
1364+
// twice, which leads to an unrecoverable ::c10:Error at
13391365
// disableProfiler
13401366
if (!collectTraceThread_ && !getCollectTraceState()) {
13411367
std::lock_guard<std::recursive_mutex> guard(mutex_);
@@ -1404,13 +1430,33 @@ void CuptiActivityProfiler::finalizeTrace(
14041430
iterationCountMap_.clear();
14051431
}
14061432

1433+
// Thread & stream info
1434+
for (const auto& pair : resourceInfo_) {
1435+
const auto& resource = pair.second;
1436+
logger.handleResourceInfo(resource, captureWindowStartTime_);
1437+
}
1438+
1439+
bool use_default_device_info = true;
1440+
for (auto& session : sessions_) {
1441+
auto device_info = session->getDeviceInfo();
1442+
if (device_info != nullptr) {
1443+
use_default_device_info = false;
1444+
logger.handleDeviceInfo(*device_info, captureWindowStartTime_);
1445+
}
1446+
1447+
auto resource_infos = session->getResourceInfos();
1448+
for (const auto& resource_info : resource_infos) {
1449+
logger.handleResourceInfo(resource_info, captureWindowStartTime_);
1450+
}
1451+
}
1452+
14071453
// Process names
14081454
int32_t pid = processId();
14091455
string process_name = processName(pid);
14101456
if (!process_name.empty()) {
14111457
logger.handleDeviceInfo(
14121458
{pid, pid, process_name, "CPU"}, captureWindowStartTime_);
1413-
if (!cpuOnly_) {
1459+
if (!cpuOnly_ && use_default_device_info) {
14141460
// Usually, GPU events use device id as pid (0-7).
14151461
// In some cases, CPU sockets are numbered starting from 0.
14161462
// In the worst case, 8 CPU sockets + 8 GPUs, so the max GPU ID is 15.
@@ -1428,24 +1474,6 @@ void CuptiActivityProfiler::finalizeTrace(
14281474
}
14291475
}
14301476

1431-
// Thread & stream info
1432-
for (auto pair : resourceInfo_) {
1433-
const auto& resource = pair.second;
1434-
logger.handleResourceInfo(resource, captureWindowStartTime_);
1435-
}
1436-
1437-
for (auto& session : sessions_) {
1438-
auto device_info = session->getDeviceInfo();
1439-
if (device_info != nullptr) {
1440-
logger.handleDeviceInfo(*device_info, captureWindowStartTime_);
1441-
}
1442-
1443-
auto resource_infos = session->getResourceInfos();
1444-
for (auto resource_info : resource_infos) {
1445-
logger.handleResourceInfo(resource_info, captureWindowStartTime_);
1446-
}
1447-
}
1448-
14491477
for (const auto& iterations : traceSpans_) {
14501478
for (const auto& span_pair : iterations.second) {
14511479
const TraceSpan& gpu_span = span_pair.second;

0 commit comments

Comments
 (0)