Skip to content

Commit c03fdae

Browse files
authored
Merge pull request #1097 from erieaton-amd/rocprofv3-2
Collect dispatches and counter values with Rocprofv3
2 parents dd4db35 + 1639598 commit c03fdae

File tree

13 files changed

+768
-8
lines changed

13 files changed

+768
-8
lines changed

CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ else()
3333
endif()
3434

3535
find_package(Threads REQUIRED)
36+
find_package(rocprofiler-sdk PATHS "/opt/rocm/lib/cmake")
3637

3738
set(TRACY_PUBLIC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/public)
3839

@@ -56,6 +57,10 @@ target_link_libraries(
5657
Threads::Threads
5758
${CMAKE_DL_LIBS}
5859
)
60+
if(rocprofiler-sdk_FOUND)
61+
target_compile_definitions(TracyClient PUBLIC TRACY_ROCPROF)
62+
target_link_libraries(TracyClient PUBLIC rocprofiler-sdk::rocprofiler-sdk)
63+
endif()
5964

6065
if(TRACY_Fortran)
6166
add_library(TracyClientF90 ${TRACY_VISIBILITY} "${TRACY_PUBLIC_DIR}/TracyClient.F90")
@@ -142,6 +147,10 @@ set_option(TRACY_VERBOSE "[advanced] Verbose output from the profiler" OFF)
142147
mark_as_advanced(TRACY_VERBOSE)
143148
set_option(TRACY_DEMANGLE "[advanced] Don't use default demangling function - You'll need to provide your own" OFF)
144149
mark_as_advanced(TRACY_DEMANGLE)
150+
if(rocprofiler-sdk_FOUND)
151+
set_option(TRACY_ROCPROF_CALIBRATION "[advanced] Use continuous calibration of the Rocprof GPU time." OFF)
152+
mark_as_advanced(TRACY_ROCPROF_CALIBRATION)
153+
endif()
145154

146155
# handle incompatible combinations
147156
if(TRACY_MANUAL_LIFETIME AND NOT TRACY_DELAYED_INIT)

manual/tracy.tex

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,6 +1707,35 @@ \subsubsection{CUDA}
17071707

17081708
To stop profiling, call the \texttt{TracyCUDAStopProfiling(ctx)} macro.
17091709

1710+
\subsubsection{ROCm}
1711+
1712+
On Linux, if rocprofiler-sdk is installed, Tracy can automatically trace GPU dispatches and collect
1713+
performance counter values. If CMake can't find rocprofiler-sdk, you can set the CMake variable
1714+
\texttt{rocprofiler-sdk\_DIR} to point it at the correct module directory. Use the
1715+
\texttt{TRACY\_ROCPROF\_COUNTERS} environment variable with the desired counters separated by commas
1716+
to control what values are collected. The results will appear for each dispatch in the tool tip and
1717+
zone detail window. Results are summed across dimensions. You can get a list of the counters
1718+
available for your hardware with this command:
1719+
\begin{lstlisting}[language=sh]
1720+
rocprofv3 -L
1721+
\end{lstlisting}
1722+
1723+
\subparagraph{Troubleshooting}
1724+
\begin{itemize}
1725+
\item If you are taking very long captures, you may see drift between the GPU and
1726+
CPU timelines. This may be mitigated by setting the CMake variable
1727+
\texttt{TRACY\_ROCPROF\_CALIBRATION}, which will refresh the time synchronization about every
1728+
second.
1729+
\item The timeline drift may also be affected by network time synchronization, in which case the
1730+
drift can be reduced by disabling network time synchronization, with the advantage that there is no application performance
1731+
cost.
1732+
\item On some GPUs, you will need to change the performance level to see non-zero results from
1733+
some counters. Use this command:
1734+
\begin{lstlisting}[language=sh]
1735+
sudo amd-smi set -g 0 -l stable_std
1736+
\end{lstlisting}
1737+
\end{itemize}
1738+
17101739
\subsubsection{Multiple zones in one scope}
17111740

17121741
Putting more than one GPU zone macro in a single scope features the same issue as with the \texttt{ZoneScoped} macros, described in section~\ref{multizone} (but this time the variable name is \texttt{\_\_\_tracy\_gpu\_zone}).

profiler/src/profiler/TracyView.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ constexpr const char* GpuContextNames[] = {
4646
"Direct3D 11",
4747
"Metal",
4848
"Custom",
49-
"CUDA"
49+
"CUDA",
50+
"Rocprof"
5051
};
5152

5253
struct MemoryPage;

profiler/src/profiler/TracyView_ZoneInfo.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,6 +1580,21 @@ void View::DrawGpuInfoWindow()
15801580
TextFocused( "Delay to execution:", TimeToString( AdjustGpuTime( ev.GpuStart(), begin, drift ) - ev.CpuStart() ) );
15811581
}
15821582

1583+
if( ctx->notes.contains( ev.query_id ) )
1584+
{
1585+
for( auto& p : ctx->notes.at( ev.query_id ) )
1586+
{
1587+
if( ctx->noteNames.count( p.first ) )
1588+
{
1589+
TextFocused( m_worker.GetString( ctx->noteNames.at( p.first ) ), RealToString( p.second ) );
1590+
}
1591+
else
1592+
{
1593+
TextFocused( RealToString( p.first ), RealToString( p.second ) );
1594+
}
1595+
}
1596+
}
1597+
15831598
ImGui::Separator();
15841599

15851600
std::vector<const GpuEvent*> zoneTrace;
@@ -2047,6 +2062,21 @@ void View::ZoneTooltip( const GpuEvent& ev )
20472062
TextFocused( "Delay to execution:", TimeToString( AdjustGpuTime( ev.GpuStart(), begin, drift ) - ev.CpuStart() ) );
20482063
}
20492064

2065+
if( ctx->notes.contains( ev.query_id ) )
2066+
{
2067+
for( auto& p : ctx->notes.at( ev.query_id ) )
2068+
{
2069+
if( ctx->noteNames.count( p.first ) )
2070+
{
2071+
TextFocused( m_worker.GetString( ctx->noteNames.at( p.first ) ), RealToString( p.second ) );
2072+
}
2073+
else
2074+
{
2075+
TextFocused( RealToString( p.first ), RealToString( p.second ) );
2076+
}
2077+
}
2078+
}
2079+
20502080
ImGui::EndTooltip();
20512081
}
20522082

public/TracyClient.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
#include "client/TracyOverride.cpp"
3333
#include "client/TracyKCore.cpp"
3434

35+
#ifdef TRACY_ROCPROF
36+
# include "client/TracyRocprof.cpp"
37+
#endif
38+
3539
#if defined(TRACY_HAS_CALLSTACK)
3640
# if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6
3741
# include "libbacktrace/alloc.cpp"

public/client/TracyProfiler.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2383,6 +2383,10 @@ static void FreeAssociatedMemory( const QueueItem& item )
23832383
tracy_free( (void*)ptr );
23842384
break;
23852385
#endif
2386+
case QueueType::GpuAnnotationName:
2387+
ptr = MemRead<uint64_t>( &item.gpuAnnotationNameFat.ptr );
2388+
tracy_free( (void*)ptr );
2389+
break;
23862390
#ifdef TRACY_ON_DEMAND
23872391
case QueueType::MessageAppInfo:
23882392
case QueueType::GpuContextName:
@@ -2598,6 +2602,12 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token )
25982602
tracy_free_fast( (void*)ptr );
25992603
#endif
26002604
break;
2605+
case QueueType::GpuAnnotationName:
2606+
ptr = MemRead<uint64_t>( &item->gpuAnnotationNameFat.ptr );
2607+
size = MemRead<uint16_t>( &item->gpuAnnotationNameFat.size );
2608+
SendSingleString( (const char*)ptr, size );
2609+
tracy_free_fast( (void*)ptr );
2610+
break;
26012611
case QueueType::PlotDataInt:
26022612
case QueueType::PlotDataFloat:
26032613
case QueueType::PlotDataDouble:
@@ -2956,6 +2966,14 @@ Profiler::DequeueStatus Profiler::DequeueSerial()
29562966
#endif
29572967
break;
29582968
}
2969+
case QueueType::GpuAnnotationName:
2970+
{
2971+
ptr = MemRead<uint64_t>( &item->gpuAnnotationNameFat.ptr );
2972+
uint16_t size = MemRead<uint16_t>( &item->gpuAnnotationNameFat.size );
2973+
SendSingleString( (const char*)ptr, size );
2974+
tracy_free_fast( (void*)ptr );
2975+
break;
2976+
}
29592977
#ifdef TRACY_FIBERS
29602978
case QueueType::ZoneBegin:
29612979
case QueueType::ZoneBeginCallstack:

0 commit comments

Comments
 (0)