Skip to content

Commit 13920f4

Browse files
committed
Merge branch 'gpu_performance_examples'
2 parents c13615b + 90b8e49 commit 13920f4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+22601
-42
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <chrono>
6+
#include <cmath>
7+
#include <cstddef>
8+
#include <cstdio>
9+
#include <ratio>
10+
#include <vector>
11+
12+
template <typename T> constexpr void saxpy(size_t i, T a, T *x, T *y, T *r) {
13+
r[i] = a * x[i] + y[i];
14+
}
15+
16+
template <typename T> constexpr void init_x(size_t i, T *x) {
17+
x[i] = (T)2.3 * sin(i);
18+
}
19+
20+
template <typename T> constexpr void init_y(size_t i, T *y) {
21+
y[i] = (T)1.1 * cos(i);
22+
}
23+
24+
template <typename T> void init(size_t n, T *x, T *y) {
25+
for (size_t i = 0; i < n; i++) {
26+
init_x(i, x);
27+
init_y(i, y);
28+
}
29+
}
30+
31+
template <typename Allocate, typename Deallocate, typename Init, typename Func>
32+
void run(Allocate allocate, Deallocate deallocate, Init init, Func func) {
33+
constexpr std::array ns{1 << 6, 1 << 9, 1 << 12, 1 << 15, 1 << 18,
34+
1 << 21, 1 << 24, 1 << 27, 1 << 30};
35+
constexpr size_t max_n = *std::max_element(ns.begin(), ns.end());
36+
constexpr size_t num_bytes = sizeof(float) * max_n;
37+
38+
float *const x = static_cast<float *>(allocate(num_bytes));
39+
float *const y = static_cast<float *>(allocate(num_bytes));
40+
float *const r = static_cast<float *>(allocate(num_bytes));
41+
init(max_n, x, y);
42+
43+
for (size_t n : ns) {
44+
constexpr auto n_iter = 20;
45+
size_t avg = 0;
46+
for (auto iteration = 0; iteration < n_iter; iteration++) {
47+
constexpr float a = 3.4f;
48+
const auto start = std::chrono::high_resolution_clock::now();
49+
func(n, a, x, y, r);
50+
const auto end = std::chrono::high_resolution_clock::now();
51+
const std::chrono::duration<double, std::nano> dur = end - start;
52+
avg += iteration == 0 ? 0 : dur.count();
53+
}
54+
55+
std::fprintf(stderr, "%f\n", r[n - 1]);
56+
std::printf("%ld, %ld\n", n, avg / (n_iter - 1));
57+
}
58+
59+
deallocate(x);
60+
deallocate(y);
61+
deallocate(r);
62+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include "common.h"
2+
#include <cstddef>
3+
#include <hip/hip_runtime.h>
4+
5+
__global__ void saxpy_(size_t n, float a, float *x, float *y, float *r) {
6+
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
7+
const size_t stride = gridDim.x * blockDim.x;
8+
9+
for (; tid < n; tid += stride) {
10+
saxpy(tid, a, x, y, r);
11+
}
12+
}
13+
14+
__global__ void init_data(size_t n, float *x, float *y) {
15+
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
16+
const size_t stride = gridDim.x * blockDim.x;
17+
18+
for (; tid < n; tid += stride) {
19+
init_x(tid, x);
20+
init_y(tid, y);
21+
}
22+
}
23+
24+
void *gpu_allocate(size_t bytes) {
25+
void *p = nullptr;
26+
[[maybe_unused]] const auto result = hipMalloc(&p, bytes);
27+
return p;
28+
}
29+
30+
void gpu_free(void *p) { [[maybe_unused]] const auto result = hipFree(p); }
31+
32+
void gpu_init(size_t n, float *x, float *y) {
33+
constexpr dim3 blocks(32);
34+
constexpr dim3 threads(256);
35+
init_data<<<blocks, threads, 0, 0>>>(n, x, y);
36+
}
37+
38+
int main() {
39+
run(gpu_allocate, gpu_free, gpu_init,
40+
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
41+
constexpr dim3 blocks(32);
42+
constexpr dim3 threads(256);
43+
44+
saxpy_<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
45+
[[maybe_unused]] const auto result = hipDeviceSynchronize();
46+
});
47+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#include "common.h"
2+
#include <cstddef>
3+
4+
int main() {
5+
run(malloc, free, init<float>,
6+
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
7+
#pragma omp parallel for
8+
for (size_t i = 0; i < n; i++) {
9+
saxpy(i, a, x, y, r);
10+
}
11+
});
12+
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/bin/bash
2+
3+
submit_job() {
4+
sub="$(sbatch "$@")"
5+
6+
if [[ "$sub" =~ Submitted\ batch\ job\ ([0-9]+) ]]; then
7+
echo "${BASH_REMATCH[1]}"
8+
else
9+
exit 1
10+
fi
11+
}
12+
13+
echo "Submitting cpu job"
14+
cpujobid=$(submit_job << "EOF"
15+
#!/bin/bash
16+
17+
#SBATCH --account=project_465001194
18+
#SBATCH --nodes=1
19+
#SBATCH --ntasks=1
20+
#SBATCH --cpus-per-task=64
21+
#SBATCH --mem=13G
22+
#SBATCH --time=00:30:00
23+
#SBATCH --partition=debug
24+
#SBATCH --exclusive
25+
26+
ml PrgEnv-cray
27+
28+
(srun CC -std=c++17 -O3 -fopenmp -Wall -Wextra -Wpedantic -pedantic-errors -o omp omp_saxpy.cpp) || { echo "Failed to build openMP code"; exit 1; }
29+
(srun CC -std=c++17 -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o serial serial_saxpy.cpp) || { echo "Failed to build serial code"; exit 1; }
30+
31+
srun ./serial > "serial.dat"
32+
33+
export OMP_PROC_BIND=close
34+
export OMP_PLACES=cores
35+
36+
for nthreads in 2 64
37+
do
38+
OMP_NUM_THREADS=$nthreads srun ./omp > "omp$nthreads.dat"
39+
done
40+
EOF
41+
)
42+
43+
echo "Submitting gpu job"
44+
gpujobid=$(submit_job << EOF
45+
#!/bin/bash
46+
47+
#SBATCH --account=project_465001194
48+
#SBATCH --nodes=1
49+
#SBATCH --ntasks=1
50+
#SBATCH --cpus-per-task=1
51+
#SBATCH --gpus-per-task=1
52+
#SBATCH --mem=1G
53+
#SBATCH --time=00:01:00
54+
#SBATCH --partition=dev-g
55+
56+
ml PrgEnv-cray
57+
ml craype-accel-amd-gfx90a
58+
ml rocm
59+
60+
(srun CC -std=c++17 -xhip -O3 -Wall -Wextra -Wpedantic -pedantic-errors -o hip hip_saxpy.cpp) || { echo "Failed to build hip code"; exit 1; }
61+
srun ./hip > "hip.dat"
62+
EOF
63+
)
64+
65+
echo "Submitting gnuplot job with dependency on jobs $cpujobid and $gpujobid"
66+
sbatch --dependency afterok:$cpujobid:$gpujobid << EOF
67+
#!/bin/bash
68+
69+
#SBATCH --account=project_465001194
70+
#SBATCH --nodes=1
71+
#SBATCH --ntasks=1
72+
#SBATCH --cpus-per-task=1
73+
#SBATCH --time=00:01:00
74+
#SBATCH --partition=debug
75+
76+
echo "Loading modules"
77+
ml LUMI/23.09
78+
ml partition/C
79+
ml gnuplot/5.4.8-cpeGNU-23.09
80+
81+
echo "Plotting problem size vs runtimes "
82+
gnuplot -e "\
83+
set terminal png size 1000,1000; \
84+
set output \"runtimes.png\"; \
85+
set style data linespoints; \
86+
set key left top; \
87+
set logscale x; \
88+
set logscale y; \
89+
set title \"Runtime of Ax + y with different implementation strategies\"; \
90+
set xlabel \"problem size\"; \
91+
set ylabel \"time [ns]\"; \
92+
set grid; \
93+
set xrange [10:10000000000]; \
94+
plot \"serial.dat\" title \"serial\" lw 2.5, \
95+
\"omp2.dat\" title \"OpenMP 2 threads\" lw 2.5, \
96+
\"omp64.dat\" title \"OpenMP 64 threads\" lw 2.5, \
97+
\"hip.dat\" title \"gpu\" lw 2.5; \
98+
"
99+
EOF
Loading
Loading
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#include "common.h"
2+
#include <cstddef>
3+
4+
int main() {
5+
run(malloc, free, init<float>,
6+
[](auto n, auto a, auto *x, auto *y, auto *r) -> auto {
7+
for (size_t i = 0; i < n; i++) {
8+
saxpy(i, a, x, y, r);
9+
}
10+
});
11+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Build
2+
3+
Build on the login node with `./build.sh`
4+
5+
# Run
6+
7+
Run with `sbatch profile.sbatch`
8+
9+
# Analyze
10+
11+
1. Go to www.lumi.csc.fi
12+
2. Start a desktop session
13+
3. Launch a terminal on the desktop session
14+
4. cd to this directory
15+
5. Do `. ../sourceme.sh`
16+
6. run `omniperf analyze -p workloads/01_three_kernels/mi200/ --gui`
17+
7. Open Firefox
18+
8. Go to address `localhost:8050`
19+
9. Analyze
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
ml LUMI/23.09
4+
ml partition/G
5+
ml rocm/5.4.6
6+
ml PrgEnv-cray/8.4.0
7+
8+
CC -xhip -pg -O2 main.cpp
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#include <cstddef>
2+
#include <hip/hip_runtime.h>
3+
#include <math.h>
4+
5+
__global__ void kernel1(size_t n, float *x, float *y) {
6+
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
7+
const size_t stride = gridDim.x * blockDim.x;
8+
9+
for (; tid < n; tid += stride) {
10+
x[tid] = 0.666f * sin(tid);
11+
y[tid] = 1.337f * cos(tid);
12+
}
13+
}
14+
15+
__global__ void kernel2(size_t n, float a, float *x, float *y, float *r) {
16+
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
17+
const size_t stride = gridDim.x * blockDim.x;
18+
19+
for (; tid < n; tid += stride) {
20+
r[tid] = a * x[tid] + y[tid];
21+
}
22+
}
23+
24+
__global__ void kernel3(size_t n, float a, float *x, float *y, float *r) {
25+
size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
26+
const size_t stride = gridDim.x * blockDim.x;
27+
28+
for (; tid < n; tid += stride) {
29+
const float x1 = x[tid];
30+
const float x2 = x1 * x1;
31+
const float x3 = x1 * x2;
32+
const float x4 = x2 * x2;
33+
34+
const float y1 = y[tid];
35+
const float y2 = y1 * y1;
36+
const float y3 = y1 * y2;
37+
const float y4 = y2 * y2;
38+
// clang-format off
39+
r[tid] =
40+
1.0f * a * x1
41+
- 2.0f * a * x2
42+
+ 3.0f * a * x3
43+
- 4.0f * a * x4
44+
+ 4.0f * a * y1
45+
- 3.0f * a * y2
46+
+ 2.0f * a * y3
47+
- 1.0f * a * y4;
48+
// clang-format on
49+
}
50+
}
51+
52+
void *gpu_allocate(size_t bytes) {
53+
void *p = nullptr;
54+
[[maybe_unused]] const auto result = hipMalloc(&p, bytes);
55+
return p;
56+
}
57+
58+
int main() {
59+
constexpr size_t n = 1 << 30;
60+
constexpr size_t num_bytes = sizeof(float) * n;
61+
constexpr float a = 3.4f;
62+
63+
float *const x = static_cast<float *>(gpu_allocate(num_bytes));
64+
float *const y = static_cast<float *>(gpu_allocate(num_bytes));
65+
float *const r = static_cast<float *>(gpu_allocate(num_bytes));
66+
67+
constexpr dim3 blocks(1024);
68+
constexpr dim3 threads(1024);
69+
kernel1<<<blocks, threads, 0, 0>>>(n, x, y);
70+
kernel2<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
71+
kernel3<<<blocks, threads, 0, 0>>>(n, a, x, y, r);
72+
[[maybe_unused]] auto t = hipDeviceSynchronize();
73+
74+
hipFree(x);
75+
hipFree(y);
76+
hipFree(r);
77+
}

0 commit comments

Comments
 (0)