// mpi_cuda_hip.cpp
// Portability shim: every GPU runtime call in this file goes through a gpu*
// alias so the same source builds against either the CUDA or the HIP runtime.
// The backend is selected on __NVCC__: if it is defined the CUDA runtime is
// used, otherwise HIP. Both branches define the same set of names.
#ifdef __NVCC__
// CUDA backend
#include <cuda_runtime.h>
#define gpuFree cudaFree
#define gpuMalloc cudaMalloc
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemset cudaMemset
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuGetDevice cudaGetDevice
#define gpuSetDevice cudaSetDevice
#else
// HIP backend
#include <hip/hip_runtime.h>
#define gpuFree hipFree
#define gpuMalloc hipMalloc
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemset hipMemset
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuGetDevice hipGetDevice
#define gpuSetDevice hipSetDevice
#endif
#include <algorithm>
#include <climits>
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <mpi.h>
// Kernel: write x[i] = 1.1 * i for every i in [0, n).
//
// Each thread starts at its global 1-D index and advances by the total number
// of launched threads (gridDim.x * blockDim.x), so any 1-D launch
// configuration with at least one thread covers all n elements.
//
//   n  number of elements to write
//   x  array of at least n doubles, written by the kernel (so it must be
//      addressable from device code)
__global__ void init_(const size_t n, double *x)
{
    // The components of gridDim/blockDim/blockIdx/threadIdx are 32-bit
    // unsigned values. Widen to size_t *before* multiplying so neither the
    // start index nor the stride can wrap for large grids; a stride that
    // wrapped to 0 would make the loop below spin forever.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    for (; i < n; i += stride) {
        x[i] = 1.1 * i;
    }
}
void init(const size_t n, double *x)
{
dim3 blocks(n / 256 + n % 256 ? 1 : 0);
dim3 threads(256);
init_<<<blocks, threads>>>(n, x);
}
int main(int argc, char *argv[])
{
MPI_Init(&argc, &argv);
const size_t n = argc > 1 ? (size_t)std::stoll(argv[1]) : 1024;
const size_t nbytes = sizeof(double) * n;
int size, rank;
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int count, device;
gpuGetDeviceCount(&count);
gpuSetDevice(rank % count);
gpuGetDevice(&device);
printf("Hello from MPI rank %d/%d with GPU %d/%d\n", rank, size, device, count);
// Device data
double *x;
gpuMalloc(&x, nbytes);
gpuMemset(x, 0, nbytes);
gpuDeviceSynchronize();
if (rank == 0) {
// Initialize data on rank 0
init(n, x);
gpuDeviceSynchronize();
// Send with rank 0
MPI_Send(x, n, MPI_DOUBLE, 1, 123, MPI_COMM_WORLD);
printf("Rank %d sent\n", rank);
} else if (rank == 1) {
// Receive with rank 1
MPI_Recv(x, n, MPI_DOUBLE, 0, 123, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("Rank %d received\n", rank);
}
// Copy result to CPU and print
std::vector<double> h_x(n);
gpuMemcpy(h_x.data(), x, nbytes, gpuMemcpyDeviceToHost);
std::stringstream ss;
ss << "Rank " << rank << " has";
for (int i = 0; i < std::min(8ul, n); ++i) ss << " " << h_x[i];
if (n > 8) ss << " ...";
ss << "\n";
std::cout << ss.str();
gpuFree(x);
MPI_Finalize();
}