gemm_mkl_cublas_usm.cpp
//==============================================================
// Matrix Multiplication: MKL, CUBLAS
//==============================================================
// This code was built using snippets of code from Intel trainings
//
// SPDX-License-Identifier: MIT
// =============================================================
//
// If you are lucky, you can make this work with https://github.com/oneapi-src/oneMKL
//
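//
// Build notes (assumed command lines, adjust to your toolchain):
//   oneMKL path : icpx -fsycl -DMKL_LIB gemm_mkl_cublas_usm.cpp -qmkl
//   cuBLAS path : clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda -DCUBLAS -DDPCPP \
//                 gemm_mkl_cublas_usm.cpp -lcublas -lcuda
//   AdaptiveCpp : acpp -DCUBLAS -DACPP gemm_mkl_cublas_usm.cpp -lcublas -lcuda
//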
#include <sycl/sycl.hpp>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <chrono>
#include <getopt.h>
#if MKL_LIB
#include "oneapi/mkl.hpp" //# oneMKL DPC++ interface
#endif
#if CUBLAS
// cuda interface
#include <cublas_v2.h>
#include <cuda.h>
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
if (status != CUBLAS_STATUS_SUCCESS) {
std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
exit(EXIT_FAILURE);
}
}
#endif
using namespace sycl;
#if MKL_LIB
using namespace oneapi::mkl;
#endif
int main(int argc, char *argv[]) {
size_t N = 1024;
size_t M = 32;
int VERIFY = 0;
int PRINT_OUTPUT_MATRIX = 0;
int arg;
while ((arg = getopt(argc, argv, "n:m:vph")) != -1)
switch (arg){
case 'n':
N = std::atoi(optarg);
break;
case 'm':
M = std::atoi(optarg);
break;
case 'v':
VERIFY = 1;
break;
case 'p':
PRINT_OUTPUT_MATRIX = 1;
break;
case 'h':
std::cout << std::endl;
std::cout << "Usage : ./a.out -n <MATRIX_SIZE> -m <WORK_GROUP_SIZE> -v -p\n\n";
std::cout << " [-n] size for matrix, eg: 1024\n";
std::cout << " [-m] size of work_group, eg: 8/16\n";
std::cout << " [-v] verify output with linear computation on cpu\n";
std::cout << " [-p] print output matrix\n";
std::cout << "Example : ./a.out -n 1024 -m 16 -v -p\n\n";
std::exit(0);
}
//# Define vectors for matrices
std::vector<float> matrix_a(N*N);
std::vector<float> matrix_b(N*N);
std::vector<float> matrix_c(N*N);
std::vector<float> matrix_d(N*N);
//# Initialize matrices with values
float v1 = 2.f;
float v2 = 3.f;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++){
matrix_a[i*N+j] = v1++;
matrix_b[i*N+j] = v2++;
matrix_c[i*N+j] = 0.f;
matrix_d[i*N+j] = 0.f;
}
//# Define queue with default device for offloading computation
queue q{property::queue::in_order{}};
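//# an in-order queue executes submissions in submission order, so the USM
//# memcpy -> gemm -> memcpy sequence below needs no explicit event dependencies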
// First we warm-up the device
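// (the first submission pays JIT compilation and device initialization costs,
//  so we run a throwaway multiplication to keep them out of the timed section)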
std::cout << "Warm-up first" << "\n";
{
//# Create buffers for matrices
buffer<float, 1> a(matrix_a.data(), range<1>(N*N));
buffer<float, 1> b(matrix_b.data(), range<1>(N*N));
buffer<float, 1> c(matrix_c.data(), range<1>(N*N));
//# Submit command groups to execute on device
q.submit([&](handler &h){
//# Create accessors to copy buffers to the device
auto A = a.get_access<access::mode::read>(h);
auto B = b.get_access<access::mode::read>(h);
auto C = c.get_access<access::mode::write>(h);
//# Define size for ND-Range and work-group size
range<2> global_size(N,N);
range<2> work_group_size(M,M);
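//# Note: N must be divisible by M and M*M must not exceed the device's
//# max_work_group_size, or this nd_range launch is invalid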
//# Parallel Compute Matrix Multiplication
h.parallel_for(nd_range<2>{global_size, work_group_size}, [=](nd_item<2> item){
const int i = item.get_global_id(0);
const int j = item.get_global_id(1);
//# Use private mem to store intermediate result
float temp=0.f;
for (int k = 0; k < N; k++) {
temp += A[i*N+k] * B[k*N+j];
}
C[i*N+j] = temp;
});
});
} // warm-up done
//# Initialize matrices with values
v1 = 2.f;
v2 = 3.f;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++){
matrix_a[i*N+j] = v1++;
matrix_b[i*N+j] = v2++;
matrix_c[i*N+j] = 0.f;
matrix_d[i*N+j] = 0.f;
}
auto start = std::chrono::high_resolution_clock::now().time_since_epoch().count();
std::cout << "Offload Device : " << q.get_device().get_info<info::device::name>() << "\n";
std::cout << "max_work_group_size : " << q.get_device().get_info<info::device::max_work_group_size>() << "\n";
std::cout << "Configuration : MATRIX_SIZE= " << N << "x" << N << "\n";
float* dev_a = sycl::malloc_device<float>(N*N, q);
float* dev_b = sycl::malloc_device<float>(N*N, q);
float* dev_c = sycl::malloc_device<float>(N*N, q);
q.memcpy(dev_a, matrix_a.data(), N*N*sizeof(float)).wait();
q.memcpy(dev_b, matrix_b.data(), N*N*sizeof(float)).wait();
//# gemm computes C = alpha*op(A)*op(B) + beta*C; with beta = 1 the routine reads C,
//# so dev_c must be initialized (here with the zeroed host matrix) before the call
q.memcpy(dev_c, matrix_c.data(), N*N*sizeof(float)).wait();
//# scalar multipliers
float alpha = 1.f, beta = 1.f;
#if MKL_LIB // uses mkl blas
//# transpose status of matrices for oneMKL
oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
oneapi::mkl::transpose transB = oneapi::mkl::transpose::nontrans;
//# Submit MKL library call to execute on device
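//# oneMKL gemm assumes column-major storage; our data is row-major, so A and B
//# are passed swapped: computing B*A on the transposed views yields row-major A*B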
blas::gemm(q, transA, transB, N, N, N, alpha, dev_b, N, dev_a, N, beta, dev_c, N);
q.wait();
#endif
#if CUBLAS
// Create cublas handle
cublasHandle_t handle;
CHECK_ERROR(cublasCreate(&handle));
#if ACPP
std::cout << "\n"<< "Running with ACPP interoperability. \n";
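// AdaptiveCpp_enqueue_custom_operation is an AdaptiveCpp extension: the lambda is
// executed asynchronously as part of the SYCL task graph, so the cuBLAS call it
// enqueues on the native stream stays ordered with the rest of this in-order queue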
q.submit([&](handler &cgh) {
cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle &ih) {
// Set the correct stream
auto cuStream = ih.get_native_queue<sycl::backend::cuda>();
cublasSetStream(handle, cuStream);
// Call generalised matrix-matrix multiply; cuBLAS is column-major,
// so B and A are passed swapped to produce the row-major product A*B
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N,N,
N, &alpha, dev_b, N, dev_a, N, &beta,
dev_c, N));
});
}).wait();
#endif
#if DPCPP
std::cout << "\n" << "Warning!!!\n" << " The DPC++ & CUDA interop path relies on a number of assumptions:\n in-order queues,\n no event- or buffer-based dependencies,\n no frequent switching between multiple devices,\n and the stars aligning properly.\n\n";
q.submit([&](handler &h) {
h.host_task([=](sycl::interop_handle ih) {
// Set the correct cuda context & stream
cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
cublasSetStream(handle, cuStream);
// Call generalised matrix-matrix multiply
CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N,N,
N, &alpha, dev_b, N, dev_a, N, &beta,
dev_c, N));
// Block until the stream finishes so the asynchronous cublasSgemm
// is complete before this host task returns
cuStreamSynchronize(cuStream);
});
}).wait();
#endif
#endif
q.memcpy(matrix_c.data(), dev_c, N*N*sizeof(float)).wait();
// Note: this assumes high_resolution_clock ticks are nanoseconds, which holds on
// common implementations; std::chrono::duration_cast would be more portable
auto duration = std::chrono::high_resolution_clock::now().time_since_epoch().count() - start;
std::cout << "\tCompute Duration : " << duration / 1e+9 << " seconds\n";
//# Print Output
if (PRINT_OUTPUT_MATRIX){
for (int i=0; i<N; i++){
for (int j=0; j<N; j++){
std::cout << matrix_c[i*N+j] << " ";
}
std::cout << "\n";
}
} else {
std::cout << " [0][0] = " << matrix_c[0] << "\n";
}
//# Compute local and compare with offload computation
if (VERIFY){
int fail = 0;
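//# Note: this is an exact float comparison; for large N the device gemm and this
//# naive triple loop can round differently, so a tolerance check may be more robust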
for(int i=0; i<N; i++){
for (int j = 0; j < N; j++) {
for(int k=0; k<N; k++){
matrix_d[i*N+j] += matrix_a[i*N+k] * matrix_b[k*N+j];
}
if(matrix_c[i*N+j] != matrix_d[i*N+j]) fail = 1;
}
}
if(fail == 1){
std::cout << "FAIL\n";
} else {
std::cout << "PASS\n";
}
}
free(dev_a, q);
free(dev_b, q);
free(dev_c, q);
}