Skip to content

Commit 2aa088d

Browse files
authored
Thrust → CUDA → PTX → SASS 🏋️‍♂️🏋️‍♀️ (#25)
* Add: Thrust, CUB, CUDA sorting This is a draft. It still lacks manual timing and async scheduling. * Add: Thrust, CUB, CUDA sorting This is a draft. It still lacks manual timing and async scheduling. * Make: Options for CUDA & TBB in CMake * Make: Switch to CUDA Toolkit for GPU libs * Fix: Ranges require `constexpr` on NVCC * Make: Upgrade `fmt` for NVCC builds fmtlib/fmt#4297 * Fix: NVCC compilation issues * Make: Silence NVCC warnings * Add: Sorting with `thrust` and `cub` * Add: PTX and `.cuh` kernels * Make: Don't compile PTX * Add: Using CUDA Driver API to JIT `.ptx` * Add: Precompiled CUDA C++ kernels * Add: cuBLAS benchmarks * Fix: Compiling `cuBLAS` calls * Fix: Avoid optimizing-out SASS code Unless we put an impossible condition with a `wmma::store_matrix_sync` the result of fragment multiplication is optimized out. * Add: Tensor Core intrinsic benchmarks Targeting `f16`, `bf16`, `tf16`, `f32`, `f64` on Volta, Turing, and Ampere. * Make: Build CUDA for multiple platforms Currently covering Volta, Turing, Ampere, Ada Lovelace, and Hopper. * Add: Binary BMMA kernels for GPU XOR variant for Turing+. AND variant for Ampere+. * Docs: Introduce Warp-Group-MMA on Hopper * Fix: Working PTX kernel * Fix: Lower PTX version for JIT * Fix: Use `f16` MMA * Make: Drop OpenBLAS
2 parents 3c45f6e + 3c92c36 commit 2aa088d

File tree

6 files changed

+1015
-80
lines changed

6 files changed

+1015
-80
lines changed

.vscode/settings.json

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,26 @@
66
"Andreas",
77
"ashvardanian",
88
"asynchrony",
9+
"bfloat",
910
"bioinformatics",
1011
"BLAS",
12+
"bmma",
1113
"Boccara",
1214
"bootcamps",
1315
"Byrne",
1416
"Cawley",
1517
"cblas",
18+
"CCCL",
1619
"clflush",
1720
"colsb",
1821
"consteval",
22+
"constexpr",
1923
"coro",
2024
"cplusplus",
2125
"cppcoro",
26+
"cstddef",
2227
"CTRE",
28+
"cublas",
2329
"CUDA",
2430
"denormal",
2531
"DOTPROD",
@@ -28,20 +34,26 @@
2834
"Eron",
2935
"excerise",
3036
"fconcepts",
37+
"fdividef",
3138
"Fedor",
3239
"Fugaku",
3340
"Giga",
3441
"Goodput",
3542
"GOPS",
43+
"GPGPU",
3644
"grandkids",
45+
"GTEST",
3746
"Hana",
47+
"HKSTU",
48+
"HMMA",
3849
"Ibireme",
3950
"JeanHeyd",
4051
"jemalloc",
4152
"Kulukundis",
4253
"Lelbach",
4354
"Lemire",
4455
"Lib",
56+
"LIBPFM",
4557
"libunifex",
4658
"Lohmann",
4759
"Maclaurin",
@@ -54,19 +66,27 @@
5466
"Niebler",
5567
"Niels",
5668
"nlohmann",
69+
"noexcept",
70+
"nullptr",
5771
"NVCC",
72+
"nvcuda",
5873
"openblas",
5974
"openmp",
6075
"Ormrod",
6176
"Peta",
6277
"Pikus",
6378
"pmf",
79+
"POPCOUNT",
6480
"popcountll",
81+
"Pranjal",
6582
"prefetcher",
6683
"pthread",
84+
"PTXAS",
6785
"RDMA",
6886
"reorderable",
87+
"Shankhdhar",
6988
"simdjson",
89+
"sinf",
7090
"SLEEF",
7191
"spdlog",
7292
"STREQUAL",
@@ -86,7 +106,11 @@
86106
"Vardanian",
87107
"vfmadd",
88108
"VNNI",
109+
"VPCLMULQDQ",
89110
"Weis",
111+
"WGMMA",
112+
"wmma",
113+
"Worklog",
90114
"XCOMP",
91115
"XFEATURE",
92116
"XTILE",
@@ -97,7 +121,10 @@
97121
"Zverovich"
98122
],
99123
"files.associations": {
124+
"*.evaluator": "cpp",
125+
"*.ipp": "cpp",
100126
"*.tcc": "cpp",
127+
"*.traits": "cpp",
101128
"__bit_reference": "cpp",
102129
"__bits": "cpp",
103130
"__config": "cpp",
@@ -114,12 +141,16 @@
114141
"__tree": "cpp",
115142
"__tuple": "cpp",
116143
"__verbose_abort": "cpp",
144+
"adolcforward": "cpp",
117145
"algorithm": "cpp",
146+
"alignedvector3": "cpp",
118147
"any": "cpp",
119148
"array": "cpp",
120149
"atomic": "cpp",
150+
"autodiff": "cpp",
121151
"bit": "cpp",
122152
"bitset": "cpp",
153+
"bvh": "cpp",
123154
"cctype": "cpp",
124155
"cfenv": "cpp",
125156
"charconv": "cpp",
@@ -130,6 +161,7 @@
130161
"complex": "cpp",
131162
"concepts": "cpp",
132163
"condition_variable": "cpp",
164+
"core": "cpp",
133165
"coroutine": "cpp",
134166
"cstdarg": "cpp",
135167
"cstddef": "cpp",
@@ -141,30 +173,38 @@
141173
"cwchar": "cpp",
142174
"cwctype": "cpp",
143175
"deque": "cpp",
176+
"eulerangles": "cpp",
144177
"exception": "cpp",
145178
"execution": "cpp",
146179
"expected": "cpp",
180+
"fft": "cpp",
147181
"format": "cpp",
148182
"forward_list": "cpp",
149183
"fstream": "cpp",
150184
"functional": "cpp",
151185
"initializer_list": "cpp",
186+
"inplace_vector": "cpp",
152187
"iomanip": "cpp",
153188
"ios": "cpp",
154189
"iosfwd": "cpp",
155190
"iostream": "cpp",
156191
"istream": "cpp",
157192
"iterator": "cpp",
193+
"kroneckerproduct": "cpp",
158194
"limits": "cpp",
159195
"list": "cpp",
160196
"locale": "cpp",
161197
"map": "cpp",
162198
"memory": "cpp",
163199
"memory_resource": "cpp",
200+
"mprealsupport": "cpp",
164201
"mutex": "cpp",
165202
"new": "cpp",
203+
"nnls": "cpp",
166204
"numbers": "cpp",
167205
"numeric": "cpp",
206+
"numericaldiff": "cpp",
207+
"openglsupport": "cpp",
168208
"optional": "cpp",
169209
"ostream": "cpp",
170210
"random": "cpp",
@@ -176,12 +216,15 @@
176216
"shared_mutex": "cpp",
177217
"source_location": "cpp",
178218
"span": "cpp",
219+
"specialfunctions": "cpp",
220+
"splines": "cpp",
179221
"sstream": "cpp",
180222
"stdexcept": "cpp",
181223
"stop_token": "cpp",
182224
"streambuf": "cpp",
183225
"string": "cpp",
184226
"string_view": "cpp",
227+
"superlusupport": "cpp",
185228
"system_error": "cpp",
186229
"text_encoding": "cpp",
187230
"thread": "cpp",
@@ -194,22 +237,16 @@
194237
"valarray": "cpp",
195238
"variant": "cpp",
196239
"vector": "cpp",
197-
"core": "cpp",
198-
"superlusupport": "cpp",
199-
"*.evaluator": "cpp",
200-
"*.traits": "cpp",
201-
"adolcforward": "cpp",
202-
"alignedvector3": "cpp",
203-
"autodiff": "cpp",
204-
"bvh": "cpp",
205-
"eulerangles": "cpp",
206-
"fft": "cpp",
207-
"kroneckerproduct": "cpp",
208-
"mprealsupport": "cpp",
209-
"nnls": "cpp",
210-
"numericaldiff": "cpp",
211-
"openglsupport": "cpp",
212-
"specialfunctions": "cpp",
213-
"splines": "cpp"
240+
"version": "cpp",
241+
"strstream": "cpp",
242+
"cinttypes": "cpp",
243+
"codecvt": "cpp",
244+
"csetjmp": "cpp",
245+
"csignal": "cpp",
246+
"hash_map": "cpp",
247+
"future": "cpp",
248+
"scoped_allocator": "cpp",
249+
"typeindex": "cpp",
250+
"*.inc": "cpp"
214251
}
215252
}

0 commit comments

Comments
(0)