Skip to content

Commit f7b707d

Browse files
committed
correct scoreP compilation. add new diagram generator files
1 parent d1c0cee commit f7b707d

File tree

10 files changed

+2155
-18
lines changed

10 files changed

+2155
-18
lines changed

cpu_2d/Makefile.am

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ endif
4747
endif
4848

4949
if NVIDIA_ARCHITECTURE_FERMI
50-
NV_ARCHITECTURE=$(PTXAS) -arch=sm_20 $(NVFLAGS)
50+
NV_ARCHITECTURE=$(PTXAS) -arch=sm_20 $(NVFLAGS)
5151
else
5252
if NVIDIA_ARCHITECTURE_KEPLER
5353
NV_ARCHITECTURE=$(PTXAS) -arch=sm_35 $(NVFLAGS)
@@ -70,12 +70,15 @@ else
7070
THREADS=none
7171
endif
7272
endif
73-
scorep_instrumentation_flag=--user --cuda --mpp=mpi --thread=$(THREADS) --nopdt --online-access --instrument-filter=\"filter.scorep\"
73+
PROFILER=@SCOREP_EXEC@
74+
scorep_instrumentation_flag=--user --cuda --mpp=mpi --thread=$(THREADS) --nopdt --online-access
75+
# --instrument-filter=\"filter.scorep\"
7476
SCOREP_INSTRUMENTATION=-D_SCOREP_USER_INSTRUMENTATION -DSCOREP_USER_ENABLE
7577
LDFLAGS_SCOREP=@SCOREP_LFLAG@
7678
LLIBS_SCOREP=@SCOREP_LLIBS@
7779
SCOREP_CXXCPPFLAGS=@SCOREP_CPPFLAG@
7880
else
81+
PROFILER=
7982
libcudart_static_path=@CUDA_LDFLAGS@
8083
libcudart_static_flag=@CUDA_LIBS@
8184
endif
@@ -180,7 +183,7 @@ AM_CFLAGS=@CC_MBITS@ @CFLAGS@ @WARN_CFLAGS@
180183
#
181184

182185
NVCXX=@NVCC@
183-
AM_LD=$(PROFILER) @MPICXX@
186+
AM_LD=$(PROFILER) $(scorep_instrumentation_flag) @MPICXX@
184187

185188

186189
### List of sources to compile

cpu_2d/distmatrix2d.hh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
#include <cstdlib> // only for values of packed_edge typ!!!
1818
#include "config.h"
1919

20-
#ifdef _DISABLED_OPENMP
20+
#ifdef _OPENMP
2121
#include <omp.h>
2222
#define OMP_CHUNK 4
2323
#endif

cpu_2d/globalbfs.hh

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
195195
//step 1
196196
MPI_Comm_size(col_comm, &communicatorSize);
197197
MPI_Comm_rank(col_comm, &communicatorRank);
198-
199198
const int32_t intLdSize = ilogbf(static_cast<float>(communicatorSize)); //integer log_2 of size
200199
const int32_t power2intLdSize = 1 << intLdSize; // 2^n
201200
const int32_t residuum = communicatorSize - power2intLdSize;
@@ -205,7 +204,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
205204
{
206205
return (oldr < (residuum << 1)) ? (oldr >> 1) : oldr - residuum;
207206
};
208-
209207
const function <int32_t(int32_t)> oldRank = [&residuum](uint32_t newr)
210208
{
211209
return (newr < residuum) ? (newr << 1) : newr + residuum;
@@ -219,7 +217,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
219217
MPI_Sendrecv(owenmap, psize, bm_type, communicatorRank + 1, 0, tmpmap, psize, bm_type, communicatorRank + 1, 0,
220218
col_comm, &status);
221219

222-
std::cout << "allBitmap for-loop size (psize) " << psize << std::endl;
223220
for (int32_t i = 0; i < psize; ++i)
224221
{
225222
tmpmap[i] &= ~owenmap[i];
@@ -246,7 +243,6 @@ std::cout << "allBitmap for-loop size (psize) " << psize << std::endl;
246243
MPI_Sendrecv(owenmap, psize, bm_type, communicatorRank - 1, 0, tmpmap, psize, bm_type, communicatorRank - 1, 0,
247244
col_comm, &status);
248245

249-
std::cout << "allBitmap for-loop size (psize2) " << psize2 << std::endl;
250246
for (int32_t i = 0; i < psize; ++i)
251247
{
252248
tmpmap[i] = ~tmpmap[i] & owenmap[i];
@@ -275,7 +271,7 @@ std::cout << "allBitmap for-loop size (psize2) " << psize2 << std::endl;
275271
ssize = psize;
276272
const int32_t vrank = newRank(communicatorRank);
277273
offset = 0;
278-
std::cout << "allBitmap for-loop size (intLdSize) " << intLdSize << std::endl;
274+
// intLdSize: ~2 to 4 iterations (scale 22, 16 gpus)
279275
for (int32_t it = 0; it < intLdSize; ++it)
280276
{
281277
int32_t orankEven, orankOdd, iterator2, iterator3;
@@ -295,21 +291,29 @@ std::cout << "allBitmap for-loop size (intLdSize) " << intLdSize << std::endl;
295291
tmpmap + offset, ssize, bm_type, orankEven, iterator2,
296292
col_comm, &status);
297293

298-
std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
294+
// lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
295+
#ifdef _OPENMP
296+
#pragma omp parallel for
297+
#endif
299298
for (int32_t i = 0; i < lowers; ++i)
300299
{
301300
const int32_t iOffset = i + offset;
302301
tmpmap[iOffset] &= ~owenmap[iOffset];
303302
owenmap[iOffset] |= tmpmap[iOffset];
304303
}
304+
305+
// lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
306+
#ifdef _OPENMP
307+
#pragma omp parallel for
308+
#endif
305309
for (int32_t i = lowers; i < ssize; ++i)
306310
{
307311
const int32_t iOffset = i + offset;
308312
tmpmap[iOffset] = (~tmpmap[iOffset]) & owenmap[iOffset];
309313
}
310314

311-
std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
312315
//Generation of foreign updates
316+
// uppers: ~65k iterations per MPI node (scale 22, 16 gpus)
313317
int32_t p = 0;
314318
for (int32_t i = 0; i < uppers; ++i)
315319
{
@@ -332,9 +336,9 @@ std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
332336
orankEven, iterator3,
333337
col_comm, &status);
334338

335-
std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
336339
//Updates for own data
337340
p = 0;
341+
// lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
338342
for (int32_t i = 0; i < lowers; ++i)
339343
{
340344
const int32_t iOffset = i + offset;
@@ -359,21 +363,28 @@ std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
359363
orankOdd, iterator2,
360364
col_comm, &status);
361365

362-
std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
366+
// lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
367+
#ifdef _OPENMP
368+
#pragma omp parallel for
369+
#endif
363370
for (int32_t i = 0; i < lowers; ++i)
364371
{
365372
const int32_t iOffset = i + offset;
366373
tmpmap[iOffset] = (~tmpmap[iOffset]) & owenmap[iOffset];
367374
}
368375

369-
std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
376+
// lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
377+
#ifdef _OPENMP
378+
#pragma omp parallel for
379+
#endif
370380
for (int32_t i = lowers; i < ssize; ++i)
371381
{
372382
const int32_t iOffset = i + offset;
373383
tmpmap[iOffset] &= ~owenmap[iOffset];
374384
owenmap[iOffset] |= tmpmap[iOffset];
375385
}
376386
//Generation of foreign updates
387+
// inner p: ~50k iterations per MPI node (scale 22, 16 gpus)
377388
int32_t p = 0;
378389
for (int32_t i = 0; i < lowers; ++i)
379390
{
@@ -388,7 +399,6 @@ std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
388399
tmpm ^= (1 << last);
389400
}
390401
}
391-
std::cout << "allBitmap for-loop size (p O(n^2)) " << p << std::endl;
392402

393403
//Transmission of updates
394404
MPI_Sendrecv(tmp, p, fq_tp_type,
@@ -397,8 +407,8 @@ std::cout << "allBitmap for-loop size (p O(n^2)) " << p << std::endl;
397407
orankOdd, iterator3,
398408
col_comm, &status);
399409

400-
std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
401410
//Updates for own data
411+
// inner p: ~50k iterations per MPI node (scale 22, 16 gpus)
402412
p = 0;
403413
for (int32_t i = 0; i < uppers; ++i)
404414
{
@@ -440,7 +450,6 @@ std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
440450
sizes[lastTargetNode] = (psize >> intLdSize) * mtypesize;
441451
disps[lastTargetNode] = 0;
442452

443-
std::cout << "allBitmap for-loop size (power2intLdSize) " << power2intLdSize << std::endl;
444453
for (int slice = 1; slice < power2intLdSize; ++slice)
445454
{
446455
const uint32_t reversedSliceIDs = reverse(slice, intLdSize);
@@ -492,7 +501,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::generatOwenMask()
492501
#pragma omp for schedule (guided, OMP_CHUNK)
493502
#endif
494503

495-
std::cout << "allBitmap for-loop size (mask_size) " << mask_size << std::endl;
496504

497505
for (int64_t i = 0L; i < mask_size; ++i)
498506
{

0 commit comments

Comments
 (0)