@@ -195,7 +195,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
195
195
// step 1
196
196
MPI_Comm_size (col_comm, &communicatorSize);
197
197
MPI_Comm_rank (col_comm, &communicatorRank);
198
-
199
198
const int32_t intLdSize = ilogbf (static_cast <float >(communicatorSize)); // integer log_2 of size
200
199
const int32_t power2intLdSize = 1 << intLdSize; // 2^n
201
200
const int32_t residuum = communicatorSize - power2intLdSize;
@@ -205,7 +204,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
205
204
{
206
205
return (oldr < (residuum << 1 )) ? (oldr >> 1 ) : oldr - residuum;
207
206
};
208
-
209
207
const function <int32_t (int32_t )> oldRank = [&residuum](uint32_t newr)
210
208
{
211
209
return (newr < residuum) ? (newr << 1 ) : newr + residuum;
@@ -219,7 +217,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::allReduceBitCompressed(typename STO
219
217
MPI_Sendrecv (owenmap, psize, bm_type, communicatorRank + 1 , 0 , tmpmap, psize, bm_type, communicatorRank + 1 , 0 ,
220
218
col_comm, &status);
221
219
222
- std::cout << " allBitmap for-loop size (psize) " << psize << std::endl;
223
220
for (int32_t i = 0 ; i < psize; ++i)
224
221
{
225
222
tmpmap[i] &= ~owenmap[i];
@@ -246,7 +243,6 @@ std::cout << "allBitmap for-loop size (psize) " << psize << std::endl;
246
243
MPI_Sendrecv (owenmap, psize, bm_type, communicatorRank - 1 , 0 , tmpmap, psize, bm_type, communicatorRank - 1 , 0 ,
247
244
col_comm, &status);
248
245
249
- std::cout << " allBitmap for-loop size (psize2) " << psize2 << std::endl;
250
246
for (int32_t i = 0 ; i < psize; ++i)
251
247
{
252
248
tmpmap[i] = ~tmpmap[i] & owenmap[i];
@@ -275,7 +271,7 @@ std::cout << "allBitmap for-loop size (psize2) " << psize2 << std::endl;
275
271
ssize = psize;
276
272
const int32_t vrank = newRank (communicatorRank);
277
273
offset = 0 ;
278
- std::cout << " allBitmap for-loop size (intLdSize) " << intLdSize << std::endl;
274
+ // intLdSize: ~2 to 4 iterations (scale 22, 16 gpus)
279
275
for (int32_t it = 0 ; it < intLdSize; ++it)
280
276
{
281
277
int32_t orankEven, orankOdd, iterator2, iterator3;
@@ -295,21 +291,29 @@ std::cout << "allBitmap for-loop size (intLdSize) " << intLdSize << std::endl;
295
291
tmpmap + offset, ssize, bm_type, orankEven, iterator2,
296
292
col_comm, &status);
297
293
298
- std::cout << " allBitmap for-loop size (lowers) " << lowers << std::endl;
294
+ // lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
295
+ #ifdef _OPENMP
296
+ #pragma omp parallel for
297
+ #endif
299
298
for (int32_t i = 0 ; i < lowers; ++i)
300
299
{
301
300
const int32_t iOffset = i + offset;
302
301
tmpmap[iOffset] &= ~owenmap[iOffset];
303
302
owenmap[iOffset] |= tmpmap[iOffset];
304
303
}
304
+
305
+ // lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
306
+ #ifdef _OPENMP
307
+ #pragma omp parallel for
308
+ #endif
305
309
for (int32_t i = lowers; i < ssize; ++i)
306
310
{
307
311
const int32_t iOffset = i + offset;
308
312
tmpmap[iOffset] = (~tmpmap[iOffset]) & owenmap[iOffset];
309
313
}
310
314
311
- std::cout << " allBitmap for-loop size (uppers) " << uppers << std::endl;
312
315
// Generation of foreign updates
316
+ // uppers: ~65k iterations per MPI node (scale 22, 16 gpus)
313
317
int32_t p = 0 ;
314
318
for (int32_t i = 0 ; i < uppers; ++i)
315
319
{
@@ -332,9 +336,9 @@ std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
332
336
orankEven, iterator3,
333
337
col_comm, &status);
334
338
335
- std::cout << " allBitmap for-loop size (lowers) " << lowers << std::endl;
336
339
// Updates for own data
337
340
p = 0 ;
341
+ // lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
338
342
for (int32_t i = 0 ; i < lowers; ++i)
339
343
{
340
344
const int32_t iOffset = i + offset;
@@ -359,21 +363,28 @@ std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
359
363
orankOdd, iterator2,
360
364
col_comm, &status);
361
365
362
- std::cout << " allBitmap for-loop size (lowers) " << lowers << std::endl;
366
+ // lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
367
+ #ifdef _OPENMP
368
+ #pragma omp parallel for
369
+ #endif
363
370
for (int32_t i = 0 ; i < lowers; ++i)
364
371
{
365
372
const int32_t iOffset = i + offset;
366
373
tmpmap[iOffset] = (~tmpmap[iOffset]) & owenmap[iOffset];
367
374
}
368
375
369
- std::cout << " allBitmap for-loop size (lowers) " << lowers << std::endl;
376
+ // lowers: ~65k iterations per MPI node (scale 22, 16 gpus)
377
+ #ifdef _OPENMP
378
+ #pragma omp parallel for
379
+ #endif
370
380
for (int32_t i = lowers; i < ssize; ++i)
371
381
{
372
382
const int32_t iOffset = i + offset;
373
383
tmpmap[iOffset] &= ~owenmap[iOffset];
374
384
owenmap[iOffset] |= tmpmap[iOffset];
375
385
}
376
386
// Generation of foreign updates
387
+ // inner p: ~50k iterations per MPI node (scale 22, 16 gpus)
377
388
int32_t p = 0 ;
378
389
for (int32_t i = 0 ; i < lowers; ++i)
379
390
{
@@ -388,7 +399,6 @@ std::cout << "allBitmap for-loop size (lowers) " << lowers << std::endl;
388
399
tmpm ^= (1 << last);
389
400
}
390
401
}
391
- std::cout << " allBitmap for-loop size (p O(n^2)) " << p << std::endl;
392
402
393
403
// Transmission of updates
394
404
MPI_Sendrecv (tmp, p, fq_tp_type,
@@ -397,8 +407,8 @@ std::cout << "allBitmap for-loop size (p O(n^2)) " << p << std::endl;
397
407
orankOdd, iterator3,
398
408
col_comm, &status);
399
409
400
- std::cout << " allBitmap for-loop size (uppers) " << uppers << std::endl;
401
410
// Updates for own data
411
+ // inner p: ~50k iterations per MPI node (scale 22, 16 gpus)
402
412
p = 0 ;
403
413
for (int32_t i = 0 ; i < uppers; ++i)
404
414
{
@@ -440,7 +450,6 @@ std::cout << "allBitmap for-loop size (uppers) " << uppers << std::endl;
440
450
sizes[lastTargetNode] = (psize >> intLdSize) * mtypesize;
441
451
disps[lastTargetNode] = 0 ;
442
452
443
- std::cout << " allBitmap for-loop size (power2intLdSize) " << power2intLdSize << std::endl;
444
453
for (int slice = 1 ; slice < power2intLdSize; ++slice)
445
454
{
446
455
const uint32_t reversedSliceIDs = reverse (slice, intLdSize);
@@ -492,7 +501,6 @@ void GlobalBFS<Derived, FQ_T, MType, STORE>::generatOwenMask()
492
501
#pragma omp for schedule (guided, OMP_CHUNK)
493
502
#endif
494
503
495
- std::cout << " allBitmap for-loop size (mask_size) " << mask_size << std::endl;
496
504
497
505
for (int64_t i = 0L ; i < mask_size; ++i)
498
506
{
0 commit comments