1+ #include < faasm/input.h>
12#include < faasm/shared_mem.h>
23
34#include < cstdio>
45#include < math.h>
56#include < omp.h>
67#include < unistd.h>
8+ #include < vector>
79
8- bool doReduce ()
10+ // This reduce method is called with a varying number of threads, but with
11+ // a maximum of 10. In addition, the inner parallel for pragma may be
12+ // elastically scaled from nThreads, all the way up to 10.
13+ bool doReduce (int numThreads)
914{
10- int nThreads = 10 ;
1115 int chunkSize = 1000 ;
12- int loopSize = nThreads * chunkSize;
13- int counts[] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
16+ int loopSize = numThreads * chunkSize;
17+ int maxNumThreads = 10 ;
18+ std::vector<int > counts (maxNumThreads, 0 );
1419
1520 int reducedA = 0 ;
1621 int reducedB = 0 ;
@@ -20,7 +25,7 @@ bool doReduce()
2025 FAASM_REDUCE (reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
2126 FAASM_REDUCE (reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
2227
23- #pragma omp parallel for num_threads(nThreads ) default(none) \
28+ #pragma omp parallel for num_threads(numThreads ) default(none) \
2429 shared (counts, loopSize, success) reduction (+ : reducedA, reducedB)
2530 for (int i = 0 ; i < loopSize; i++) {
2631 int threadNum = omp_get_thread_num ();
@@ -49,17 +54,53 @@ bool doReduce()
4954 return 1 ;
5055 }
5156
52- // Check counts
53- for (int t = 0 ; t < nThreads; t++) {
54- if (counts[t] != chunkSize) {
55- printf (
56- " Loop count for thread %i: %i != %i\n " , t, counts[t], chunkSize);
57+ // First, work out how many threads actually executed the loop, by checking
58+ // how many threads wrote to the counts array
59+ int actualNumThreads = 0 ;
60+ for (int i = 0 ; i < counts.size (); i++) {
61+ if (counts.at (i) != 0 ) {
62+ actualNumThreads++;
63+ }
64+ }
65+
66+ if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
67+ printf (" Actual number of threads outside valid range: %i \\ not \\ in "
68+ " [%i, %i]\n " ,
69+ actualNumThreads,
70+ numThreads,
71+ maxNumThreads);
72+
73+ // Exit fast in this case as posterior checks may seg-fault
74+ return false ;
75+ }
76+
77+ // Check counts (only count the aggregate, and a uniform distribution, as
78+ // we may elastically change the parallelism of the loop)
79+ int actualChunkSize = (int )loopSize / actualNumThreads;
80+ int total = 0 ;
81+ for (int tNum = 0 ; tNum < actualNumThreads; tNum++) {
82+ if (counts[tNum] != actualChunkSize) {
83+ printf (" Loop count for thread %i: %i != %i\n " ,
84+ tNum,
85+ counts[tNum],
86+ actualChunkSize);
5787 success = false ;
5888 }
89+
90+ total += counts[tNum];
5991 }
6092
61- int expectedFinalReducedA = 550000 ;
62- int expectedFinalReducedB = 825000 ;
93+ if (total != loopSize) {
94+ printf (" Total loop count failed: %i != %i\n " , total, loopSize);
95+ success = false ;
96+ }
97+
98+ // The expected final value is: constant (10/15) * (sum [1, nThreads]) *
99+ // chunkSize
100+ int expectedFinalReducedA =
101+ (int )10 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
102+ int expectedFinalReducedB =
103+ (int )15 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
63104
64105 if (reducedA != expectedFinalReducedA) {
65106 printf (" reducedA %i != %i\n " , reducedA, expectedFinalReducedA);
@@ -76,10 +117,16 @@ bool doReduce()
76117
77118int main (int argc, char * argv[])
78119{
120+ int numThreads = faasm::getIntInput ();
121+ if (numThreads <= 0 ) {
122+ printf (" Incorrect number of threads passed as input: %i\n " , numThreads);
123+ return 1 ;
124+ }
125+
79126 // Run reduce in a loop and check each iteration is correct
80127 int nLoops = 10 ;
81128 for (int i = 0 ; i < nLoops; i++) {
82- bool success = doReduce ();
129+ bool success = doReduce (numThreads );
83130 if (!success) {
84131 printf (" Repeated reduce failed on loop %i\n " , i);
85132 return 1 ;
0 commit comments