1
+ #include < faasm/input.h>
1
2
#include < faasm/shared_mem.h>
2
3
3
4
#include < cstdio>
4
5
#include < math.h>
5
6
#include < omp.h>
6
7
#include < unistd.h>
8
+ #include < vector>
7
9
8
- bool doReduce ()
10
+ // This reduce method is called with a varying number of threads, but with
11
+ // a maximum of 10. In addition, the inner parallel for pragma may be
12
+ // elastically scaled from nThreads, all the way up to 10.
13
+ bool doReduce (int numThreads)
9
14
{
10
- int nThreads = 10 ;
11
15
int chunkSize = 1000 ;
12
- int loopSize = nThreads * chunkSize;
13
- int counts[] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 };
16
+ int loopSize = numThreads * chunkSize;
17
+ int maxNumThreads = 10 ;
18
+ std::vector<int > counts (maxNumThreads, 0 );
14
19
15
20
int reducedA = 0 ;
16
21
int reducedB = 0 ;
@@ -20,7 +25,7 @@ bool doReduce()
20
25
FAASM_REDUCE (reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
21
26
FAASM_REDUCE (reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
22
27
23
- #pragma omp parallel for num_threads(nThreads ) default(none) \
28
+ #pragma omp parallel for num_threads(numThreads ) default(none) \
24
29
shared (counts, loopSize, success) reduction (+ : reducedA, reducedB)
25
30
for (int i = 0 ; i < loopSize; i++) {
26
31
int threadNum = omp_get_thread_num ();
@@ -49,17 +54,53 @@ bool doReduce()
49
54
return 1 ;
50
55
}
51
56
52
- // Check counts
53
- for (int t = 0 ; t < nThreads; t++) {
54
- if (counts[t] != chunkSize) {
55
- printf (
56
- " Loop count for thread %i: %i != %i\n " , t, counts[t], chunkSize);
57
+ // First, work out how many threads actually executed the loop, by checking
58
+ // how many threads wrote to the counts array
59
+ int actualNumThreads = 0 ;
60
+ for (int i = 0 ; i < counts.size (); i++) {
61
+ if (counts.at (i) != 0 ) {
62
+ actualNumThreads++;
63
+ }
64
+ }
65
+
66
+ if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
67
+ printf (" Actual number of threads outside valid range: %i \\ not \\ in "
68
+ " [%i, %i]\n " ,
69
+ actualNumThreads,
70
+ numThreads,
71
+ maxNumThreads);
72
+
73
+ // Exit fast in this case as posterior checks may seg-fault
74
+ return false ;
75
+ }
76
+
77
+ // Check counts (only count the aggregate, and a uniform distribution, as
78
+ // we may elastically change the parallelism of the loop)
79
+ int actualChunkSize = (int )loopSize / actualNumThreads;
80
+ int total = 0 ;
81
+ for (int tNum = 0 ; tNum < actualNumThreads; tNum++) {
82
+ if (counts[tNum] != actualChunkSize) {
83
+ printf (" Loop count for thread %i: %i != %i\n " ,
84
+ tNum,
85
+ counts[tNum],
86
+ actualChunkSize);
57
87
success = false ;
58
88
}
89
+
90
+ total += counts[tNum];
59
91
}
60
92
61
- int expectedFinalReducedA = 550000 ;
62
- int expectedFinalReducedB = 825000 ;
93
+ if (total != loopSize) {
94
+ printf (" Total loop count failed: %i != %i\n " , total, loopSize);
95
+ success = false ;
96
+ }
97
+
98
+ // The expected final value is: constant (10/15) * (sum [1, nThreads]) *
99
+ // chunkSize
100
+ int expectedFinalReducedA =
101
+ (int )10 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
102
+ int expectedFinalReducedB =
103
+ (int )15 * actualNumThreads * (actualNumThreads + 1 ) / 2 * actualChunkSize;
63
104
64
105
if (reducedA != expectedFinalReducedA) {
65
106
printf (" reducedA %i != %i\n " , reducedA, expectedFinalReducedA);
@@ -76,10 +117,16 @@ bool doReduce()
76
117
77
118
int main (int argc, char * argv[])
78
119
{
120
+ int numThreads = faasm::getIntInput ();
121
+ if (numThreads <= 0 ) {
122
+ printf (" Incorrect number of threads passed as input: %i\n " , numThreads);
123
+ return 1 ;
124
+ }
125
+
79
126
// Run reduce in a loop and check each iteration is correct
80
127
int nLoops = 10 ;
81
128
for (int i = 0 ; i < nLoops; i++) {
82
- bool success = doReduce ();
129
+ bool success = doReduce (numThreads );
83
130
if (!success) {
84
131
printf (" Repeated reduce failed on loop %i\n " , i);
85
132
return 1 ;
0 commit comments