Skip to content

Commit 4d9f40c

Browse files
authored
func(omp): make repeated reduce function more flexible (#125)
* func(omp): set number of threads from input for repeated reduce
* func(omp): make repeated reduce function more flexible
1 parent 23da49b commit 4d9f40c

File tree

2 files changed

+61
-16
lines changed

2 files changed

+61
-16
lines changed

.gitignore

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
build-wasm/
2-
build-native/
3-
build-native-shared/
1+
build-*/
42
compile_commands.json
53
.clangd/
64

func/omp/repeated_reduce.cpp

+60-13
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
1+
#include <faasm/input.h>
12
#include <faasm/shared_mem.h>
23

34
#include <cstdio>
45
#include <math.h>
56
#include <omp.h>
67
#include <unistd.h>
8+
#include <vector>
79

8-
bool doReduce()
10+
// This reduce method is called with a varying number of threads, but with
11+
// a maximum of 10. In addition, the inner parallel for pragma may be
12+
// elastically scaled from numThreads, all the way up to 10.
13+
bool doReduce(int numThreads)
914
{
10-
int nThreads = 10;
1115
int chunkSize = 1000;
12-
int loopSize = nThreads * chunkSize;
13-
int counts[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
16+
int loopSize = numThreads * chunkSize;
17+
int maxNumThreads = 10;
18+
std::vector<int> counts(maxNumThreads, 0);
1419

1520
int reducedA = 0;
1621
int reducedB = 0;
@@ -20,7 +25,7 @@ bool doReduce()
2025
FAASM_REDUCE(reducedA, FAASM_TYPE_INT, FAASM_OP_SUM)
2126
FAASM_REDUCE(reducedB, FAASM_TYPE_INT, FAASM_OP_SUM)
2227

23-
#pragma omp parallel for num_threads(nThreads) default(none) \
28+
#pragma omp parallel for num_threads(numThreads) default(none) \
2429
shared(counts, loopSize, success) reduction(+ : reducedA, reducedB)
2530
for (int i = 0; i < loopSize; i++) {
2631
int threadNum = omp_get_thread_num();
@@ -49,17 +54,53 @@ bool doReduce()
4954
return 1;
5055
}
5156

52-
// Check counts
53-
for (int t = 0; t < nThreads; t++) {
54-
if (counts[t] != chunkSize) {
55-
printf(
56-
"Loop count for thread %i: %i != %i\n", t, counts[t], chunkSize);
57+
// First, work out how many threads actually executed the loop, by checking
58+
// how many threads wrote to the counts array
59+
int actualNumThreads = 0;
60+
for (int i = 0; i < counts.size(); i++) {
61+
if (counts.at(i) != 0) {
62+
actualNumThreads++;
63+
}
64+
}
65+
66+
if ((actualNumThreads < numThreads) || (actualNumThreads > maxNumThreads)) {
67+
printf("Actual number of threads outside valid range: %i \\not \\in "
68+
"[%i, %i]\n",
69+
actualNumThreads,
70+
numThreads,
71+
maxNumThreads);
72+
73+
// Exit early in this case, as the subsequent checks may seg-fault
74+
return false;
75+
}
76+
77+
// Check counts (only count the aggregate, and a uniform distribution, as
78+
// we may elastically change the parallelism of the loop)
79+
int actualChunkSize = (int)loopSize / actualNumThreads;
80+
int total = 0;
81+
for (int tNum = 0; tNum < actualNumThreads; tNum++) {
82+
if (counts[tNum] != actualChunkSize) {
83+
printf("Loop count for thread %i: %i != %i\n",
84+
tNum,
85+
counts[tNum],
86+
actualChunkSize);
5787
success = false;
5888
}
89+
90+
total += counts[tNum];
5991
}
6092

61-
int expectedFinalReducedA = 550000;
62-
int expectedFinalReducedB = 825000;
93+
if (total != loopSize) {
94+
printf("Total loop count failed: %i != %i\n", total, loopSize);
95+
success = false;
96+
}
97+
98+
// The expected final value is: constant (10 for A, 15 for B) *
99+
// (sum [1, actualNumThreads]) * actualChunkSize
100+
int expectedFinalReducedA =
101+
(int)10 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
102+
int expectedFinalReducedB =
103+
(int)15 * actualNumThreads * (actualNumThreads + 1) / 2 * actualChunkSize;
63104

64105
if (reducedA != expectedFinalReducedA) {
65106
printf("reducedA %i != %i\n", reducedA, expectedFinalReducedA);
@@ -76,10 +117,16 @@ bool doReduce()
76117

77118
int main(int argc, char* argv[])
78119
{
120+
int numThreads = faasm::getIntInput();
121+
if (numThreads <= 0) {
122+
printf("Incorrect number of threads passed as input: %i\n", numThreads);
123+
return 1;
124+
}
125+
79126
// Run reduce in a loop and check each iteration is correct
80127
int nLoops = 10;
81128
for (int i = 0; i < nLoops; i++) {
82-
bool success = doReduce();
129+
bool success = doReduce(numThreads);
83130
if (!success) {
84131
printf("Repeated reduce failed on loop %i\n", i);
85132
return 1;

0 commit comments

Comments
 (0)