Improved implementation of OpenMP Kahan summation without global thre…

…ad-local kc and ks. Instead, an accumulation is done while still in the parallel section and this is finally summed up to yield the result. This reduces the differences between the two implementations. OpenMP implementation of BGQ scalar_prod_r with Kahan summation
Meinersbur · Sep 26, 2012 · 3e18dfd · 3e18dfd
1 parent fc301ab
commit 3e18dfd
Show file tree

Hide file tree

Showing 8 changed files with 131 additions and 147 deletions.
diff --git a/clover_leaf.c b/clover_leaf.c
@@ -410,22 +410,27 @@ inline void add_tm(_Complex double a[6][6], const double mu) {
 }
 
 double sw_trace(const int ieo, const double mu) {
-  double ALIGN ks, kc;
-  ks = kc = 0.0;
+  double ALIGN res = 0.0;
+#ifdef MPI
+  double ALIGN mres = 0.0;
+#endif
+
 #ifdef OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
-  g_omp_ks_re[thread_num] = g_omp_kc_re[thread_num] = 0.0;
+  g_omp_acc_re[thread_num] = 0.0;
 #endif
 
   int i,x,ioff;
   su3 ALIGN v;
   _Complex double ALIGN a[6][6];
   double ALIGN tra;
-  double ALIGN tr,ts,tt;
+  double ALIGN ks,kc,tr,ts,tt;
   _Complex double ALIGN det;
 
+  ks = kc = 0.0;
+
   if(ieo==0) {
     ioff=0;
   } 
@@ -434,7 +439,7 @@ double sw_trace(const int ieo, const double mu) {
   }
 
 #ifdef OMP
-#pragma omp for schedule(static)
+#pragma omp for
 #endif
   for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
     x = g_eo2lexic[icx];
@@ -452,36 +457,32 @@ double sw_trace(const int ieo, const double mu) {
       tra = log(conj(det)*det);
       // we need to compute only the one with +mu
       // the one with -mu must be the complex conjugate!
-#ifdef OMP
-      tr=tra+g_omp_kc_re[thread_num];
-      ts=tr+g_omp_ks_re[thread_num];
-      tt=ts-g_omp_ks_re[thread_num];
-      g_omp_ks_re[thread_num]=ts;
-      g_omp_kc_re[thread_num]=tr-tt;
-#else
+
       tr=tra+kc;
       ts=tr+ks;
       tt=ts-ks;
       ks=ts;
       kc=tr-tt;
-#endif
     }
   }
+  kc=ks+kc;
+
 #ifdef OMP
-  g_omp_kc_re[thread_num] = g_omp_ks_re[thread_num] + g_omp_kc_re[thread_num];
+  g_omp_acc_re[thread_num] = kc;
   } /* OpenMP parallel closing brace */
 
   for(int i = 0; i < omp_num_threads; ++i) {
-    kc += g_omp_kc_re[i];
+    res += g_omp_acc_re[i];
   }
 #else
-  kc=ks+kc;
+  res=kc;
 #endif
+
 #ifdef MPI
-  MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-  return(ks);
+  MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return(mres);
 #else
-  return(kc);
+  return(res);
 #endif
 
 }

diff --git a/global.h b/global.h
@@ -227,10 +227,8 @@ EXTERN int g_mpi_ST_rank;
 EXTERN int g_nb_list[8];
 
 /* OpenMP Kahan accumulation arrays */
-EXTERN _Complex double *g_omp_kc;
-EXTERN _Complex double *g_omp_ks;
-EXTERN double* g_omp_kc_re;
-EXTERN double* g_omp_ks_re;
+EXTERN _Complex double *g_omp_acc_cp;
+EXTERN double* g_omp_acc_re;
 
 /* Deflation information */
 EXTERN int g_dflgcr_flag;

diff --git a/init_omp_kahan_arrays.c → init_omp_accumulators.c b/init_omp_kahan_arrays.c → init_omp_accumulators.c
@@ -27,41 +27,27 @@
 #include <stdio.h>
 #include <errno.h>
 #include "global.h"
-#include "init_omp_kahan_arrays.h"
+#include "init_omp_accumulators.h"
 
-int init_omp_kahan_arrays(const int num) {
-  g_omp_ks=NULL;
-  g_omp_kc=NULL;
-  g_omp_ks_re=NULL;
-  g_omp_kc_re=NULL;
+int init_omp_accumulators(const int num) {
+  g_omp_acc_cp=NULL;
+  g_omp_acc_re=NULL;
 
-  if((void*)(g_omp_ks = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
-    printf ("init_omp_kahan_arrays malloc errno : %d\n",errno); 
+  if((void*)(g_omp_acc_cp = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
+    printf ("init_omp_accumulators malloc errno : %d\n",errno); 
     errno = 0;
     return(1);
   }
-  if((void*)(g_omp_kc = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
-    printf ("init_omp_kahan_arrays malloc errno : %d\n",errno); 
+  if((void*)(g_omp_acc_re = (double*)malloc(num*sizeof(double))) == NULL) {
+    printf ("init_omp_accumulators malloc errno : %d\n",errno); 
     errno = 0;
     return(2);
   }
-  if((void*)(g_omp_ks_re = (double*)malloc(num*sizeof(double))) == NULL) {
-    printf ("init_omp_kahan_arrays malloc errno : %d\n",errno); 
-    errno = 0;
-    return(3);
-  }
-  if((void*)(g_omp_kc_re = (double*)malloc(num*sizeof(double))) == NULL) {
-    printf ("init_omp_kahan_arrays malloc errno : %d\n",errno); 
-    errno = 0;
-    return(4);
-  }
 
   return(0);
 }
 
-void free_omp_kahan_arrays() {
-  free(g_omp_kc);
-  free(g_omp_ks);
-  free(g_omp_kc_re);
-  free(g_omp_ks_re);
+void free_omp_accumulators() {
+  free(g_omp_acc_cp);
+  free(g_omp_acc_re);
 }
diff --git a/init_omp_kahan_arrays.h → init_omp_accumulators.h b/init_omp_kahan_arrays.h → init_omp_accumulators.h
@@ -16,10 +16,10 @@
  * You should have received a copy of the GNU General Public License
  * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
  ***********************************************************************/
-#ifndef _INIT_OMP_KAHAN_ARRAYS_H
-#define _INIT_OMP_KAHAN_ARRAYS_H
+#ifndef _INIT_OMP_ACCUMULATORS_H
+#define _INIT_OMP_ACCUMULATORS_H
 
-int init_omp_kahan_arrays(int num);
-void free_omp_kahan_arrays();
+int init_omp_accumulators(int num);
+void free_omp_accumulators();
 
 #endif
diff --git a/linalg/scalar_prod.c b/linalg/scalar_prod.c
@@ -32,26 +32,31 @@
 #include "scalar_prod.h"
 
 /*  <S,R>=S^* times R */
-_Complex double scalar_prod(spinor * const S, spinor * const R, const int N, const int parallel) {
-  _Complex double ALIGN ks,kc;
-  ks = kc = 0.0;
+_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel) {
+  _Complex double ALIGN res = 0.0;
+#ifdef MPI
+  _Complex double ALIGN mres = 0.0;
+#endif
+
 #ifdef OMP
 #pragma omp parallel
   {
   int thread_num = omp_get_thread_num();
-  g_omp_kc[thread_num] = 0.0;
-  g_omp_ks[thread_num] = 0.0;
+  g_omp_acc_cp[thread_num] = 0.0;
 #endif
 
-  _Complex double ALIGN ds,tr,ts,tt;
+  _Complex double ALIGN ds,tr,ts,tt,ks,kc;
   spinor *s,*r;
 
+  ks = kc = 0.0;
+
 #if (defined BGL && defined XLC)
   __alignx(16, S);
   __alignx(16, R);
 #endif
+
 #ifdef OMP
-#pragma omp for schedule(static)
+#pragma omp for
 #endif
   for (int ix = 0; ix < N; ix++)
   {
@@ -64,43 +69,35 @@ _Complex double scalar_prod(spinor * const S, spinor * const R, const int N, con
          r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);
 
     /* Kahan Summation */
-#ifdef OMP
-    tr = ds + g_omp_kc[thread_num];
-    ts = tr + g_omp_ks[thread_num];
-    tt = ts - g_omp_ks[thread_num];
-    g_omp_ks[thread_num] = ts;
-    g_omp_kc[thread_num] = tr - tt;
-#else
     tr=ds+kc;
     ts=tr+ks;
     tt=ts-ks;
     ks=ts;
     kc=tr-tt;
-#endif
   }
+  kc=ks+kc;
 
 #ifdef OMP
-  /* thread-local last step of the Kahan summation */
-  g_omp_kc[thread_num] = g_omp_ks[thread_num] + g_omp_kc[thread_num];
+  g_omp_acc_cp[thread_num] = kc;
 
   } /* OpenMP closing brace */
 
   /* having left the parallel section, we can now sum up the Kahan
      corrected sums from each thread into kc */
   for(int i = 0; i < omp_num_threads; ++i)
-    kc += g_omp_kc[i];
+    res += g_omp_acc_cp[i];
 #else
-  kc=ks+kc;
+  res=kc;
 #endif
 
 #ifdef MPI
   if(parallel == 1)
   {
-    MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
-    return(ks);
+    MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
+    return(mres);
   }
 #endif
-  return(kc);
+  return(res);
 }
 
 #ifdef WITHLAPH