Skip to content

Commit

Permalink
Improved implementation of OpenMP Kahan summation without global thre…
Browse files Browse the repository at this point in the history
…ad-local kc and ks.

Instead, an accumulation is done while still in the parallel section and this is finally
summed up to yield the result. This reduces the differences between the two implementations.

OpenMP implementation of BGQ scalar_prod_r with Kahan summation
  • Loading branch information
kostrzewa committed Sep 26, 2012
1 parent fc301ab commit 3e18dfd
Show file tree
Hide file tree
Showing 8 changed files with 131 additions and 147 deletions.
39 changes: 20 additions & 19 deletions clover_leaf.c
Original file line number Diff line number Diff line change
Expand Up @@ -410,22 +410,27 @@ inline void add_tm(_Complex double a[6][6], const double mu) {
}

double sw_trace(const int ieo, const double mu) {
double ALIGN ks, kc;
ks = kc = 0.0;
double ALIGN res = 0.0;
#ifdef MPI
double ALIGN mres = 0.0;
#endif

#ifdef OMP
#pragma omp parallel
{
int thread_num = omp_get_thread_num();
g_omp_ks_re[thread_num] = g_omp_kc_re[thread_num] = 0.0;
g_omp_acc_re[thread_num] = 0.0;
#endif

int i,x,ioff;
su3 ALIGN v;
_Complex double ALIGN a[6][6];
double ALIGN tra;
double ALIGN tr,ts,tt;
double ALIGN ks,kc,tr,ts,tt;
_Complex double ALIGN det;

ks = kc = 0.0;

if(ieo==0) {
ioff=0;
}
Expand All @@ -434,7 +439,7 @@ double sw_trace(const int ieo, const double mu) {
}

#ifdef OMP
#pragma omp for schedule(static)
#pragma omp for
#endif
for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
x = g_eo2lexic[icx];
Expand All @@ -452,36 +457,32 @@ double sw_trace(const int ieo, const double mu) {
tra = log(conj(det)*det);
// we need to compute only the one with +mu
// the one with -mu must be the complex conjugate!
#ifdef OMP
tr=tra+g_omp_kc_re[thread_num];
ts=tr+g_omp_ks_re[thread_num];
tt=ts-g_omp_ks_re[thread_num];
g_omp_ks_re[thread_num]=ts;
g_omp_kc_re[thread_num]=tr-tt;
#else

tr=tra+kc;
ts=tr+ks;
tt=ts-ks;
ks=ts;
kc=tr-tt;
#endif
}
}
kc=ks+kc;

#ifdef OMP
g_omp_kc_re[thread_num] = g_omp_ks_re[thread_num] + g_omp_kc_re[thread_num];
g_omp_acc_re[thread_num] = kc;
} /* OpenMP parallel closing brace */

for(int i = 0; i < omp_num_threads; ++i) {
kc += g_omp_kc_re[i];
res += g_omp_acc_re[i];
}
#else
kc=ks+kc;
res=kc;
#endif

#ifdef MPI
MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
return(ks);
MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
return(mres);
#else
return(kc);
return(res);
#endif

}
Expand Down
6 changes: 2 additions & 4 deletions global.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,10 +227,8 @@ EXTERN int g_mpi_ST_rank;
EXTERN int g_nb_list[8];

/* OpenMP Kahan accumulation arrays */
EXTERN _Complex double *g_omp_kc;
EXTERN _Complex double *g_omp_ks;
EXTERN double* g_omp_kc_re;
EXTERN double* g_omp_ks_re;
EXTERN _Complex double *g_omp_acc_cp;
EXTERN double* g_omp_acc_re;

/* Deflation information */
EXTERN int g_dflgcr_flag;
Expand Down
36 changes: 11 additions & 25 deletions init_omp_kahan_arrays.c → init_omp_accumulators.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,41 +27,27 @@
#include <stdio.h>
#include <errno.h>
#include "global.h"
#include "init_omp_kahan_arrays.h"
#include "init_omp_accumulators.h"

int init_omp_kahan_arrays(const int num) {
g_omp_ks=NULL;
g_omp_kc=NULL;
g_omp_ks_re=NULL;
g_omp_kc_re=NULL;
int init_omp_accumulators(const int num) {
g_omp_acc_cp=NULL;
g_omp_acc_re=NULL;

if((void*)(g_omp_ks = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
printf ("init_omp_kahan_arrays malloc errno : %d\n",errno);
if((void*)(g_omp_acc_cp = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
printf ("init_omp_accumulators malloc errno : %d\n",errno);
errno = 0;
return(1);
}
if((void*)(g_omp_kc = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
printf ("init_omp_kahan_arrays malloc errno : %d\n",errno);
if((void*)(g_omp_acc_re = (double*)malloc(num*sizeof(double))) == NULL) {
printf ("init_omp_accumulators malloc errno : %d\n",errno);
errno = 0;
return(2);
}
if((void*)(g_omp_ks_re = (double*)malloc(num*sizeof(double))) == NULL) {
printf ("init_omp_kahan_arrays malloc errno : %d\n",errno);
errno = 0;
return(3);
}
if((void*)(g_omp_kc_re = (double*)malloc(num*sizeof(double))) == NULL) {
printf ("init_omp_kahan_arrays malloc errno : %d\n",errno);
errno = 0;
return(4);
}

return(0);
}

void free_omp_kahan_arrays() {
free(g_omp_kc);
free(g_omp_ks);
free(g_omp_kc_re);
free(g_omp_ks_re);
void free_omp_accumulators() {
free(g_omp_acc_cp);
free(g_omp_acc_re);
}
8 changes: 4 additions & 4 deletions init_omp_kahan_arrays.h → init_omp_accumulators.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
* You should have received a copy of the GNU General Public License
* along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
***********************************************************************/
#ifndef _INIT_OMP_KAHAN_ARRAYS_H
#define _INIT_OMP_KAHAN_ARRAYS_H
#ifndef _INIT_OMP_ACCUMULATORS_H
#define _INIT_OMP_ACCUMULATORS_H

int init_omp_kahan_arrays(int num);
void free_omp_kahan_arrays();
int init_omp_accumulators(int num);
void free_omp_accumulators();

#endif
41 changes: 19 additions & 22 deletions linalg/scalar_prod.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,31 @@
#include "scalar_prod.h"

/* <S,R>=S^* times R */
_Complex double scalar_prod(spinor * const S, spinor * const R, const int N, const int parallel) {
_Complex double ALIGN ks,kc;
ks = kc = 0.0;
_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel) {
_Complex double ALIGN res = 0.0;
#ifdef MPI
_Complex double ALIGN mres = 0.0;
#endif

#ifdef OMP
#pragma omp parallel
{
int thread_num = omp_get_thread_num();
g_omp_kc[thread_num] = 0.0;
g_omp_ks[thread_num] = 0.0;
g_omp_acc_cp[thread_num] = 0.0;
#endif

_Complex double ALIGN ds,tr,ts,tt;
_Complex double ALIGN ds,tr,ts,tt,ks,kc;
spinor *s,*r;

ks = kc = 0.0;

#if (defined BGL && defined XLC)
__alignx(16, S);
__alignx(16, R);
#endif

#ifdef OMP
#pragma omp for schedule(static)
#pragma omp for
#endif
for (int ix = 0; ix < N; ix++)
{
Expand All @@ -64,43 +69,35 @@ _Complex double scalar_prod(spinor * const S, spinor * const R, const int N, con
r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2);

/* Kahan Summation */
#ifdef OMP
tr = ds + g_omp_kc[thread_num];
ts = tr + g_omp_ks[thread_num];
tt = ts - g_omp_ks[thread_num];
g_omp_ks[thread_num] = ts;
g_omp_kc[thread_num] = tr - tt;
#else
tr=ds+kc;
ts=tr+ks;
tt=ts-ks;
ks=ts;
kc=tr-tt;
#endif
}
kc=ks+kc;

#ifdef OMP
/* thread-local last step of the Kahan summation */
g_omp_kc[thread_num] = g_omp_ks[thread_num] + g_omp_kc[thread_num];
g_omp_acc_cp[thread_num] = kc;

} /* OpenMP closing brace */

/* having left the parallel section, we can now sum up the Kahan
corrected sums from each thread into kc */
for(int i = 0; i < omp_num_threads; ++i)
kc += g_omp_kc[i];
res += g_omp_acc_cp[i];
#else
kc=ks+kc;
res=kc;
#endif

#ifdef MPI
if(parallel == 1)
{
MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
return(ks);
MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);
return(mres);
}
#endif
return(kc);
return(res);
}

#ifdef WITHLAPH
Expand Down
Loading

0 comments on commit 3e18dfd

Please sign in to comment.