Skip to content

Commit

Permalink
SPI multiprecision
Browse files Browse the repository at this point in the history
  • Loading branch information
Meinersbur committed Nov 20, 2012
1 parent fd713c9 commit e2149c8
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 75 deletions.
4 changes: 2 additions & 2 deletions DirectPut.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ unsigned int spi_num_dirs = NUM_DIRS;
// in bytes!
uint64_t totalMessageSize;
// Allocate static memory for descriptors
char SPIDescriptorsMemory[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64/*for alignment*/];
char SPIDescriptorsMemory32[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64/*for alignment*/];
char SPIDescriptorsMemory[NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64/*for alignment*/];
char SPIDescriptorsMemory32[NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64/*for alignment*/];
// pointer to descriptor array
MUHWI_Descriptor_t *SPIDescriptors;
MUHWI_Descriptor_t *SPIDescriptors32;
Expand Down
16 changes: 8 additions & 8 deletions bgq/bgq_HoppingMatrixWorker.inc.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ void bgq_HoppingMatrix_worker(void *arg, size_t tid, size_t threads, bool kamul,
if (!noprefetchstream) {
bgq_prefetch_forward(&g_bgq_gaugefield_fromCollapsed[isOdd][begin]);
if (readFulllayout) {
bgq_prefetch_forward(&spinorfield->sec_fullspinor[begin]);
bgq_prefetch_forward(&spinorfield->BGQ_SEC_FULLLAYOUT[begin]);
} else {
bgq_prefetch_forward(&spinorfield->sec_collapsed[begin]);
bgq_prefetch_forward(&spinorfield->BGQ_SEC_WEYLLAYOUT[begin]);
}
bgq_prefetch_forward(&targetfield->sendptr[begin]);
bgq_prefetch_forward(&targetfield->BGQ_SENDPTR[begin]);
}

bgq_gaugesite *gaugesite = &g_bgq_gaugefield_fromCollapsed[isOdd][begin];
Expand Down Expand Up @@ -140,7 +140,7 @@ void bgq_HoppingMatrix_worker(void *arg, size_t tid, size_t threads, bool kamul,
{
bgq_su3_weylnext_prefetch_double(weylsite);

// TUP
// T+ /////////////////////////////////////////////////////////////////////////
{
bgq_su3_weyl_decl(weylnext_tup);
bgq_qvlfduxa(weylnext_tup_v0_c0, weylsite, 32);
Expand All @@ -154,7 +154,7 @@ void bgq_HoppingMatrix_worker(void *arg, size_t tid, size_t threads, bool kamul,

bgq_su3_weylnext_prefetch_double(weylsite);

// TDOWN
// T- /////////////////////////////////////////////////////////////////////////
{
bgq_su3_weyl_decl(weylnext_tdown);
bgq_qvlfduxa(weylnext_tdown_v0_c0, weylsite, 32);
Expand Down Expand Up @@ -252,13 +252,13 @@ void bgq_HoppingMatrix_worker(void *arg, size_t tid, size_t threads, bool kamul,
}
}
ucoord ic_comp = ic_load;
ic_load+=1;
ic_load += 1;
for (; ic_load<end; ic_load+=1) {
bgq_su3_spinor_decl(spinor);
bgq_su3_spinor_mov(spinor, spinornext);
//weylsite = (bgq_weylsite*)(((uintptr_t)&spinorfield->sec_collapsed[ic_load])-32);
//gaugesite = (bgq_gaugesite *)(((uintptr_t)&g_bgq_gaugefield_fromCollapsed[isOdd][ic_comp])-32);
bgq_weyl_ptr_t *targetptrs = &targetfield->sendptr[ic_comp];// 8*sizeof(bgq_weyl_vec*)=64 bytes
bgq_weyl_ptr_t *targetptrs = &targetfield->BGQ_SENDPTR[ic_comp];// 8*sizeof(bgq_weyl_vec*)=64 bytes

bgq_su3_weylnext_prefetch_double(weylsite);
bgq_su3_matrixnext_prefetch_double(gaugesite);
Expand Down Expand Up @@ -561,7 +561,7 @@ void bgq_HoppingMatrix_worker(void *arg, size_t tid, size_t threads, bool kamul,
{
bgq_su3_spinor_decl(spinor);
bgq_su3_spinor_mov(spinor, spinornext);
bgq_weyl_ptr_t *targetptrs = &targetfield->sendptr[ic_comp];
bgq_weyl_ptr_t *targetptrs = &targetfield->BGQ_SENDPTR[ic_comp];

// tup
{
Expand Down
135 changes: 86 additions & 49 deletions bgq/bgq_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -189,19 +189,19 @@ static inline unsigned bgq_abcde2spirank(Personality_t *pers, uint8_t a, uint8_t

static void setup_destinations(Personality_t *pers) {
torus_t tcoords = {
pers->Network_Config.Acoord,
pers->Network_Config.Bcoord,
pers->Network_Config.Ccoord,
pers->Network_Config.Dcoord,
pers->Network_Config.Ecoord
pers->Network_Config.Acoord,
pers->Network_Config.Bcoord,
pers->Network_Config.Ccoord,
pers->Network_Config.Dcoord,
pers->Network_Config.Ecoord
};

torus_t tdims = {
pers->Network_Config.Anodes,
pers->Network_Config.Bnodes,
pers->Network_Config.Cnodes,
pers->Network_Config.Dnodes,
pers->Network_Config.Enodes
pers->Network_Config.Anodes,
pers->Network_Config.Bnodes,
pers->Network_Config.Cnodes,
pers->Network_Config.Dnodes,
pers->Network_Config.Enodes
};

//numNodes = tdims.a * tdims.b * tdims.c * tdims.d * tdims.e;
Expand Down Expand Up @@ -231,13 +231,15 @@ static void setup_destinations(Personality_t *pers) {
nb2dest[cd].hintsABCD = 0;
nb2dest[cd].hintsE = 0;
//printf("node %d: %d(%d,%d,%d,%d,%d)-%d->%d(%d,%d,%d,%d,%d)\n", g_proc_id, mySpirank, tcoords.a, tcoords.b, tcoords.c, tcoords.e, tcoords.d, cd, nbrank, nb.a, nb.b, nb.c, nb.d, nb.e);

//nb2dest[COMMDIR_COUNT + cd] = nb2dest[cd];
}
}
#endif



static void bgq_comm_test(bool nospi) {
static void bgq_comm_test(bool nospi, bool sloppy) {
// test communication

for (ucoord cd = 0; cd < COMMDIR_COUNT; cd+=1) {
Expand All @@ -255,9 +257,9 @@ static void bgq_comm_test(bool nospi) {
}
}

bgq_comm_recv(nospi, false);
bgq_comm_send(nospi, false);
bgq_comm_wait(nospi, false);
bgq_comm_recv(nospi, sloppy);
bgq_comm_send(nospi, sloppy);
bgq_comm_wait(nospi, sloppy);

for (ucoord cd = 0; cd < COMMDIR_COUNT; cd+=1) {
bgq_direction d = bgq_commdir2direction(cd);
Expand All @@ -281,7 +283,7 @@ static void bgq_comm_test(bool nospi) {
}
}

master_print("Communication nospi=%d tested successfully\n", nospi);
master_print("%s Communication sloppy=%d tested successfully\n", nospi ? "MPI" : "MU SPI", sloppy);
}


Expand All @@ -292,10 +294,12 @@ static void bgq_comm_common_init(void) {
g_bgq_comm_common_initialized = true;
bgq_indices_init();

{
size_t commbufsize = bgq_weyl_section_offset(sec_comm_end) - bgq_weyl_section_offset(sec_comm);
uint8_t *buf = (uint8_t*)malloc_aligned(commbufsize, BGQ_ALIGNMENT_L2);
uint8_t *buf = (uint8_t*)malloc_aligned(commbufsize + commbufsize/*some memory is wasted here, but no additional work to be done for alignment*/, BGQ_ALIGNMENT_L2);
uint8_t *bufend = buf + commbufsize;
g_bgq_sec_comm = buf;
g_bgq_sec_comm_float = buf + commbufsize;
for (bgq_direction d = 0; d < PHYSICAL_LD; d+=1) {
bgq_dimension dim = bgq_direction2dimension(d);

Expand All @@ -304,20 +308,21 @@ static void bgq_comm_common_init(void) {

g_bgq_sec_send_double[d] = (bgq_weyl_vec_double*)(buf + bgq_weyl_section_offset(sec_send) - bgq_weyl_section_offset(sec_comm));
assert((uint8_t*)g_bgq_sec_send_double[d] <= bufend);
g_bgq_sec_send_float[d] = (bgq_weyl_vec_float*)g_bgq_sec_send_double[d];
g_bgq_sec_send_float[d] = (bgq_weyl_vec_float*)((uint8_t*)g_bgq_sec_send_double[d] + commbufsize);

g_bgq_sec_recv_double[d] = (bgq_weyl_vec_double*)(buf + bgq_weyl_section_offset(sec_recv) - bgq_weyl_section_offset(sec_comm));
assert((uint8_t*)g_bgq_sec_recv_double[d] <= bufend);
g_bgq_sec_recv_float[d] = (bgq_weyl_vec_float*)g_bgq_sec_recv_double[d];
g_bgq_sec_recv_float[d] = (bgq_weyl_vec_float*)((uint8_t*)g_bgq_sec_recv_double[d] + commbufsize);

assert((uintptr_t)g_bgq_sec_send_double[d] % BGQ_ALIGNMENT_L2 == 0);
assert((uintptr_t)g_bgq_sec_recv_double[d] % BGQ_ALIGNMENT_L2 == 0);
}
}

if (BGQ_UNVECTORIZE || !COMM_T) {
g_bgq_sec_temp_tup_double = malloc_aligned(LOCAL_HALO_T/PHYSICAL_LP *sizeof(bgq_weyl_vec_double),BGQ_ALIGNMENT_L2);
g_bgq_sec_temp_tup_double = malloc_aligned(LOCAL_HALO_T/PHYSICAL_LP *sizeof(bgq_weyl_vec_double), BGQ_ALIGNMENT_L2);
g_bgq_sec_temp_tup_float = (bgq_weyl_vec_float*)g_bgq_sec_temp_tup_double;
g_bgq_sec_temp_tdown_double = malloc_aligned(LOCAL_HALO_T/PHYSICAL_LP *sizeof(bgq_weyl_vec_double),BGQ_ALIGNMENT_L2);
g_bgq_sec_temp_tdown_double = malloc_aligned(LOCAL_HALO_T/PHYSICAL_LP *sizeof(bgq_weyl_vec_double), BGQ_ALIGNMENT_L2);
g_bgq_sec_temp_tdown_float = (bgq_weyl_vec_float*)g_bgq_sec_temp_tdown_double;
//g_bgq_sec_vrecv_tup = malloc_aligned(PHYSICAL_HALO_T,BGQ_ALIGNMENT_L2);
//g_bgq_sec_vrecv_tdown = malloc_aligned(PHYSICAL_HALO_T,BGQ_ALIGNMENT_L2);
Expand Down Expand Up @@ -359,11 +364,12 @@ void bgq_comm_mpi_init(void) {
//master_print("MPI_CHECK(MPI_Send_init(%zu, %zu, %zu, %zu, %zu, %zu, %zu))\n", g_bgq_sec_send[d_dst], secsize / sizeof(double), MPI_DOUBLE, bgq_direction2rank(d_dst), d_dst, g_cart_grid, &g_bgq_request_send[commdir_dst]);
}


bgq_comm_test(true);
bgq_comm_test(true, false);
bgq_comm_test(true, true);
}


static uint64_t totalMessageSize_float;

static bool g_bgq_comm_spi_initialized = false;
void bgq_comm_spi_init(void) {
Expand All @@ -374,14 +380,19 @@ void bgq_comm_spi_init(void) {
bgq_comm_common_init();


size_t messageSizes[PHYSICAL_LD];
size_t roffsets[PHYSICAL_LD];
size_t soffsets[PHYSICAL_LD];
//size_t totalMessageSize = 0;
size_t messageSizes[2*COMMDIR_COUNT];
size_t roffsets[2*COMMDIR_COUNT];
size_t soffsets[2*COMMDIR_COUNT];

size_t messageSizes_float[COMMDIR_COUNT];
//size_t roffsets_flaot[COMMDIR_COUNT];
//size_t soffsets_float[COMMDIR_COUNT];

// here comes the SPI initialization
int spi_num_dirs = COMMDIR_COUNT;

totalMessageSize = 0;
totalMessageSize_float = 0;
size_t bufsize_double = bgq_weyl_section_offset(sec_send_end) - bgq_weyl_section_offset(sec_send_begin);
for (ucoord cd = 0; cd < COMMDIR_COUNT; cd+=1) {
bgq_direction d_src = bgq_commdir2direction(cd);
bgq_direction d_dst = bgq_direction_revert(d_src);
Expand All @@ -407,9 +418,25 @@ void bgq_comm_spi_init(void) {
assert((roffsets[cd] + secsize) <= (bgq_weyl_section_offset(sec_recv_end) - bgq_weyl_section_offset(sec_recv_begin)));
totalMessageSize += secsize;


size_t secsize_float = secsize/2;
messageSizes_float[commdir] = secsize_float;
messageSizes[COMMDIR_COUNT+commdir] = secsize_float;
//soffsets_float[commdir] = bufsize_double + soffsets[commdir];
soffsets[COMMDIR_COUNT+commdir] = bufsize_double + soffsets[commdir];
//roffsets_commdir[cd] = bufsize_double + roffsets[cd];
roffsets[COMMDIR_COUNT+cd] = bufsize_double + roffsets[cd];
totalMessageSize_float += secsize_float;

//master_print("SPI %llu: d=%llu msize=%zu soffset=%zu d_dst=%llu roffset=%zu\n", cd, d_src, messageSizes[commdir], soffsets[commdir], d_dst, roffsets[cd]);
}
assert(totalMessageSize == bgq_weyl_section_offset(sec_recv_end) - bgq_weyl_section_offset(sec_recv_begin));
assert(totalMessageSize_float == totalMessageSize/2);






do_dynamic = 0; // Use static routing (since only neighbor-to-neighbor communication)

Expand All @@ -426,21 +453,21 @@ void bgq_comm_spi_init(void) {
setup_destinations(&pers);

// adjust the SPI pointers to the send and receive buffers
SPIrecvBuffers = (char*)g_bgq_sec_recv[0];
SPIrecvBuffers = (char*)g_bgq_sec_recv_double[0];
assert((uintptr_t)SPIrecvBuffers % BGQ_ALIGNMENT_L2 == 0);
SPIsendBuffers = (char*)g_bgq_sec_send[0];
SPIsendBuffers = (char*)g_bgq_sec_send_double[0];
assert((uintptr_t)SPIsendBuffers % BGQ_ALIGNMENT_L2 == 0);

// Setup the FIFO handles
rc = msg_InjFifoInit(&injFifoHandle,
0, /* startingSubgroupId */
0, /* startingFifoId */
spi_num_dirs, /* numFifos */
COMMDIR_COUNT, /* numFifos */
INJ_MEMORY_FIFO_SIZE+1, /* fifoSize */
NULL /* Use default attributes */
);
if(rc != 0) {
fprintf(stderr, "msg_InjFifoInit failed with rc=%d\n",rc);
fprintf(stderr, "msg_InjFifoInit failed with rc=%d\n", rc);
exit(1);
}

Expand All @@ -452,7 +479,10 @@ void bgq_comm_spi_init(void) {
// Create descriptors
// Injection Direct Put Descriptor, one for each neighbour
SPIDescriptors = (MUHWI_Descriptor_t*)(((uint64_t)SPIDescriptorsMemory+64)&~(64-1));
create_descriptors(SPIDescriptors, messageSizes, soffsets, roffsets, spi_num_dirs);
create_descriptors(SPIDescriptors, messageSizes, soffsets, roffsets, COMMDIR_COUNT);

SPIDescriptors32 = (MUHWI_Descriptor_t*)(((uint64_t)SPIDescriptorsMemory32+64)&~(64-1));
create_descriptors(SPIDescriptors32, messageSizes_float, soffsets, roffsets, COMMDIR_COUNT);

// Initialize the barrier, resetting the hardware.
rc = MUSPI_GIBarrierInit(&GIBarrier, 0 /*comm world class route */);
Expand All @@ -474,19 +504,21 @@ void bgq_comm_spi_init(void) {
#endif


bgq_comm_test(false);
bgq_comm_test(false, false);
bgq_comm_test(false, true);
#endif
}


//TODO: inline?
void bgq_comm_recv(bool nospi, bool sloppy) {
assert(omp_get_thread_num()==0);
//master_print("Comm Receiving...\n");
//master_print("Comm Receiving... nospi=%d sloppy=%d\n", nospi, sloppy);

#ifdef SPI
if (!nospi) {
// reset the recv counter
recvCounter = totalMessageSize;
recvCounter = sloppy ? totalMessageSize_float : totalMessageSize;
return;
}
#endif
Expand All @@ -500,18 +532,20 @@ void bgq_comm_recv(bool nospi, bool sloppy) {

void bgq_comm_send(bool nospi, bool sloppy) {
assert(omp_get_thread_num()==0);
//master_print("Comm Sending...\n");
//master_print("Comm Sending... nospi=%d sloppy=%d\n", nospi, sloppy);

#ifdef SPI
if (!nospi) {
// make sure everybody has reset recvCounter
global_barrier(); //TODO: Can we get rid of it?
for (size_t cd = 0; cd < COMMDIR_COUNT; cd+=1) {
descCount[cd] = msg_InjFifoInject(injFifoHandle, cd, &SPIDescriptors[cd]);
if (descCount[cd] == -1) {
printf("msg_InjFifoInject failed, most likely because there is no room in the fifo\n");
abort();
}
}

for (size_t cd = 0; cd < COMMDIR_COUNT; cd += 1) {
descCount[cd] = msg_InjFifoInject(injFifoHandle, cd, sloppy ? &SPIDescriptors32[cd] : &SPIDescriptors[cd]);
if (descCount[cd] == -1) {
printf("msg_InjFifoInject failed, most likely because there is no room in the fifo\n");
abort();
}
}
return;
}
#endif
Expand All @@ -525,7 +559,7 @@ void bgq_comm_send(bool nospi, bool sloppy) {

void bgq_comm_wait(bool nospi, bool sloppy) {
assert(omp_get_thread_num()==0);
//master_print("Comm Waiting...\n");
//master_print("Comm Waiting... nospi=%d sloppy=%d\n", nospi, sloppy);

#if BGQ_QPX
uint64_t ppc32 = mfspr(SPRN_PPR32);
Expand All @@ -535,24 +569,27 @@ void bgq_comm_wait(bool nospi, bool sloppy) {
#ifdef SPI
if (!nospi) {
uint64_t startTime = 0;
uint64_t expectedBytes = sloppy ? totalMessageSize_float : totalMessageSize;

// Wait for all data is received
//printf("node %d: %llu bytes to be received\n", g_proc_id, totalMessageSize);
//printf("node %d: %llu bytes to be received\n", g_proc_id, expectedBytes);
while(recvCounter > 0) {
// Check range of pending bytes to receive
assert(recvCounter <= totalMessageSize);
assert(recvCounter <= expectedBytes);

#if 0
if (GetTimeBase() - startTime >= 1600) {
//printf("node %d: %llu bytes left\n", g_proc_id, recvCounter);
printf("node %d: %llu bytes left\n", g_proc_id, recvCounter);
startTime = GetTimeBase();
}
#endif
}
//printf("node %d: All data received\n", g_proc_id);

// Wait for all data sent
while (true) {
size_t sendDone = 0;
for (unsigned j = 0; j < COMMDIR_COUNT; j+=1) {
for (size_t j = 0; j < COMMDIR_COUNT; j += 1) {
sendDone += msg_InjFifoCheckCompletion(injFifoHandle, j, descCount[j]);
}
if (sendDone == COMMDIR_COUNT)
Expand All @@ -578,7 +615,7 @@ void bgq_comm_wait(bool nospi, bool sloppy) {
#ifndef NDEBUG
for (ucoord commdir = 0; commdir < COMMDIR_COUNT; commdir += 1) {
bgq_direction d = bgq_commdir2direction(commdir);
bgq_weylfield_section sec = bgq_direction2section(d,false);
bgq_weylfield_section sec = bgq_direction2section(d, false);
size_t size = bgq_weyl_section_offset(sec+1) - bgq_weyl_section_offset(sec);
if (sloppy)
size /= 2;
Expand Down
1 change: 1 addition & 0 deletions bgq/bgq_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ void bgq_comm_wait(bool nospi, bool sloppy);


EXTERN_FIELD uint8_t *g_bgq_sec_comm;
EXTERN_FIELD uint8_t *g_bgq_sec_comm_float;
EXTERN_FIELD bgq_weyl_vec_double *g_bgq_sec_recv_double[PHYSICAL_LD];
EXTERN_FIELD bgq_weyl_vec_double *g_bgq_sec_send_double[PHYSICAL_LD];

Expand Down
Loading

0 comments on commit e2149c8

Please sign in to comment.