fused version of QPX post halfspinor
urbach committed Sep 1, 2012
1 parent 959b509 commit 35d4aa2
Showing 3 changed files with 85 additions and 4 deletions.
6 changes: 6 additions & 0 deletions bgq.h
@@ -11,6 +11,12 @@
r4 = vec_ld(128L, (double*) &(phi).c0); \
r5 = vec_ld(160L, (double*) &(phi).c0);

#define _vec_load_halfspinor(r0, r1, r2, phi) \
r0 = vec_ld(0L, (double*) &(phi).c0); \
r1 = vec_ld(32L, (double*) &(phi).c0); \
r2 = vec_ld(64L, (double*) &(phi).c0);


#define _vec_store_spinor(phi, r0, r1, r2, r3, r4, r5) \
vec_st(r0, 0L, (double*) &(phi).c0); \
vec_st(r1, 32L, (double*) &(phi).c0); \
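
For orientation, a minimal scalar sketch of what the new _vec_load_halfspinor macro does. A half spinor occupies 96 bytes, so it fits in exactly three 32-byte QPX registers, where the full-spinor load above needs six. The qpx_reg, su3_vector and halfspinor types and the model_* helper below are illustrative stand-ins only, not tmLQCD's definitions; vector4double and vec_ld are the actual IBM QPX intrinsics on Blue Gene/Q.

#include <complex.h>

/* Illustrative stand-ins -- tmLQCD's real types live in its own headers. */
typedef struct { double v[4]; } qpx_reg;                  /* models vector4double (32 bytes) */
typedef struct { double complex c0, c1, c2; } su3_vector; /* 48 bytes */
typedef struct { su3_vector s0, s1; } halfspinor;         /* 96 bytes */

/* models vec_ld: a 32-byte load at a byte offset from base */
static qpx_reg model_vec_ld(long offset, const double *base) {
  qpx_reg r;
  for (int i = 0; i < 4; i++) r.v[i] = base[offset / 8 + i];
  return r;
}

/* The macro is invoked on the first su3 vector (e.g. phi[ix]->s0), and
 * the three loads at offsets 0/32/64 run across both vectors of the
 * half spinor, assuming the contiguous layout modelled above. */
void load_halfspinor(qpx_reg *r0, qpx_reg *r1, qpx_reg *r2,
                     const halfspinor *phi) {
  const double *p = (const double *) &phi->s0.c0;
  *r0 = model_vec_ld(0L, p);   /* s0.c0, s0.c1 */
  *r1 = model_vec_ld(32L, p);  /* s0.c2, s1.c0 */
  *r2 = model_vec_ld(64L, p);  /* s1.c1, s1.c2 */
}
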
2 changes: 2 additions & 0 deletions bgq2.h
@@ -215,6 +215,8 @@
rs1 = vec_xxcpnmadd(r1, tmp, rs1); \
rs2 = vec_xxcpnmadd(r2, tmp, rs2);

// pushes the second quadword from r0, r1, r2
// into the first quadword of r3, r4, r5
#define _vec_unfuse(r0, r1, r2, r3, r4, r5) \
r3 = vec_sldw(r0, r0, 2); \
r4 = vec_sldw(r1, r1, 2); \
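
In scalar terms, continuing the illustrative qpx_reg model from above, _vec_unfuse does the following. This assumes vec_sldw's concatenate-and-shift-by-words semantics, which is how the comment above reads; the model_* helper is again a stand-in, not the real intrinsic.

/* models vec_sldw(a, b, n): four consecutive elements of the
 * concatenation a|b, starting at element n (0 <= n <= 3) */
static qpx_reg model_vec_sldw(qpx_reg a, qpx_reg b, int n) {
  qpx_reg r;
  for (int i = 0; i < 4; i++)
    r.v[i] = (i + n < 4) ? a.v[i + n] : b.v[i + n - 4];
  return r;
}

/* With a == b and n == 2 this rotates the register by half, so the
 * second quadword (elements 2,3) of r0..r2 lands in the first
 * quadword of r3..r5, as the comment in bgq2.h says. */
void unfuse(qpx_reg r0, qpx_reg r1, qpx_reg r2,
            qpx_reg *r3, qpx_reg *r4, qpx_reg *r5) {
  *r3 = model_vec_sldw(r0, r0, 2);  /* {r0[2], r0[3], r0[0], r0[1]} */
  *r4 = model_vec_sldw(r1, r1, 2);
  *r5 = model_vec_sldw(r2, r2, 2);
}
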
81 changes: 77 additions & 4 deletions operator/halfspinor_hopping.h
@@ -816,7 +816,7 @@
_vec_su3_multiply_double2c(U); \
rtmp = vec_ld2(0, (double*) &ka0); \
_vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
- _vec_store_double(phi[ix]->s0, r0, r1, r2);
+ _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2);

#define _hop_t_m_pre() \
_vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8); \
@@ -846,7 +846,7 @@
rtmp = vec_ld2(0, (double*) &ka1); \
_vec_su3_multiply_double2c(U); \
_vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
- _vec_store_double(phi[ix]->s0, r0, r1, r2);
+ _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2);

#define _hop_x_m_pre() \
_vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11, U0); \
@@ -876,7 +876,7 @@
rtmp = vec_ld2(0, (double*) &ka2); \
_vec_su3_multiply_double2c(U); \
_vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
- _vec_store_double(phi[ix]->s0, r0, r1, r2);
+ _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2);

#define _hop_y_m_pre() \
_vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11); \
@@ -906,7 +906,7 @@
rtmp = vec_ld2(0, (double*) &ka3); \
_vec_su3_multiply_double2c(U); \
_vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
- _vec_store_double(phi[ix]->s0, r0, r1, r2);
+ _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2);

#define _hop_z_m_pre() \
_vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8, U0); \
@@ -930,6 +930,12 @@
rs10= rs4; \
rs11= rs5;

#define _hop_t_p_post2() \
_vec_load_halfspinor(rs0, rs1, rs2, phi[ix]->s0); \
_vec_unfuse(rs0, rs1, rs2, rs3, rs4, rs5); \
rs6 = rs0; rs7 = rs1; rs8 = rs2; \
rs9 = rs3; rs10= rs4; rs11= rs5;

#define _hop_t_m_post() \
_prefetch_su3(U+1); \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
@@ -942,6 +948,16 @@
_vec_add2(rs3, rs4, rs5, r3, r4, r5); \
_vec_sub2(rs9, rs10, rs11, r3, r4, r5);

#define _hop_t_m_post2() \
_vec_load(r0, r1, phi[ix]->s0); \
_vec_load16(r2, r3, phi[ix]->s1); \
rtmp = vec_ld2(0, (double*) &ka0); \
_vec_su3_inverse_multiply_double2c(U); \
_vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r1, r2, r3, r4, r5);

#define _hop_x_p_post() \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
_vec_load2(r3, r4, r5, phi[ix]->s1); \
@@ -950,6 +966,13 @@
_vec_add2(rs3, rs4, rs5, r3, r4, r5); \
_vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0);

#define _hop_x_p_post2() \
_vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); \
_vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U1);

#define _hop_x_m_post() \
_prefetch_su3(U+1); \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
@@ -960,13 +983,30 @@
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0);

#define _hop_x_m_post2() \
_vec_load(r0, r1, phi[ix]->s0); \
_vec_load16(r2, r3, phi[ix]->s1); \
rtmp = vec_ld2(0, (double*) &ka1); \
_vec_su3_inverse_multiply_double2c(U); \
_vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0);

#define _hop_y_p_post() \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
_vec_load2(r3, r4, r5, phi[ix]->s1); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_sub2(rs6, rs7, rs8, r3, r4, r5); \
_vec_add2(rs9, rs10, rs11, r0, r1, r2);

#define _hop_y_p_post2() \
_vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_sub2(rs6, rs7, rs8, r3, r4, r5); \
_vec_add2(rs9, rs10, rs11, r0, r1, r2);

#define _hop_y_m_post() \
_prefetch_su3(U+1); \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
@@ -978,13 +1018,31 @@
_vec_add2(rs6, rs7, rs8, r3, r4, r5); \
_vec_sub2(rs9, rs10, rs11, r0, r1, r2);

#define _hop_y_m_post2() \
_vec_load(r0, r1, phi[ix]->s0); \
_vec_load16(r2, r3, phi[ix]->s1); \
rtmp = vec_ld2(0, (double*) &ka2); \
_vec_su3_inverse_multiply_double2c(U); \
_vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_add2(rs6, rs7, rs8, r3, r4, r5); \
_vec_sub2(rs9, rs10, rs11, r0, r1, r2);

#define _hop_z_p_post() \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
_vec_load2(r3, r4, r5, phi[ix]->s1); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \
_vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U0);

#define _hop_z_p_post2() \
_vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \
_vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U1);

#define _hop_z_m_post() \
_prefetch_su3(U+1); \
_vec_load2(r0, r1, r2, phi[ix]->s0); \
@@ -1001,6 +1059,21 @@
_vec_store2(s->s2, rs6, rs7, rs8); \
_vec_store2(s->s3, rs9, rs10, rs11);

#define _hop_z_m_post2() \
_vec_load(r0, r1, phi[ix]->s0); \
_vec_load16(r2, r3, phi[ix]->s1); \
rtmp = vec_ld2(0, (double*) &ka3); \
_vec_su3_inverse_multiply_double2c(U); \
_vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \
_vec_unfuse(r0, r1, r2, r3, r4, r5); \
_vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \
_vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \
_vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U1); \
_vec_store2(s->s0, rs0, rs1, rs2); \
_vec_store2(s->s1, rs3, rs4, rs5); \
_vec_store2(s->s2, rs6, rs7, rs8); \
_vec_store2(s->s3, rs9, rs10, rs11);

#define _declare_hregs() \
vector4double ALIGN r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; \
vector4double ALIGN rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7, rs8, rs9, rs10, rs11; \
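
Taken together, the *_post2 variants are the fused path the commit title refers to: instead of pulling s0 and s1 of the half spinor through two separate _vec_load2 calls, they issue one three-register _vec_load_halfspinor and recover the second spin component with _vec_unfuse, i.e. with register rotations instead of a second load stream. A scalar reading of _hop_y_p_post2, built on the illustrative models above and assuming the helper macros _vec_add_double2 / _vec_sub2 / _vec_add2 reduce to element-wise adds and subtracts on the accumulators rs0..rs11:

/* Hypothetical scalar model, not tmLQCD code: one fused load, one
 * unfuse, then the assumed element-wise accumulator updates. */
void hop_y_p_post2_model(const halfspinor *phi, qpx_reg rs[12]) {
  qpx_reg r[6];
  load_halfspinor(&r[0], &r[1], &r[2], phi);      /* fused s0|s1 load   */
  unfuse(r[0], r[1], r[2], &r[3], &r[4], &r[5]);  /* split spin halves  */
  for (int k = 0; k < 6; k++)                     /* _vec_add_double2   */
    for (int i = 0; i < 4; i++) rs[k].v[i] += r[k].v[i];
  for (int k = 0; k < 3; k++)                     /* _vec_sub2          */
    for (int i = 0; i < 4; i++) rs[6 + k].v[i] -= r[3 + k].v[i];
  for (int k = 0; k < 3; k++)                     /* _vec_add2          */
    for (int i = 0; i < 4; i++) rs[9 + k].v[i] += r[k].v[i];
}
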
