forked from lhhunghimself/fast_protein_cluster
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrmsd.cl
212 lines (199 loc) · 6.28 KB
/
rmsd.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
//Ling-Hong Hung 2013
#define zero4 ((float4)(0.0f,0.0f,0.0f,0.0f))
#define sqrt3 1.73205080756888
#define inv3 0.33333333333333
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
float svd(float4 r0, float4 r1, float4 r2);
__kernel void rmsd_matrix (int4 sizes,int4 start_points,__global float4 *coords4,__global float *rmsds)
{
//outputs all vs all matrix
//stuff that doesn't change with the threads
const int ng=get_global_size(0);
const int nt=get_local_size(0);
const int nat=sizes.x,nat4=sizes.y,nwu=sizes.w;
const float invfnat=1.0f/(float)nat;
//initializing private thread dependent variables
int gid=get_global_id(0)/nt; //workgroup id
int t=get_local_id(0); //thread in workgroup
int nwg=ng/nt; //number of workgroups - stride for sampling different pairs of structures
local float4 lcoords[PDB4SIZE]; //stores coords of first structure using an interweaved pattern
int4 index=start_points;
int pstartx=index.x;
index=(int4)(index.x+gid,index.x+gid+1,index.z,index.w);
int work_i=index.z*gid-(gid*(gid+1)/2);
float ssq1;
//main loop - threads will continue until the workunits are done or all the structures are looped through
while(index.x < pstartx +index.z -1 && work_i < nwu)
{
//read coords
{
int i=t;
ssq1=0;
while(i<nat4*3)
{
float4 pcoords1=coords4[nat4*3*index.x+i];
ssq1+=dot(pcoords1,pcoords1);
lcoords[i]=pcoords1;
i+=nt;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//reduce ssq
{
local float local_sum[NTHREADS];
int max_items=nt;
int diff=max_items/2;
local_sum[t]=ssq1;
barrier(CLK_LOCAL_MEM_FENCE);
while(max_items >1)
{
if(t+diff<max_items)
local_sum[t]+=local_sum[t+diff];
max_items-=diff;
diff=max_items/2;
barrier(CLK_LOCAL_MEM_FENCE);
}
ssq1=local_sum[0];
}
work_i+=t;
index.y=index.x+1+t;
while(index.y <index.w && work_i <nwu)
{
//now read in j - each thread does this individually
int m=0;
int offset=nat4*3*index.y;
float4 r0=zero4,r1=zero4, r2=zero4;
float ssq=0.0f;
while(m<3*nat4)
{
float4 c1x=lcoords[m++];float4 c1y=lcoords[m++];float4 c1z=lcoords[m++];
float4 c2x=coords4[offset++]; float4 c2y=coords4[offset++]; float4 c2z=coords4[offset++];
ssq+=dot(c2x,c2x)+dot(c2y,c2y)+dot(c2z,c2z);
r0+=(float4)(dot(c1x,c2x),dot(c1x,c2y),dot(c1x,c2z),0);
r1+=(float4)(dot(c1y,c2x),dot(c1y,c2y),dot(c1y,c2z),0);
r2+=(float4)(dot(c1z,c2x),dot(c1z,c2y),dot(c1z,c2z),0);
}
float ev=svd(r0,r1,r2);
float rms=(ssq+ssq1-2.0f*ev)*invfnat;
rmsds[work_i]=(rms>0) ? sqrt(rms) : 0;
work_i+=nt;
index.y+=nt;
}
index.x+=nwg;
barrier(CLK_LOCAL_MEM_FENCE);
{
int h=index.x-pstartx;
work_i=index.z*h-(h*(h+1)/2);
}
}
}
__kernel void rmsd_matrix_rect (int4 sizes,int4 start_points,__global float4 *coords4,__global float4 *coords42,__global float *rmsds)
{
//outputs all vs all matrix
//stuff that doesn't change with the threads
const int ng=get_global_size(0);
const int nt=get_local_size(0);
const int nat=sizes.x,nat4=sizes.y,nwu=sizes.w; //nstructs is kept in sizes.z - not used
const float invfnat=1.0f/(float)nat;
//initializing private thread dependent variables
int gid=get_global_id(0)/nt; //workgroup id
int t=get_local_id(0); //thread in workgroup
int nwg=ng/nt; //number of workgroups - stride for sampling different pairs of structures
local float4 lcoords[PDB4SIZE]; //stores coords of first structure using an interweaved pattern
int4 index=start_points;
int pstartx=index.x;
int pstarty=index.y;
index=(int4)(index.x+gid,index.y,index.z,index.w);
int work_i=index.w*gid;
float ssq1;
//main loop - threads will continue until the workunits are done or all the structures are looped through
while(index.x < pstartx +index.z && work_i < nwu)
{
//read coords
{
int i=t;
ssq1=0;
while(i<nat4*3)
{
float4 pcoords1=coords4[nat4*3*index.x+i];
ssq1+=dot(pcoords1,pcoords1);
lcoords[i]=pcoords1;
i+=nt;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
//reduce ssq
{
local float local_sum[NTHREADS];
int max_items=nt;
int diff=max_items/2;
local_sum[t]=ssq1;
barrier(CLK_LOCAL_MEM_FENCE);
while(max_items >1)
{
if(t+diff<max_items)
local_sum[t]+=local_sum[t+diff];
max_items-=diff;
diff=max_items/2;
barrier(CLK_LOCAL_MEM_FENCE);
}
ssq1=local_sum[0];
}
work_i+=t;
index.y=pstarty+t;
while(index.y <pstarty+index.w && work_i <nwu)
{
//now read in j - each thread does this individually
int m=0;
int offset=nat4*3*index.y;
float4 r0=zero4,r1=zero4, r2=zero4;
float ssq=0.0f;
while(m<3*nat4)
{
float4 c1x=lcoords[m++];float4 c1y=lcoords[m++];float4 c1z=lcoords[m++];
float4 c2x=coords42[offset++]; float4 c2y=coords42[offset++]; float4 c2z=coords42[offset++];
ssq+=dot(c2x,c2x)+dot(c2y,c2y)+dot(c2z,c2z);
r0+=(float4)(dot(c1x,c2x),dot(c1x,c2y),dot(c1x,c2z),0);
r1+=(float4)(dot(c1y,c2x),dot(c1y,c2y),dot(c1y,c2z),0);
r2+=(float4)(dot(c1z,c2x),dot(c1z,c2y),dot(c1z,c2z),0);
}
float ev=svd(r0,r1,r2);
float rms=(ssq+ssq1-2.0f*ev)*invfnat;
rmsds[work_i]=(rms>0) ? sqrt(rms) : 0;
work_i+=nt;
index.y+=nt;
}
index.x+=nwg;
barrier(CLK_LOCAL_MEM_FENCE);
work_i=index.w*(index.x-pstartx);
}
}
float svd(float4 r0, float4 r1, float4 r2)
{
double det= r0.x * ( (r1.y*r2.z) - (r1.z*r2.y) )- r0.y * ( (r1.x*r2.z) - (r1.z*r2.x) ) + r0.z * ( (r1.x*r2.y) - (r1.y*r2.x) );
float d;
//lower triangular matrix rr
float rr0=dot(r0,r0); float rr1=dot(r1,r0);float rr2= dot(r1,r1);float rr3= dot(r2,r0);float rr4= dot(r2,r1);float rr5=dot(r2,r2);
double spur=(rr0+rr2+rr5)/ 3.0;
double cof=(rr2*rr5 - rr4*rr4 + rr0*rr5- rr3*rr3 + rr0*rr2 - rr1*rr1)*inv3;
double4 e;
e.x=spur; e.y=spur;e.z=spur;
double h=( spur > 0 )? spur*spur-cof : -1.0;
if(h>0)
{
double g = (spur*cof - det*det)/2.0 - spur*h;
double sqrth = sqrt(h);
double d1 = h*h*h - g*g;
d1= ( d1<0 ) ? atan2(0,-g) / 3.0 : atan2(sqrt(d1),-g)*inv3;
double cth = sqrth * cos(d1);
double sth = sqrth*sqrt3*sin(d1);
e.x=spur + cth + cth;e.y=spur - cth + sth;e.z=spur - cth - sth;
}
e.x=(e.x < 0) ? 0 : sqrt((float)e.x);
e.y=(e.y < 0) ? 0 : sqrt((float)e.y);
e.z=(e.z < 0) ? 0 : sqrt((float)e.z);
d=(det<0)? e.x + e.y -e.z : e.x + e.y+e.z;
return(d);
}