mogp_emulator/mogp_fpga/device/prediction.cl at ecbf2c71f496591c91af60f7a12dcc7bfa8fb760 · yeminaga/mogp_emulator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#define MAX_DIM 64
#define MAX_NX 128
#define MAX_NXSTAR 128

// Determine K(X,X*) for the squared exponential Kernel
//
// x is an (nx,dim) array representing nx training inputs of dimension dim
//
// xstar is an (nxstar,dim) array representing nxstar prediction inputs of
// dimension dim
//
// scale is a (dim) array or scaling parameters. The difference between
// dimension i, for each pair of x and y vectors are divided by scale[i]
//
// sigma is a prefactor, each element of K is multiplied by exp(sigma)
//
// The output is a (nxstar,nx) array which is written one column at a time to
// the pipe k1, or k1 and k2  when var=1
//
// Optionally the pairwise distances may also be sent to the pipe r (for
// derivative calculation) when deriv=1
//
// nx, nxstar are the number of vectors in x and xstar respectively
//
// dim is the number of dimensions in each of the vectors of x and xstar
kernel void sq_exp(global float* restrict x, global float* restrict xstar,
                   write_only pipe float __attribute__((blocking)) k1,
                   write_only pipe float __attribute__((blocking)) k2,
                   write_only pipe float __attribute__((blocking)) r,
                   global float* restrict scale, float sigma,
                   int nx, int nxstar, int dim, int var, int deriv){

    // Determine exponential of sigma
    //local float exp_sigma;
    //exp_sigma = exp(sigma);

    // Calculate one column of k at a time
    for(unsigned col=0; col<nxstar; col++){
        int col_stride = col*dim;
        // Declare local array for column of r
        for(unsigned row=0; row<nx; row++){
            int row_stride = row*dim;
            // Determine the element k[row,col], the squared euclidean
            // distance between x[row] and xstar[col]
            float elem = 0;
            float x_;
            float xstar_;
            for(unsigned i=0; i<dim; i++){
                x_ = x[row_stride+i];
                xstar_ = xstar[col_stride+i];
                float difference = x_ - xstar_;
                elem += (difference * difference) * exp(scale[i]);
            }

            // Calculate pairwise distance if needed for derivatives
            if (deriv){
                //float dist = sqrt(elem);
                float dist = elem;
                write_pipe(r, &dist);
            }

            // Calculate the element k[row,col] of the square exponential kernel
            elem = exp(-0.5f * elem + sigma);
            // Push the element to the pipe
            write_pipe(k1, &elem);
            if (var){
                write_pipe(k2, &elem);
            }
        }
    }
}

// Determine the expectation values of predictions
//
// k1 is a pipe which provides the square exponential kernel of X and X* in
// column major order
//
// invqt is an array of length nx calculated during training which is used to
// determine expectation values of the predictions.
// Q = K(X,X), invqt = Q^-1 * Y
//
// expectation is an array of length nxstar where the predictions' expectation
// values are written
//
// nx and nxstar, as before, are the number of inputs in X and X* respectively
kernel void expectation(
    read_only pipe float __attribute__((blocking)) k1,
    global float* restrict invqt,
    global float* restrict expectation,
    int nx, int nxstar
    ){
    // Calculate one predictions expectation value at time by finding the 'dot
    // product' of one column of k with invqt
    for (unsigned col=0; col<nxstar; col++){
        float sum = 0;
        for (unsigned i=0; i<nx; i++){
            float temp;
            read_pipe(k1, &temp); /*k_cache is only initialized for the first nx values.
                                Assuming that MAX_NX is larger than nx, it means that the remaining addresses have no value.
                                Also we don't need to have a separate array for k (no need for k_cache, because we use the values as soon as we read them and we don't need them later on.
                                So we reduced the number of for loops to 2 and got rid of the separate array.
                                But the compiler is smart enough to see that k_cache was not used after so it wasn't occupying extra resources
                                in the final hardware.*/
            sum += temp * invqt[i];
        }
        expectation[col] = sum;
    }
}

// Determine the variance of predictions
//
// k2 is a pipe which provides the square exponential kernel of X and X* in
// column major order
//
// variance is an array of length nxstar where the predictions' variance
// values are written
//
// cholesky_factor is the (nx,nx) lower triangular cholesky factor of K(X,X)
//
// sigma, as before, is the prefactor in the squared exponential kernel
//
// nx and nxstar, as before, are the number of inputs in X and X* respectively
kernel void variance(
    read_only pipe float __attribute__((blocking)) k2,
    global float* restrict variance,
    global float* restrict cholesky_factor, float sigma, int nx, int nxstar
    ){
// Take exponential of sigma
local float exp_sigma;
exp_sigma = exp(sigma);

local float chol_cache[MAX_NX*MAX_NX];

for (unsigned i=0; i<nx*nx; i++){
    chol_cache[i] = cholesky_factor[i];
}

// Calculate variance for one prediction per iteration
#pragma max_concurrency 1
for (unsigned col=0; col<nxstar; col++){
    // Cache column of k
    // Forward substitution variables x,y
    float k_cache[MAX_NX];
    float y[MAX_NX];
    float x[MAX_NX];

    for (unsigned i=0; i<nx; i++){
        read_pipe(k2, &k_cache[i]);
        y[i] = 0.0;
        x[i] = 0.0;
    }

    // Cholesky solve for one column of k
    // Forward substitution

    for (unsigned i=0; i<nx; i++){
        int offset = i*nx;
        float temp;
        temp = 0.0;
        for (int t=nx-1; t > -1; t--){
            temp += chol_cache[offset+t] * y[t];
        }
        temp = k_cache[i] - temp;
        y[i] = temp / chol_cache[offset+i];
    }
    for (int t=nx-1; t > -1; t--){
        float temp;
        temp = 0.0;
        for (unsigned j=0; j<nx; j++){
            temp += chol_cache[j*nx+t] * x[j];
        //printf("j*nx+t = %d\n", j*nx+t);
        }
        temp = y[t] - temp;
        x[t] = temp / chol_cache[t*nx+t];
    }

    float var = 0;
    for (unsigned i=0; i<nx; i++){
        var += k_cache[i] * x[i];
    }
    // Subtract from hyperparameter
    var = exp_sigma - var;
    variance[col] = max(var, 0.0f);
    }
}

// Determine the derivatives
//
// The pairwise distance matrix is passed, one column at a time, through the
// pipe r.
//
// The entire matrix is needed for parts of the calculation (?) so not all of
// this kernel can run concurrently with the others. However, the derivative of
// the kernel matrix with respect to r can be calculated as r values are read
// in.
//
// r is a pipe which supplies the distance between each pair of (X, X*) in
// column major order (as with the k pipes)
//
// deriv is an (nxstar,dim) array giving the derivative of each prediction with
// respect to each dimension of the inputs X*
//
// x is an (nx,dim) array representing nx training inputs of dimension dim
//
// xstar is an (nxstar,dim) array representing nxstar prediction inputs of
// dimension dim
//
// invqt is an array of length nx calculated during training which is used to
// determine expectation values of the predictions.
// Q = K(X,X), invqt = Q^-1 * Y
//
// scale is a (dim) array or scaling parameters. The difference between
// dimension i, for each pair of x and y vectors are divided by scale[i]
//
// sigma, as before, is the prefactor in the squared exponential kernel
//
// nx and nxstar, as before, are the number of inputs in X and X* respectively
//
// dim is the number of dimensions in each of the vectors of x and xstar
kernel void derivatives(
        read_only pipe float __attribute__((blocking)) r,
        global float* restrict deriv,
        global float* restrict x, global float* restrict xstar,
        global float* restrict invqt, global float* restrict scale,
        float sigma, int nx, int nxstar, int dim){

    // Determine exponential of sigma
    //local float exp_sigma;
    //exp_sigma = exp(sigma);

    // Cache entire r matrix and calculate dK/dr
    local float r_cache[MAX_NX*MAX_NXSTAR];
    for (int i=0; i<nx*nxstar; i++){
        read_pipe(r, &r_cache[i]);
    }
    // At this point the exp_sq kernel has finished and determined all pairwise
    // distances
    // For each dimension...

    #pragma max_concurrency 1  //used to limit the concurrency of the loop to 1, in order to reduce the amount of RAMs used. The max_concurrency pragma enables you to control the on-chip memory resources required to pipeline your loop.
    for (int d=0; d<dim; d++){
        float scale_d = exp(scale[d]);
        // Calculate dr/dx for dimension dim
        float drdx[MAX_NX*MAX_NXSTAR];
        for (int row=0; row<nxstar; row++){
            int row_stride = row*nx;
            for (int col=0; col<nx; col++){
                int index = row_stride + col;
                float value;
                float x_;
                float xstar_;
                x_ = x[col*dim+d];
                xstar_= xstar[row*dim+d];
                // xstar - x in dimension dim only
                value = xstar_ - x_;
                value *= scale_d;
                value *= rsqrt(r_cache[index]);
                drdx[index] = value;
            }
        }

        // Calculate d(expectation)/dx for each expectation value for dimension
        // dim as the dot product of dK/dx and InvQt
        // These vectors are the columns of deriv
        for (int row=0; row<nxstar; row++){
            float sum = 0;
            float dKdx;
            float dKdr;
            float dist;
            int row_stride = row*nx;
            for (int col=0; col<nx; col++){
                dist = r_cache[row_stride+col];
                dKdr = -1.0 * sqrt(dist) * exp(-0.5 * dist + sigma);
                dKdx = dKdr * drdx[row_stride+col];
                sum += dKdx * invqt[col];
            }
            deriv[row_stride+d] = sum;
        }
    }
}