-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathparallel_selection.cl
120 lines (120 loc) · 3.54 KB
/
parallel_selection.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include "FGPUlib.c"
#include "comparesf2.c"
// Rank sort of 32-bit integers.
// Each work-item owns one input element, counts how many elements have to
// come before it in sorted order, and stores its element at that index.
//
//   in  - values to sort; get_global_size(0) elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count, so the kernel
// expects one work-item per input element.
__kernel void ParallelSelection(__global int *in, __global int *out)
{
    const int self = get_global_id(0);     // index of the element owned here
    const int count = get_global_size(0);  // number of elements to scan
    const int key = in[self];

    int rank = 0;
    int idx = 0;
    do {
        // On a given iteration every work-item reads the same in[idx].
        const int other = in[idx];
        // in[idx] precedes our element when it is strictly smaller, or when
        // it is equal and sits at a lower index. The index tie-break keeps
        // equal keys in input order and gives every element a distinct rank.
        if (other < key || (other == key && idx < self)) {
            ++rank;
        }
        ++idx;
    } while (idx != count);

    out[rank] = key;
}
// Rank sort of 16-bit signed integers, one scalar load per comparison.
// Each work-item computes the sorted position of its own element and writes
// the element there.
//
//   in  - values to sort; get_global_size(0) elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count.
__kernel void ParallelSelection_half(__global short *in, __global short *out)
{
    const int self = get_global_id(0);     // index of the element owned here
    const int count = get_global_size(0);  // number of elements to scan
    const int key = in[self];              // widened to int for the compares

    int rank = 0;
    int idx = 0;
    do {
        // Same address for every work-item on a given iteration.
        const int other = in[idx];
        const bool precedes = (other < key) || (other == key && idx < self);
        // Ties are ordered by input index so equal keys get distinct ranks.
        rank += precedes ? 1 : 0;
        idx += 1;
    } while (idx != count);

    out[rank] = key;
}
// Rank sort of 16-bit signed integers that fetches two elements per load.
// Each work-item computes the sorted position of its own element and writes
// the element there.
//
//   in  - values to sort, viewed as short2 pairs; get_global_size(0) scalar
//         elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count. An odd element
// count is supported: the last unpaired element is read as a scalar, so the
// loop always terminates and no load reaches past element n-1.
__kernel void ParallelSelection_half_improved(__global short2* in,__global short* out){
    int i = get_global_id(0);   // index of the element owned by this work-item
    int n = get_global_size(0); // number of scalar elements
    // Scalar view of the same buffer, used for the own element and the tail.
    __global short *in_short = (__global short*) in;
    int ith = in_short[i];
    int pos = 0, j = 0;

    // Largest even count <= n: only complete pairs go through the short2 path.
    const int paired_end = n & ~1;
    while (j < paired_end)
    {
        short2 tmp = in[j >> 1]; // elements j and j+1 in a single load

        int jth = tmp.x;
        // Precedes when strictly smaller, or equal with a lower input index.
        pos += (jth < ith) || (jth == ith && j < i);
        j++;

        jth = tmp.y;
        pos += (jth < ith) || (jth == ith && j < i);
        j++;
    }

    // Odd element count: compare against the final element on its own.
    if (j < n)
    {
        int jth = in_short[j];
        pos += (jth < ith) || (jth == ith && j < i);
    }

    out[pos] = ith;
}
// Rank sort of unsigned bytes that fetches four elements per load.
// Each work-item computes the sorted position of its own element and writes
// the element there.
//
//   in  - values to sort, viewed as uchar4 groups; get_global_size(0) scalar
//         elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count. Counts that
// are not a multiple of four are supported: the remaining one to three
// elements are read as scalars, so the loop always terminates and no load
// reaches past element n-1.
__kernel void ParallelSelection_byte_improved(__global uchar4* in,__global uchar* out){
    unsigned i = get_global_id(0);   // index of the element owned by this work-item
    unsigned n = get_global_size(0); // number of scalar elements
    // Scalar view of the same buffer, used for the own element and the tail.
    __global unsigned char *in_char = (__global unsigned char*) in;
    unsigned ith = in_char[i];
    unsigned pos = 0, j = 0;

    // Largest multiple of four <= n: only complete groups use the uchar4 path.
    const unsigned grouped_end = n & ~3u;
    while (j < grouped_end)
    {
        uchar4 tmp = in[j >> 2]; // elements j .. j+3 in a single load

        unsigned jth = tmp.x;
        // Precedes when strictly smaller, or equal with a lower input index.
        pos += (jth < ith) || (jth == ith && j < i);
        j++;

        jth = tmp.y;
        pos += (jth < ith) || (jth == ith && j < i);
        j++;

        jth = tmp.z;
        pos += (jth < ith) || (jth == ith && j < i);
        j++;

        jth = tmp.w;
        pos += (jth < ith) || (jth == ith && j < i);
        j++;
    }

    // Leftover elements when n is not a multiple of four.
    for (; j < n; j++)
    {
        unsigned jth = in_char[j];
        pos += (jth < ith) || (jth == ith && j < i);
    }

    out[pos] = ith;
}
// Rank sort of unsigned bytes, one scalar load per comparison.
// Each work-item computes the sorted position of its own element and writes
// the element there.
//
//   in  - values to sort; get_global_size(0) elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count.
__kernel void ParallelSelection_byte(__global unsigned char *in, __global unsigned char *out)
{
    // Work-item index and element count are non-negative, so they are held
    // as unsigned to match the loop counter they are compared against.
    const unsigned self = get_global_id(0);
    const unsigned count = get_global_size(0);
    const unsigned key = in[self];

    unsigned rank = 0;
    unsigned idx = 0;
    do {
        // Same address for every work-item on a given iteration.
        const unsigned other = in[idx];
        // Strictly smaller values come first; equal values are ordered by
        // their input index so that each element gets its own rank.
        if (other < key || (other == key && idx < self)) {
            rank += 1;
        }
        idx += 1;
    } while (idx != count);

    out[rank] = key;
}
// Rank sort of single-precision floats.
// Each work-item computes the sorted position of its own element and writes
// the element there.
//
//   in  - values to sort; get_global_size(0) elements are read
//   out - destination array; this work-item writes exactly one slot
//
// The global size of dimension 0 is used as the element count.
// NOTE(review): under IEEE comparison rules a NaN element compares false in
// both tests below and would end up with rank 0 - confirm inputs are NaN-free.
__kernel void ParallelSelection_float(__global float *in, __global float *out)
{
    const int self = get_global_id(0);     // index of the element owned here
    const int count = get_global_size(0);  // number of elements to scan
    const float key = in[self];

    int rank = 0;
    int idx = 0;
    do {
        // Same address for every work-item on a given iteration.
        const float other = in[idx];
        const bool is_less = other < key;
        const bool is_earlier_tie = (other == key) && (idx < self);
        // Ties are ordered by input index so equal keys get distinct ranks.
        if (is_less || is_earlier_tie) {
            ++rank;
        }
        ++idx;
    } while (idx != count);

    out[rank] = key;
}