-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu_trace.cu
223 lines (189 loc) · 7.32 KB
/
gpu_trace.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include "include/gpu_trace.hu"
/*
Error handling helpers
*/
#define gpu_err_check(ans) { gpu_assert((ans), __FILE__, __LINE__); }
inline void
gpu_assert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
__host__ __device__ uint32_t
get_idx_of_first_intersection(sphere *spheres,
uint32_t num_spheres,
vector ray_start,
vector ray_dir,
vector *intersection)
{
float ray_len = sqrt(ray_dir.x * ray_dir.x +
ray_dir.y * ray_dir.y +
ray_dir.z * ray_dir.z);
float ray_x_norm = ray_dir.x / ray_len;
float ray_y_norm = ray_dir.y / ray_len;
float ray_z_norm = ray_dir.z / ray_len;
float min_dist = FLT_MAX;
uint32_t winner_idx = num_spheres;
for (uint32_t sphere_idx = 0; sphere_idx < num_spheres; sphere_idx++){
sphere sphere = spheres[sphere_idx];
vector vec_to_center = {
ray_start.x - sphere.center.x,
ray_start.y - sphere.center.y,
ray_start.z - sphere.center.z,
};
float a = 1;
float b = 2*(ray_x_norm * vec_to_center.x +
ray_y_norm * vec_to_center.y +
ray_z_norm * vec_to_center.z);
float c = (vec_to_center.x * vec_to_center.x +
vec_to_center.y * vec_to_center.y +
vec_to_center.z * vec_to_center.z -
sphere.radius * sphere.radius);
float discriminant = b*b - 4*a*c;
if (discriminant < 0) {
continue;
}
float x1 = (-b - sqrt(discriminant))/(2*a);
float x2 = (-b + sqrt(discriminant))/(2*a);
float *x = NULL;
if (x1 > 0.1 && x1 < min_dist){
x = &x1;
}
else if (x2 > 0.1 && x2 < min_dist){ // x2 must be > x1!
x = &x2;
}
if (x != NULL) {
min_dist = *x;
winner_idx = sphere_idx;
*intersection = (vector){
ray_start.x + *x * ray_x_norm,
ray_start.y + *x * ray_y_norm,
ray_start.z + *x * ray_z_norm,
};
}
}
return winner_idx;
}
__global__ void
cuda_trace_kernel(sphere *spheres,
uint32_t num_spheres,
float canvas_min_x,
float canvas_max_x,
float canvas_min_z,
float canvas_max_z,
float canvas_y,
uint32_t num_pixels_x,
uint32_t num_pixels_z,
uint32_t *canvas)
{
// asm: eye is at (0,0,0)
// asm: canvas is aligned to y-axis
// asm: no object means black
uint32_t thread_index = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t grid_size = gridDim.x * blockDim.x;
while (thread_index < num_pixels_x * num_pixels_z) {
// initialize canvas to black
canvas[thread_index] = 67108863;
vector zero_vec = {0.0,0.0,0.0};
float canvas_width = canvas_max_x - canvas_min_x;
float canvas_x_incr = canvas_width / num_pixels_x;
float canvas_height = canvas_max_z - canvas_min_z;
float canvas_z_incr = canvas_height / num_pixels_z;
uint32_t x_idx = thread_index % num_pixels_x;
uint32_t z_idx = thread_index / num_pixels_x;
vector ray_dir = {
canvas_min_x + (x_idx+0.5) * canvas_x_incr,
canvas_y,
canvas_min_z + (z_idx+0.5) * canvas_z_incr
};
vector intersection;
uint32_t winner_sphere_idx =
get_idx_of_first_intersection(spheres,
num_spheres,
zero_vec,
ray_dir,
&intersection);
if (winner_sphere_idx < num_spheres) {
// intersecting sphere found
sphere winner_sphere = spheres[winner_sphere_idx];
// only intersection color
canvas[thread_index] = winner_sphere.color;
}
thread_index += grid_size;
}
}
/*
( 8) Multiprocessors, (192) CUDA Cores/MP: 1536 CUDA Cores
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
*/
void gpu_trace(sphere **spheres,
uint32_t num_spheres,
float canvas_min_x,
float canvas_max_x,
float canvas_min_z,
float canvas_max_z,
float canvas_y,
uint32_t num_pixels_x,
uint32_t num_pixels_z,
uint32_t *canvas)
{
// Allocate GPU memory for the spheres and copy them over
// TODO: use constant memory for the spheres
sphere *gpu_spheres;
gpu_err_check(cudaMalloc((void **) &gpu_spheres, sizeof(sphere) * num_spheres));
for (int i = 0; i < num_spheres; i++){
gpu_err_check(cudaMemcpy(&gpu_spheres[i],
spheres[i],
sizeof(sphere),
cudaMemcpyHostToDevice));
}
// Allocate GPU memory for the canvas
uint32_t *gpu_canvas;
gpu_err_check(cudaMalloc((void **) &gpu_canvas,
sizeof(uint32_t) * num_pixels_x * num_pixels_z));
// Call the kernel using one thread per pixel
// want multiple warps per block and block size should be
// a multiple of 32.
// simply choose 32 threads_per_block initially
// TODO: try out different threads_per_block
uint32_t num_threads = num_pixels_x * num_pixels_z;
uint32_t num_blocks = (num_threads + 31) / 32;
cuda_trace_kernel<<<num_blocks, 32>>>(gpu_spheres,
num_spheres,
canvas_min_x,
canvas_max_x,
canvas_min_z,
canvas_max_z,
canvas_y,
num_pixels_x,
num_pixels_z,
gpu_canvas);
// Check for errors on kernel call
gpu_err_check(cudaGetLastError());
// Copy the canvas back to host memory
gpu_err_check(cudaMemcpy(canvas,
gpu_canvas,
sizeof(uint32_t) * num_pixels_x * num_pixels_z,
cudaMemcpyDeviceToHost));
// Free the gpu resources
gpu_err_check(cudaFree(gpu_canvas));
gpu_err_check(cudaFree(gpu_spheres));
}