forked from NVIDIA/NvPipe
-
Notifications
You must be signed in to change notification settings - Fork 1
/
convert.cu
230 lines (213 loc) · 9.24 KB
/
convert.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/* Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Performance note: Typically the color space conversions take up a
* negligible amount of run time. The following kernels have therefore
* not been optimized.
*/
#include <cassert>
#include <cstddef>
#include <cinttypes>
#include <device_functions.h>
#include <cuda.h>
static inline __device__ float
clamp(const float v, const float low, const float high) {
return v < low ? low : v > high ? high : v;
}
static inline __device__ float
rgb2y(const uint8_t r, const uint8_t g, const uint8_t b) {
return 0.299f*(float)r + 0.587f*(float)g + 0.114f*(float)b;
}
static inline __device__ float
rgb2u(const uint8_t r, const uint8_t g, const uint8_t b) {
const float y = rgb2y(r,g,b);
return clamp(-(((-(float)b + y) / 1.732446f) - 128.f), 0.f, 255.f);
}
static inline __device__ float
rgb2v(const uint8_t r, const uint8_t g, const uint8_t b) {
const float y = rgb2y(r,g,b);
const float u = rgb2u(r,g,b);
return clamp((y - (0.337633f*(u-128.f)) - (float)g) / 0.698001f + 128.f,
0.f, 255.f);
}
/* Converts from RGB data to NV12. NV12's "U" and "V" channels are interleaved
* and subsampled 2x2. Note the RGB data are not pitched. */
extern "C" __global__ void
rgb2yuv(const uint8_t* const __restrict rgb,
const uint32_t width, const uint32_t height,
const uint32_t widthUser, const uint32_t heightUser, const uint32_t c/*omponents*/,
uint8_t* const __restrict yuv, unsigned pitch) {
const uint32_t x = blockIdx.x*blockDim.x + threadIdx.x;
const uint32_t y = blockIdx.y*blockDim.y + threadIdx.y;
const uint32_t i = y*pitch + x;
if(x >= width || y >= height || i >= pitch*height)
return;
/* Repeat edge pixels for padded areas */
const uint32_t _x = min(x, widthUser - 1);
const uint32_t _y = min(y, heightUser - 1);
const uint32_t j = _y * widthUser + _x;
assert(pitch >= width);
assert(i < pitch * height);
assert(j < widthUser * heightUser);
assert(width <= 4096);
assert(height <= 4096);
assert(c == 3 || c == 4);
assert(pitch <= 4096);
uint8_t* __restrict Y = yuv;
Y[i] = (uint8_t)clamp(rgb2y(rgb[j*c+0], rgb[j*c+1], rgb[j*c+2]), 0, 255);
/* U+V are downsampled 2x per dimension. So kill off 3 of every 4 threads
* that reach here; only one will do the writes into U and V. */
/* thought: use x0 to write into U and x1 to write into V, to spread load? */
if(x&1 == 1 || y&1 == 1) {
return;
}
uint8_t* __restrict uv = yuv + pitch*height;
const uint32_t uvidx = y/2*(pitch/2) + x/2;
const uint32_t idx[4] = {
min((_y+0)*widthUser + _x+0, widthUser*heightUser - 1),
min((_y+0)*widthUser + _x+1, widthUser*heightUser - 1),
min((_y+1)*widthUser + _x+0, widthUser*heightUser - 1),
min((_y+1)*widthUser + _x+1, widthUser*heightUser - 1),
};
const float u[4] = {
rgb2u(rgb[idx[0]*c+0], rgb[idx[0]*c+1], rgb[idx[0]*c+2]),
rgb2u(rgb[idx[1]*c+0], rgb[idx[1]*c+1], rgb[idx[1]*c+2]),
rgb2u(rgb[idx[2]*c+0], rgb[idx[2]*c+1], rgb[idx[2]*c+2]),
rgb2u(rgb[idx[3]*c+0], rgb[idx[3]*c+1], rgb[idx[3]*c+2])
};
const float v[4] = {
rgb2v(rgb[idx[0]*c+0], rgb[idx[0]*c+1], rgb[idx[0]*c+2]),
rgb2v(rgb[idx[1]*c+0], rgb[idx[1]*c+1], rgb[idx[1]*c+2]),
rgb2v(rgb[idx[2]*c+0], rgb[idx[2]*c+1], rgb[idx[2]*c+2]),
rgb2v(rgb[idx[3]*c+0], rgb[idx[3]*c+1], rgb[idx[3]*c+2])
};
uv[uvidx*2+0] = (uint8_t)clamp((u[0] + u[1] + u[2] + u[3]) / 4.0, 0, 255);
uv[uvidx*2+1] = (uint8_t)clamp((v[0] + v[1] + v[2] + v[3]) / 4.0, 0, 255);
}
static inline __device__ float
yuv2r(const uint8_t y, const uint8_t u, const uint8_t v) {
(void)u;
return (y-16)*1.164f + (1.596f * (v-128));
}
static inline __device__ float
yuv2g(const uint8_t y, const uint8_t u, const uint8_t v) {
return (y-16)*1.164f + (u-128)*-0.392f + (v-128)*-0.813f;
}
static inline __device__ float
yuv2b(const uint8_t y, const uint8_t u, const uint8_t v) {
(void)v;
return (y-16)*1.164f + (u-128)*2.017f;
}
/* Convert back from NV12 to RGB(A). Note the RGB buffer is not pitched. */
extern "C" __global__ void
yuv2rgb(const uint8_t* const __restrict yuv,
const uint32_t width, const uint32_t height,
uint32_t widthUser, uint32_t heightUser, const uint32_t components,
unsigned pitch, uint8_t* const __restrict rgb) {
const uint32_t x = blockIdx.x*blockDim.x + threadIdx.x;
const uint32_t y = blockIdx.y*blockDim.y + threadIdx.y;
const uint32_t i = y*pitch + x;
const uint32_t j = y*widthUser + x;
if(x >= widthUser || y >= heightUser || x >= width || y >= height || i >= pitch*height || j >= widthUser * heightUser) {
return;
}
assert(i < pitch*height);
assert(j < widthUser * heightUser);
assert(width <= 4096);
assert(height <= 4096);
assert(pitch <= 4096);
const uint8_t* __restrict Y = yuv;
const uint8_t* __restrict uv = yuv + pitch*height;
const uint32_t xx = min(x+1, width-1);
const uint32_t yy = min(y+1, height-1);
const uint32_t idx[4] = {
y/2*pitch/2 + x/2,
y/2*pitch/2 + xx/2,
yy/2*pitch/2 + x/2,
yy/2*pitch/2 + xx/2,
};
const uint8_t u[4] = {
uv[idx[0]*2+0], uv[idx[1]*2+0], uv[idx[2]*2+0], uv[idx[3]*2+0]
};
const uint8_t v[4] = {
uv[idx[0]*2+1], uv[idx[1]*2+1], uv[idx[2]*2+1], uv[idx[3]*2+1]
};
rgb[j*components+0] = clamp(
(yuv2r(Y[i], u[0], v[0]) + yuv2r(Y[i], u[1], v[1]) +
yuv2r(Y[i], u[2], v[2]) + yuv2r(Y[i], u[3], v[3])) / 4.0, 0, 255
);
rgb[j*components+1] = clamp(
(yuv2g(Y[i], u[0], v[0]) + yuv2g(Y[i], u[1], v[1]) +
yuv2g(Y[i], u[2], v[2]) + yuv2g(Y[i], u[3], v[3])) / 4.0, 0, 255
);
rgb[j*components+2] = clamp(
(yuv2b(Y[i], u[0], v[0]) + yuv2b(Y[i], u[1], v[1]) +
yuv2b(Y[i], u[2], v[2]) + yuv2b(Y[i], u[3], v[3])) / 4.0, 0, 255
);
if (components == 4)
rgb[j*components+3] = 255;
}
extern "C" cudaError_t
launch_yuv2rgb(CUdeviceptr nv12, uint32_t width, uint32_t height,
uint32_t widthUser, uint32_t heightUser, const uint32_t components,
unsigned pitch, CUdeviceptr rgb, cudaStream_t strm) {
/* NvCodec maxes out at 8k anyway. */
assert(width <= 8192);
assert(height <= 8192);
/* We only support RGB and RGBA data. */
assert(components == 3 || components == 4);
/* NvCodec can't give us a height that isn't evenly divisible. */
assert(height%2 == 0);
const void* args[] = {
(void*)&nv12, &width, &height, &widthUser, &heightUser, &components, &pitch, (void*)&rgb, 0
};
const dim3 gdim = {(unsigned)(width/16)+1, (unsigned)(height/2), 1};
const dim3 bdim = {16, 2, 1};
const size_t shmem = 0;
return cudaLaunchKernel((const void**)yuv2rgb, gdim, bdim, (void**)args,
shmem, strm);
}
extern "C" cudaError_t
launch_rgb2yuv(CUdeviceptr rgb, uint32_t width, uint32_t height,
uint32_t widthUser, uint32_t heightUser, uint32_t ncomp,
CUdeviceptr nv12, unsigned pitch, cudaStream_t strm) {
/* NvCodec maxes out at 8k anyway. */
assert(width <= 8192);
assert(height <= 8192);
/* We only support RGB and RGBA data. */
assert(ncomp == 3 || ncomp == 4);
/* NvCodec can't give us a height that isn't evenly divisible. */
assert(height%2 == 0);
const void* args[] = {
(void*)&rgb, &width, &height, &widthUser, &heightUser, &ncomp, (void*)&nv12, &pitch,
};
dim3 gdim = {(unsigned)(width/16)+1, (unsigned)(height/2), 1};
dim3 bdim = {16, 2, 1};
const size_t shmem = 0;
return cudaLaunchKernel((const void*)rgb2yuv, gdim, bdim, (void**)args,
shmem, strm);
}