diff --git a/src/cuda/cuda-conversion.cu b/src/cuda/cuda-conversion.cu index a9c04d7dbc..838834ffdc 100644 --- a/src/cuda/cuda-conversion.cu +++ b/src/cuda/cuda-conversion.cu @@ -233,10 +233,12 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n // How many super pixels do we have? int superPix = n / 2; - std::shared_ptr d_dst; - std::shared_ptr d_src = alloc_dev(superPix * 4); - auto result = cudaMemcpy(d_src.get(), h_src, superPix * sizeof(uint8_t) * 4, cudaMemcpyHostToDevice); + static DeviceBuffer d_dst; + static DeviceBuffer d_src; + d_src.reserve(superPix, 4); + + auto result = cudaMemcpy(d_src.data(), h_src, superPix * sizeof(uint8_t) * 4, cudaMemcpyHostToDevice); assert(result == cudaSuccess); int numBlocks = superPix / RS2_CUDA_THREADS_PER_BLOCK; @@ -253,28 +255,28 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n */ case RS2_FORMAT_Y16: size = 2; - d_dst = alloc_dev(n * size); - kernel_unpack_yuy2_y16_cuda << > > (d_src.get(), d_dst.get(), superPix); + d_dst.reserve(n * size); + kernel_unpack_yuy2_y16_cuda << > > (d_src.data(), d_dst.data(), superPix); break; case RS2_FORMAT_RGB8: size = 3; - d_dst = alloc_dev(n * size); - kernel_unpack_yuy2_rgb8_cuda << > > (d_src.get(), d_dst.get(), superPix); + d_dst.reserve(n * size); + kernel_unpack_yuy2_rgb8_cuda << > > (d_src.data(), d_dst.data(), superPix); break; case RS2_FORMAT_BGR8: size = 3; - d_dst = alloc_dev(n * size); - kernel_unpack_yuy2_bgr8_cuda << > > (d_src.get(), d_dst.get(), superPix); + d_dst.reserve(n * size); + kernel_unpack_yuy2_bgr8_cuda << > > (d_src.data(), d_dst.data(), superPix); break; case RS2_FORMAT_RGBA8: size = 4; - d_dst = alloc_dev(n * size); - kernel_unpack_yuy2_rgba8_cuda << > > (d_src.get(), d_dst.get(), superPix); + d_dst.reserve(n * size); + kernel_unpack_yuy2_rgba8_cuda << > > (d_src.data(), d_dst.data(), superPix); break; case RS2_FORMAT_BGRA8: size = 4; - d_dst = alloc_dev(n * size); - kernel_unpack_yuy2_bgra8_cuda << > > (d_src.get(), d_dst.get(), superPix); + d_dst.reserve(n * size); + kernel_unpack_yuy2_bgra8_cuda << > > (d_src.data(), d_dst.data(), superPix); break; default: assert(false); @@ -284,7 +286,7 @@ void rscuda::unpack_yuy2_cuda_helper(const uint8_t* h_src, uint8_t* h_dst, int n cudaDeviceSynchronize(); - result = cudaMemcpy(h_dst, d_dst.get(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost); + result = cudaMemcpy(h_dst, d_dst.data(), n * sizeof(uint8_t) * size, cudaMemcpyDeviceToHost); assert(result == cudaSuccess); /* cudaEventRecord(stop); diff --git a/src/cuda/rscuda_utils.cuh b/src/cuda/rscuda_utils.cuh index 27a7f7f6ef..4bdd6236b9 100644 --- a/src/cuda/rscuda_utils.cuh +++ b/src/cuda/rscuda_utils.cuh @@ -15,6 +15,54 @@ namespace rscuda { + + template + class DeviceBuffer final + { + public: + DeviceBuffer() = default; + + explicit DeviceBuffer(std::size_t const num_elements) : + data_{DeviceBuffer::allocateBuffer(num_elements)}, + size_{num_elements} + {} + + DeviceBuffer(std::size_t const num_elements, std::size_t const number_of_channels = 1U) : + DeviceBuffer{num_elements * number_of_channels} + {} + + void reserve(std::size_t const reserve_size) + { + if (size_ < reserve_size) + { + cudaFree(data_); + data_ = DeviceBuffer::allocateBuffer(reserve_size); + size_ = reserve_size; + } + } + + void reserve(std::size_t const reserve_size, std::size_t const reserve_channels) + { + reserve(reserve_size * reserve_channels); + } + + std::size_t size() const { return size_; } + + T* data() { return data_; } + + private: + static T* allocateBuffer(std::size_t const reserve_size) + { + T* datatemp{nullptr}; + cudaMalloc(&datatemp, reserve_size * sizeof(T)); + return datatemp; + } + + T* data_{}; + // Size is in number of elements, not bytes. + std::size_t size_{}; + }; + template std::shared_ptr alloc_dev(int elements) {