Skip to content

Commit

Permalink
swap to sophus, upgrade precision to double
Browse files Browse the repository at this point in the history
  • Loading branch information
mp3guy committed May 1, 2019
1 parent 89c9492 commit a58da27
Show file tree
Hide file tree
Showing 29 changed files with 925 additions and 494 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
[submodule "third-party/Eigen"]
path = third-party/Eigen
url = https://github.com/eigenteam/eigen-git-mirror.git
[submodule "third-party/Sophus"]
path = third-party/Sophus
url = https://github.com/strasdat/Sophus.git
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/CMakeMod
set(Pangolin_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/Pangolin/build/src" CACHE PATH "Pangolin build directory")
set(OPENNI2_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/OpenNI2/Include" CACHE PATH "OpenNI2 Include directory")
set(OPENNI2_LIBRARY "${CMAKE_CURRENT_SOURCE_DIR}/third-party/OpenNI2/Bin/x64-Release/libOpenNI2.so" CACHE PATH "OpenNI2 library")
set(Sophus_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/Sophus" CACHE PATH "Sophus Include directory")
set(efusion_SHADER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/Core/Shaders" CACHE PATH "Where the shaders live")

find_package(LAPACK REQUIRED)
Expand All @@ -25,6 +26,7 @@ include_directories(${Pangolin_INCLUDE_DIRS})
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${OPENNI2_INCLUDE_DIR})
include_directories(${SUITESPARSE_INCLUDE_DIRS})
include_directories(${Sophus_INCLUDE_DIR})

if(WITH_REALSENSE)
include_directories(${REALSENSE_INCLUDE_DIR})
Expand All @@ -36,6 +38,8 @@ file(GLOB srcs *.cpp *.h *.cu *.cuh)
file(GLOB tools_srcs Tools/*.cpp Tools/*.h Tools/*.cu Tools/*.cuh)

add_definitions(-Dlinux=1)
add_definitions(-DEIGEN_MAX_ALIGN_BYTES=0)
add_definitions(-DEIGEN_MAX_STATIC_ALIGN_BYTES=0)

set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-O3 -msse2 -msse3 -Wall -std=c++11 -DSHADER_DIR=${efusion_SHADER_DIR}")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -std=c++11 -DSHADER_DIR=${efusion_SHADER_DIR}")
Expand Down
2 changes: 1 addition & 1 deletion Core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ endforeach(NVCC_ARCH)
include("${CMAKE_MODULE_PATH}/CudaComputeTargetFlags.cmake")
APPEND_TARGET_ARCH_FLAGS()

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;--expt-relaxed-constexpr")

CUDA_COMPILE(cuda_objs ${cuda})

Expand Down
6 changes: 0 additions & 6 deletions Core/Cuda/cudafuncs.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,6 @@
#ifndef CUDA_CUDAFUNCS_CUH_
#define CUDA_CUDAFUNCS_CUH_

#if __CUDA_ARCH__ < 300
#define MAX_THREADS 512
#else
#define MAX_THREADS 1024
#endif

#include <cuda_runtime.h>
#include <stdint.h>
#include "containers/device_array.hpp"
Expand Down
247 changes: 247 additions & 0 deletions Core/Cuda/estimate.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#include "convenience.cuh"
#include "estimate.cuh"

template <int D>
__inline__ __device__ void warpReduceSum(Eigen::Matrix<float, D, 1>& val) {
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
#pragma unroll
for (int i = 0; i < D; i++) {
val[i] += __shfl_down_sync(0xFFFFFFFF, val[i], offset);
}
}
}

template <int D>
__inline__ __device__ void blockReduceSum(Eigen::Matrix<float, D, 1>& val) {
// Allocate shared memory in two steps otherwise NVCC complains about Eigen's
// non-empty constructor
static __shared__ unsigned char sharedMem[32 * sizeof(Eigen::Matrix<float, D, 1>)];

Eigen::Matrix<float, D, 1>(&shared)[32] =
reinterpret_cast<Eigen::Matrix<float, D, 1>(&)[32]>(sharedMem);

int lane = threadIdx.x % warpSize;

int wid = threadIdx.x / warpSize;

warpReduceSum(val);

// write reduced value to shared memory
if (lane == 0) {
shared[wid] = val;
}
__syncthreads();

// ensure we only grab a value from shared memory if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Eigen::Matrix<float, D, 1>::Zero();

if (wid == 0) {
warpReduceSum(val);
}
}

template <int D>
__global__ void reduceSum(Eigen::Matrix<float, D, 1>* in, Eigen::Matrix<float, D, 1>* out, int N) {
Eigen::Matrix<float, D, 1> sum = Eigen::Matrix<float, D, 1>::Zero();

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
sum += in[i];
}

blockReduceSum(sum);

if (threadIdx.x == 0) {
out[blockIdx.x] = sum;
}
}

struct Reduction {
Eigen::Matrix3f R_prev_curr;
Eigen::Vector3f t_prev_curr;

CameraModel intr;

PtrStep<float> vmap_curr;
PtrStep<float> nmap_curr;

PtrStep<float> vmap_prev;
PtrStep<float> nmap_prev;

float dist_thresh;
float angle_thresh;

int cols;
int rows;
int N;

Eigen::Matrix<float, 29, 1>* out;

// And now for some template metaprogramming magic
template <int outer, int inner, int end>
struct SquareUpperTriangularProduct {
__device__ __forceinline__ static void apply(
Eigen::Matrix<float, 29, 1>& values,
const float (&rows)[end + 1]) {
values[((end + 1) * outer) + inner - (outer * (outer + 1) / 2)] = rows[outer] * rows[inner];

SquareUpperTriangularProduct<outer, inner + 1, end>::apply(values, rows);
}
};

// Inner loop base
template <int outer, int end>
struct SquareUpperTriangularProduct<outer, end, end> {
__device__ __forceinline__ static void apply(
Eigen::Matrix<float, 29, 1>& values,
const float (&rows)[end + 1]) {
values[((end + 1) * outer) + end - (outer * (outer + 1) / 2)] = rows[outer] * rows[end];

SquareUpperTriangularProduct<outer + 1, outer + 1, end>::apply(values, rows);
}
};

// Outer loop base
template <int end>
struct SquareUpperTriangularProduct<end, end, end> {
__device__ __forceinline__ static void apply(
Eigen::Matrix<float, 29, 1>& values,
const float (&rows)[end + 1]) {
values[((end + 1) * end) + end - (end * (end + 1) / 2)] = rows[end] * rows[end];
}
};

__device__ __forceinline__ void operator()() const {
Eigen::Matrix<float, 29, 1> sum = Eigen::Matrix<float, 29, 1>::Zero();

SquareUpperTriangularProduct<0, 0, 6> sutp;

Eigen::Matrix<float, 29, 1> values;

for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
const int y = i / cols;
const int x = i - (y * cols);

const Eigen::Vector3f v_curr(
vmap_curr.ptr(y)[x], vmap_curr.ptr(y + rows)[x], vmap_curr.ptr(y + 2 * rows)[x]);

const Eigen::Vector3f v_curr_in_prev = R_prev_curr * v_curr + t_prev_curr;

const Eigen::Matrix<int, 2, 1> p_curr_in_prev(
__float2int_rn(v_curr_in_prev(0) * intr.fx / v_curr_in_prev(2) + intr.cx),
__float2int_rn(v_curr_in_prev(1) * intr.fy / v_curr_in_prev(2) + intr.cy));

float row[7] = {0, 0, 0, 0, 0, 0, 0};

values[28] = 0;

if (p_curr_in_prev(0) >= 0 && p_curr_in_prev(1) >= 0 && p_curr_in_prev(0) < cols &&
p_curr_in_prev(1) < rows && v_curr(2) > 0 && v_curr_in_prev(2) > 0) {
const Eigen::Vector3f v_prev(
vmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
vmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
vmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);

const Eigen::Vector3f n_curr(
nmap_curr.ptr(y)[x], nmap_curr.ptr(y + rows)[x], nmap_curr.ptr(y + 2 * rows)[x]);

const Eigen::Vector3f n_curr_in_prev = R_prev_curr * n_curr;

const Eigen::Vector3f n_prev(
nmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
nmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
nmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);

if (n_curr_in_prev.cross(n_prev).norm() < angle_thresh &&
(v_prev - v_curr_in_prev).norm() < dist_thresh && !isnan(n_curr(0)) &&
!isnan(n_prev(0))) {
*(Eigen::Vector3f*)&row[0] = n_prev;
*(Eigen::Vector3f*)&row[3] = v_curr_in_prev.cross(n_prev);
row[6] = n_prev.dot(v_prev - v_curr_in_prev);

values[28] = 1;

sutp.apply(values, row);

sum += values;
}
}
}

blockReduceSum(sum);

if (threadIdx.x == 0) {
out[blockIdx.x] = sum;
}
}
};

__global__ void estimateKernel(const Reduction reduction) {
reduction();
}

void estimateStep(
const Eigen::Matrix3f& R_prev_curr,
const Eigen::Vector3f& t_prev_curr,
const DeviceArray2D<float>& vmap_curr,
const DeviceArray2D<float>& nmap_curr,
const CameraModel& intr,
const DeviceArray2D<float>& vmap_prev,
const DeviceArray2D<float>& nmap_prev,
float dist_thresh,
float angle_thresh,
DeviceArray<Eigen::Matrix<float, 29, 1>>& sum,
DeviceArray<Eigen::Matrix<float, 29, 1>>& out,
float* matrixA_host,
float* vectorB_host,
float* residual_inliers) {
int cols = vmap_curr.cols();
int rows = vmap_curr.rows() / 3;

Reduction reduction;

reduction.R_prev_curr = R_prev_curr;
reduction.t_prev_curr = t_prev_curr;

reduction.vmap_curr = vmap_curr;
reduction.nmap_curr = nmap_curr;

reduction.intr = intr;

reduction.vmap_prev = vmap_prev;
reduction.nmap_prev = nmap_prev;

reduction.dist_thresh = dist_thresh;
reduction.angle_thresh = angle_thresh;

reduction.cols = cols;
reduction.rows = rows;

reduction.N = cols * rows;
reduction.out = sum;

estimateKernel<<<reduceBlocks, reduceThreads>>>(reduction);

reduceSum<29><<<1, MAX_THREADS>>>(sum, out, reduceBlocks);

cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());

float host_data[29];
out.download((Eigen::Matrix<float, 29, 1>*)&host_data[0]);

int shift = 0;
for (int i = 0; i < 6; ++i) // rows
{
for (int j = i; j < 7; ++j) // cols + b
{
float value = host_data[shift++];
if (j == 6) // vector b
vectorB_host[i] = value;
else
matrixA_host[j * 6 + i] = matrixA_host[i * 6 + j] = value;
}
}

residual_inliers[0] = host_data[27];
residual_inliers[1] = host_data[28];
}
61 changes: 61 additions & 0 deletions Core/Cuda/estimate.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/*
* Software License Agreement (BSD License)
*
* Point Cloud Library (PCL) - www.pointclouds.org
* Copyright (c) 2011, Willow Garage, Inc.
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of Willow Garage, Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/

#pragma once

#include "containers/device_array.hpp"
#include "types.cuh"

#include <cuda_runtime_api.h>
#include <vector_types.h>
#include <Eigen/Eigen>

void estimateStep(
const Eigen::Matrix3f& R_prev_curr,
const Eigen::Vector3f& t_prev_curr,
const DeviceArray2D<float>& vmap_curr,
const DeviceArray2D<float>& nmap_curr,
const CameraModel& intr,
const DeviceArray2D<float>& vmap_prev,
const DeviceArray2D<float>& nmap_prev,
float dist_thresh,
float angle_thresh,
DeviceArray<Eigen::Matrix<float, 29, 1>>& sum,
DeviceArray<Eigen::Matrix<float, 29, 1>>& out,
float* matrixA_host,
float* vectorB_host,
float* residual_inliers);
3 changes: 0 additions & 3 deletions Core/Cuda/reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,6 @@
#include "cudafuncs.cuh"
#include "operators.cuh"

constexpr int reduceThreads = 256;
constexpr int reduceBlocks = 64;

__inline__ __device__ JtJJtrSE3 warpReduceSum(JtJJtrSE3 val) {
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val.aa += __shfl_down_sync(0xFFFFFFFF, val.aa, offset);
Expand Down
5 changes: 5 additions & 0 deletions Core/Cuda/types.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@
#include <Eigen/Core>
#endif

#define MAX_THREADS 1024

constexpr int reduceThreads = 256;
constexpr int reduceBlocks = 64;

struct mat33 {
mat33() {}

Expand Down
Loading

0 comments on commit a58da27

Please sign in to comment.