swap to sophus, upgrade precision to double

mp3guy · May 1, 2019 · a58da27 · a58da27
1 parent 89c9492
commit a58da27
Show file tree

Hide file tree

Showing 29 changed files with 925 additions and 494 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "third-party/Eigen"]
 	path = third-party/Eigen
 	url = https://github.com/eigenteam/eigen-git-mirror.git
+[submodule "third-party/Sophus"]
+	path = third-party/Sophus
+	url = https://github.com/strasdat/Sophus.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/CMakeMod
 set(Pangolin_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/Pangolin/build/src" CACHE PATH "Pangolin build directory")
 set(OPENNI2_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/OpenNI2/Include" CACHE PATH "OpenNI2 Include directory")
 set(OPENNI2_LIBRARY "${CMAKE_CURRENT_SOURCE_DIR}/third-party/OpenNI2/Bin/x64-Release/libOpenNI2.so" CACHE PATH "OpenNI2 library")
+set(Sophus_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third-party/Sophus" CACHE PATH "Sophus Include directory")
 set(efusion_SHADER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/Core/Shaders" CACHE PATH "Where the shaders live")
 
 find_package(LAPACK REQUIRED)
@@ -25,6 +26,7 @@ include_directories(${Pangolin_INCLUDE_DIRS})
 include_directories(${CUDA_INCLUDE_DIRS})
 include_directories(${OPENNI2_INCLUDE_DIR})
 include_directories(${SUITESPARSE_INCLUDE_DIRS})
+include_directories(${Sophus_INCLUDE_DIR})
 
 if(WITH_REALSENSE)
   include_directories(${REALSENSE_INCLUDE_DIR})
@@ -36,6 +38,8 @@ file(GLOB srcs *.cpp *.h *.cu *.cuh)
 file(GLOB tools_srcs Tools/*.cpp Tools/*.h Tools/*.cu Tools/*.cuh)
 
 add_definitions(-Dlinux=1)
+add_definitions(-DEIGEN_MAX_ALIGN_BYTES=0)
+add_definitions(-DEIGEN_MAX_STATIC_ALIGN_BYTES=0)
 
 set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-O3 -msse2 -msse3 -Wall -std=c++11 -DSHADER_DIR=${efusion_SHADER_DIR}")
 #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -std=c++11 -DSHADER_DIR=${efusion_SHADER_DIR}")

diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
@@ -25,7 +25,7 @@ endforeach(NVCC_ARCH)
 include("${CMAKE_MODULE_PATH}/CudaComputeTargetFlags.cmake")
 APPEND_TARGET_ARCH_FLAGS()
 
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}  "-Xcompiler;-fPIC;")           
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}  "-Xcompiler;-fPIC;--expt-relaxed-constexpr")
 
 CUDA_COMPILE(cuda_objs ${cuda})
 

diff --git a/Core/Cuda/cudafuncs.cuh b/Core/Cuda/cudafuncs.cuh
@@ -53,12 +53,6 @@
 #ifndef CUDA_CUDAFUNCS_CUH_
 #define CUDA_CUDAFUNCS_CUH_
 
-#if __CUDA_ARCH__ < 300
-#define MAX_THREADS 512
-#else
-#define MAX_THREADS 1024
-#endif
-
 #include <cuda_runtime.h>
 #include <stdint.h>
 #include "containers/device_array.hpp"

diff --git a/Core/Cuda/estimate.cu b/Core/Cuda/estimate.cu
@@ -0,0 +1,247 @@
+#include "convenience.cuh"
+#include "estimate.cuh"
+
+template <int D>
+__inline__ __device__ void warpReduceSum(Eigen::Matrix<float, D, 1>& val) {
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#pragma unroll
+    for (int i = 0; i < D; i++) {
+      val[i] += __shfl_down_sync(0xFFFFFFFF, val[i], offset);
+    }
+  }
+}
+
+template <int D>
+__inline__ __device__ void blockReduceSum(Eigen::Matrix<float, D, 1>& val) {
+  // Allocate shared memory in two steps otherwise NVCC complains about Eigen's
+  // non-empty constructor
+  static __shared__ unsigned char sharedMem[32 * sizeof(Eigen::Matrix<float, D, 1>)];
+
+  Eigen::Matrix<float, D, 1>(&shared)[32] =
+      reinterpret_cast<Eigen::Matrix<float, D, 1>(&)[32]>(sharedMem);
+
+  int lane = threadIdx.x % warpSize;
+
+  int wid = threadIdx.x / warpSize;
+
+  warpReduceSum(val);
+
+  // write reduced value to shared memory
+  if (lane == 0) {
+    shared[wid] = val;
+  }
+  __syncthreads();
+
+  // ensure we only grab a value from shared memory if that warp existed
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Eigen::Matrix<float, D, 1>::Zero();
+
+  if (wid == 0) {
+    warpReduceSum(val);
+  }
+}
+
+template <int D>
+__global__ void reduceSum(Eigen::Matrix<float, D, 1>* in, Eigen::Matrix<float, D, 1>* out, int N) {
+  Eigen::Matrix<float, D, 1> sum = Eigen::Matrix<float, D, 1>::Zero();
+
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+    sum += in[i];
+  }
+
+  blockReduceSum(sum);
+
+  if (threadIdx.x == 0) {
+    out[blockIdx.x] = sum;
+  }
+}
+
+struct Reduction {
+  Eigen::Matrix3f R_prev_curr;
+  Eigen::Vector3f t_prev_curr;
+
+  CameraModel intr;
+
+  PtrStep<float> vmap_curr;
+  PtrStep<float> nmap_curr;
+
+  PtrStep<float> vmap_prev;
+  PtrStep<float> nmap_prev;
+
+  float dist_thresh;
+  float angle_thresh;
+
+  int cols;
+  int rows;
+  int N;
+
+  Eigen::Matrix<float, 29, 1>* out;
+
+  // And now for some template metaprogramming magic
+  template <int outer, int inner, int end>
+  struct SquareUpperTriangularProduct {
+    __device__ __forceinline__ static void apply(
+        Eigen::Matrix<float, 29, 1>& values,
+        const float (&rows)[end + 1]) {
+      values[((end + 1) * outer) + inner - (outer * (outer + 1) / 2)] = rows[outer] * rows[inner];
+
+      SquareUpperTriangularProduct<outer, inner + 1, end>::apply(values, rows);
+    }
+  };
+
+  // Inner loop base
+  template <int outer, int end>
+  struct SquareUpperTriangularProduct<outer, end, end> {
+    __device__ __forceinline__ static void apply(
+        Eigen::Matrix<float, 29, 1>& values,
+        const float (&rows)[end + 1]) {
+      values[((end + 1) * outer) + end - (outer * (outer + 1) / 2)] = rows[outer] * rows[end];
+
+      SquareUpperTriangularProduct<outer + 1, outer + 1, end>::apply(values, rows);
+    }
+  };
+
+  // Outer loop base
+  template <int end>
+  struct SquareUpperTriangularProduct<end, end, end> {
+    __device__ __forceinline__ static void apply(
+        Eigen::Matrix<float, 29, 1>& values,
+        const float (&rows)[end + 1]) {
+      values[((end + 1) * end) + end - (end * (end + 1) / 2)] = rows[end] * rows[end];
+    }
+  };
+
+  __device__ __forceinline__ void operator()() const {
+    Eigen::Matrix<float, 29, 1> sum = Eigen::Matrix<float, 29, 1>::Zero();
+
+    SquareUpperTriangularProduct<0, 0, 6> sutp;
+
+    Eigen::Matrix<float, 29, 1> values;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+      const int y = i / cols;
+      const int x = i - (y * cols);
+
+      const Eigen::Vector3f v_curr(
+          vmap_curr.ptr(y)[x], vmap_curr.ptr(y + rows)[x], vmap_curr.ptr(y + 2 * rows)[x]);
+
+      const Eigen::Vector3f v_curr_in_prev = R_prev_curr * v_curr + t_prev_curr;
+
+      const Eigen::Matrix<int, 2, 1> p_curr_in_prev(
+          __float2int_rn(v_curr_in_prev(0) * intr.fx / v_curr_in_prev(2) + intr.cx),
+          __float2int_rn(v_curr_in_prev(1) * intr.fy / v_curr_in_prev(2) + intr.cy));
+
+      float row[7] = {0, 0, 0, 0, 0, 0, 0};
+
+      values[28] = 0;
+
+      if (p_curr_in_prev(0) >= 0 && p_curr_in_prev(1) >= 0 && p_curr_in_prev(0) < cols &&
+          p_curr_in_prev(1) < rows && v_curr(2) > 0 && v_curr_in_prev(2) > 0) {
+        const Eigen::Vector3f v_prev(
+            vmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
+            vmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
+            vmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);
+
+        const Eigen::Vector3f n_curr(
+            nmap_curr.ptr(y)[x], nmap_curr.ptr(y + rows)[x], nmap_curr.ptr(y + 2 * rows)[x]);
+
+        const Eigen::Vector3f n_curr_in_prev = R_prev_curr * n_curr;
+
+        const Eigen::Vector3f n_prev(
+            nmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
+            nmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
+            nmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);
+
+        if (n_curr_in_prev.cross(n_prev).norm() < angle_thresh &&
+            (v_prev - v_curr_in_prev).norm() < dist_thresh && !isnan(n_curr(0)) &&
+            !isnan(n_prev(0))) {
+          *(Eigen::Vector3f*)&row[0] = n_prev;
+          *(Eigen::Vector3f*)&row[3] = v_curr_in_prev.cross(n_prev);
+          row[6] = n_prev.dot(v_prev - v_curr_in_prev);
+
+          values[28] = 1;
+
+          sutp.apply(values, row);
+
+          sum += values;
+        }
+      }
+    }
+
+    blockReduceSum(sum);
+
+    if (threadIdx.x == 0) {
+      out[blockIdx.x] = sum;
+    }
+  }
+};
+
+__global__ void estimateKernel(const Reduction reduction) {
+  reduction();
+}
+
+void estimateStep(
+    const Eigen::Matrix3f& R_prev_curr,
+    const Eigen::Vector3f& t_prev_curr,
+    const DeviceArray2D<float>& vmap_curr,
+    const DeviceArray2D<float>& nmap_curr,
+    const CameraModel& intr,
+    const DeviceArray2D<float>& vmap_prev,
+    const DeviceArray2D<float>& nmap_prev,
+    float dist_thresh,
+    float angle_thresh,
+    DeviceArray<Eigen::Matrix<float, 29, 1>>& sum,
+    DeviceArray<Eigen::Matrix<float, 29, 1>>& out,
+    float* matrixA_host,
+    float* vectorB_host,
+    float* residual_inliers) {
+  int cols = vmap_curr.cols();
+  int rows = vmap_curr.rows() / 3;
+
+  Reduction reduction;
+
+  reduction.R_prev_curr = R_prev_curr;
+  reduction.t_prev_curr = t_prev_curr;
+
+  reduction.vmap_curr = vmap_curr;
+  reduction.nmap_curr = nmap_curr;
+
+  reduction.intr = intr;
+
+  reduction.vmap_prev = vmap_prev;
+  reduction.nmap_prev = nmap_prev;
+
+  reduction.dist_thresh = dist_thresh;
+  reduction.angle_thresh = angle_thresh;
+
+  reduction.cols = cols;
+  reduction.rows = rows;
+
+  reduction.N = cols * rows;
+  reduction.out = sum;
+
+  estimateKernel<<<reduceBlocks, reduceThreads>>>(reduction);
+
+  reduceSum<29><<<1, MAX_THREADS>>>(sum, out, reduceBlocks);
+
+  cudaSafeCall(cudaGetLastError());
+  cudaSafeCall(cudaDeviceSynchronize());
+
+  float host_data[29];
+  out.download((Eigen::Matrix<float, 29, 1>*)&host_data[0]);
+
+  int shift = 0;
+  for (int i = 0; i < 6; ++i) // rows
+  {
+    for (int j = i; j < 7; ++j) // cols + b
+    {
+      float value = host_data[shift++];
+      if (j == 6) // vector b
+        vectorB_host[i] = value;
+      else
+        matrixA_host[j * 6 + i] = matrixA_host[i * 6 + j] = value;
+    }
+  }
+
+  residual_inliers[0] = host_data[27];
+  residual_inliers[1] = host_data[28];
+}
diff --git a/Core/Cuda/estimate.cuh b/Core/Cuda/estimate.cuh
@@ -0,0 +1,61 @@
+/*
+ * Software License Agreement (BSD License)
+ *
+ *  Point Cloud Library (PCL) - www.pointclouds.org
+ *  Copyright (c) 2011, Willow Garage, Inc.
+ *
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *   * Neither the name of Willow Garage, Inc. nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#pragma once
+
+#include "containers/device_array.hpp"
+#include "types.cuh"
+
+#include <cuda_runtime_api.h>
+#include <vector_types.h>
+#include <Eigen/Eigen>
+
+void estimateStep(
+    const Eigen::Matrix3f& R_prev_curr,
+    const Eigen::Vector3f& t_prev_curr,
+    const DeviceArray2D<float>& vmap_curr,
+    const DeviceArray2D<float>& nmap_curr,
+    const CameraModel& intr,
+    const DeviceArray2D<float>& vmap_prev,
+    const DeviceArray2D<float>& nmap_prev,
+    float dist_thresh,
+    float angle_thresh,
+    DeviceArray<Eigen::Matrix<float, 29, 1>>& sum,
+    DeviceArray<Eigen::Matrix<float, 29, 1>>& out,
+    float* matrixA_host,
+    float* vectorB_host,
+    float* residual_inliers);
diff --git a/Core/Cuda/reduce.cu b/Core/Cuda/reduce.cu
@@ -54,9 +54,6 @@
 #include "cudafuncs.cuh"
 #include "operators.cuh"
 
-constexpr int reduceThreads = 256;
-constexpr int reduceBlocks = 64;
-
 __inline__ __device__ JtJJtrSE3 warpReduceSum(JtJJtrSE3 val) {
   for (int offset = warpSize / 2; offset > 0; offset /= 2) {
     val.aa += __shfl_down_sync(0xFFFFFFFF, val.aa, offset);

diff --git a/Core/Cuda/types.cuh b/Core/Cuda/types.cuh
@@ -59,6 +59,11 @@
 #include <Eigen/Core>
 #endif
 
+#define MAX_THREADS 1024
+
+constexpr int reduceThreads = 256;
+constexpr int reduceBlocks = 64;
+
 struct mat33 {
   mat33() {}