add binary version of 'simdFor'; add example

kevinushey · kevinushey · commit 1970aad65c8f · 2016-01-27T13:04:22.000-08:00
diff --git a/inst/examples/boost-simd/boost-simd-dot-v2.cpp b/inst/examples/boost-simd/boost-simd-dot-v2.cpp
@@ -0,0 +1,56 @@
+// Shows how the binary 'simdFor()' could be used to compute
+// the dot product of two vectors.
+
+// [[Rcpp::depends(RcppNT2)]]
+#include <RcppNT2.h>
+using namespace RcppNT2;
+
+#include <Rcpp.h>
+using namespace Rcpp;
+
+// Accumulator functor for a dot product over two sequences of doubles.
+//
+// Scalar pairs are multiplied and folded into a scalar running total;
+// pack pairs are multiplied and folded into a pack-valued running total.
+// Converting the functor to 'double' yields the scalar total plus the
+// 'sum()' of the pack total.
+struct DotProduct
+{
+private:
+  typedef boost::simd::pack<double> pack_type;
+
+  // Declaration order is kept (pack first, scalar second) so that member
+  // initialization happens in the same order as written in the constructor.
+  pack_type vectorTotal_;
+  double scalarTotal_;
+
+public:
+  // Both running totals start at zero.
+  DotProduct()
+    : vectorTotal_(0.0), scalarTotal_(0.0)
+  {}
+
+  // Scalar step: add the product of a single pair of elements.
+  void operator()(double lhs, double rhs)
+  {
+    scalarTotal_ += lhs * rhs;
+  }
+
+  // Pack step: add the products of a whole pack of element pairs.
+  void operator()(const pack_type& lhs, const pack_type& rhs)
+  {
+    vectorTotal_ += lhs * rhs;
+  }
+
+  // Final result. Deliberately implicit so that the functor can be
+  // returned directly from a function whose return type is 'double'.
+  operator double() const {
+    return scalarTotal_ + sum(vectorTotal_);
+  }
+};
+
+// Computes the dot product of 'lhs' and 'rhs' by running a 'DotProduct'
+// functor over both vectors with the binary 'simdFor()'.
+//
+// 'simdFor()' reads one element of 'rhs' for every element of 'lhs', so
+// 'rhs' must be at least as long as 'lhs'. A shorter 'rhs' raises an R
+// error instead of reading past the end of its buffer. Elements of 'rhs'
+// beyond the length of 'lhs' are ignored.
+// [[Rcpp::export]]
+double simdDot(NumericVector lhs, NumericVector rhs)
+{
+  if (rhs.size() < lhs.size())
+    Rcpp::stop("'rhs' must be at least as long as 'lhs'");
+
+  return simdFor(lhs.begin(), lhs.end(), rhs.begin(), DotProduct());
+}
+
+/*** R
+# Check 'simdDot()' against the plain R computation on seeded random input.
+set.seed(123)
+n <- 1024 * 1000
+lhs <- rnorm(n)
+rhs <- rnorm(n)
+stopifnot(all.equal(sum(lhs * rhs), simdDot(lhs, rhs)))
+
+# Time the SIMD version against the plain R expression.
+library(microbenchmark)
+microbenchmark(
+  simd = simdDot(lhs, rhs),
+  R    = sum(lhs * rhs)
+)
+*/
diff --git a/inst/include/RcppNT2/algorithm.h b/inst/include/RcppNT2/algorithm.h
@@ -43,6 +43,39 @@ F simdFor(const T* it, const T* end, F&& f)
   return f;
 }
 
+// Binary form of 'simdFor()': walks [begin1, end1) and the range starting
+// at 'begin2' in lockstep, calling 'f' on corresponding elements.
+//
+// The first range is split into three parts based on the alignment of
+// 'begin1' only:
+//   1. a scalar head, up to the first address aligned on a whole pack
+//      (or 'end1', whichever comes first): f(T1, T2) per element;
+//   2. a body made of whole packs: f(pack<T1>, pack<T2>) per pack;
+//   3. a scalar tail with the remaining elements: f(T1, T2) per element.
+//
+// 'begin2' is dereferenced once for every element of the first range, so
+// the second range must hold at least (end1 - begin1) elements.
+//
+// Returns 'f' after all calls have been made.
+template <typename T1, typename T2, typename F>
+F simdFor(const T1* begin1, const T1* end1, const T2* begin2, F&& f)
+{
+  using pack1_t = boost::simd::pack<T1>;
+  using pack2_t = boost::simd::pack<T2>;
+
+  // Both packs must hold the same number of elements, since the two
+  // pointers are advanced by the same stride in the body loop.
+  static_assert(
+    pack1_t::static_size == pack2_t::static_size,
+    "T1 and T2 are not of the same size"
+  );
+
+  static const std::size_t width = pack1_t::static_size;
+
+  // End of the scalar head (clamped to 'end1' for short inputs), and end
+  // of the pack body: the head end plus the largest multiple of 'width'
+  // that still fits before 'end1'.
+  const T1* head_end = std::min(boost::simd::align_on(begin1, width * sizeof(T1)), end1);
+  const T1* body_end = head_end + (end1 - head_end) / width * width;
+
+  // Scalar head.
+  while (begin1 != head_end)
+  {
+    f(*begin1, *begin2);
+    ++begin1;
+    ++begin2;
+  }
+
+  // Pack body. Unaligned 'load' is used on both inputs.
+  // TODO: Somehow, profiling here suggested that using 'load'
+  // rather than 'aligned_load' was actually faster (!?)
+  while (begin1 != body_end)
+  {
+    f(boost::simd::load<pack1_t>(begin1), boost::simd::load<pack2_t>(begin2));
+    begin1 += width;
+    begin2 += width;
+  }
+
+  // Scalar tail.
+  while (begin1 != end1)
+  {
+    f(*begin1, *begin2);
+    ++begin1;
+    ++begin2;
+  }
+
+  return f;
+}
+
 template <typename T, typename U, typename F>
 U simdReduce(const T* begin, const T* end, U init, F&& f)
 {