Skip to content

Commit 1970aad

Browse files
committed
add binary version of 'simdFor'; add example
1 parent 08778ed commit 1970aad

File tree

2 files changed

+89
-0
lines changed

2 files changed

+89
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Shows how 'simdMapReduce()' could be used to compute
2+
// the dot product of two vectors.
3+
4+
// [[Rcpp::depends(RcppNT2)]]
5+
#include <RcppNT2.h>
6+
using namespace RcppNT2;
7+
8+
#include <Rcpp.h>
9+
using namespace Rcpp;
10+
11+
struct DotProduct
12+
{
13+
public:
14+
DotProduct()
15+
: packed_(0.0), result_(0.0)
16+
{}
17+
18+
void operator()(double lhs, double rhs)
19+
{
20+
result_ += lhs * rhs;
21+
}
22+
23+
void operator()(const boost::simd::pack<double>& lhs,
24+
const boost::simd::pack<double>& rhs)
25+
{
26+
packed_ += lhs * rhs;
27+
}
28+
29+
operator double() const {
30+
return result_ + sum(packed_);
31+
}
32+
33+
private:
34+
boost::simd::pack<double> packed_;
35+
double result_;
36+
};
37+
38+
// [[Rcpp::export]]
39+
double simdDot(NumericVector lhs, NumericVector rhs)
40+
{
41+
return simdFor(lhs.begin(), lhs.end(), rhs.begin(), DotProduct());
42+
}
43+
44+
/*** R
45+
set.seed(123)
46+
n <- 1024 * 1000
47+
lhs <- rnorm(n)
48+
rhs <- rnorm(n)
49+
stopifnot(all.equal(sum(lhs * rhs), simdDot(lhs, rhs)))
50+
51+
library(microbenchmark)
52+
microbenchmark(
53+
simd = simdDot(lhs, rhs),
54+
R = sum(lhs * rhs)
55+
)
56+
*/

inst/include/RcppNT2/algorithm.h

+33
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,39 @@ F simdFor(const T* it, const T* end, F&& f)
4343
return f;
4444
}
4545

46+
template <typename T1, typename T2, typename F>
47+
F simdFor(const T1* begin1, const T1* end1, const T2* begin2, F&& f)
48+
{
49+
typedef boost::simd::pack<T1> vT1;
50+
typedef boost::simd::pack<T2> vT2;
51+
52+
static_assert(
53+
vT1::static_size == vT2::static_size,
54+
"T1 and T2 are not of the same size"
55+
);
56+
57+
// Attempt to align on 'begin1', and use aligned loads
58+
// when feasible.
59+
static const std::size_t N = vT1::static_size;
60+
const T1* aligned_begin = std::min(boost::simd::align_on(begin1, N * sizeof(T1)), end1);
61+
const T1* aligned_end = aligned_begin + (end1 - aligned_begin) / N * N;
62+
63+
// Initial un-aligned region
64+
for (; begin1 != aligned_begin; ++begin1, ++begin2)
65+
f(*begin1, *begin2);
66+
67+
// TODO: Somehow, profiling here suggested that using 'load'
68+
// rather than 'aligned_load' was actually faster (!?)
69+
for (; begin1 != aligned_end; begin1 += N, begin2 += N)
70+
f(boost::simd::load<vT1>(begin1), boost::simd::load<vT2>(begin2));
71+
72+
// Final un-aligned region.
73+
for (; begin1 != end1; ++begin1, ++begin2)
74+
f(*begin1, *begin2);
75+
76+
return f;
77+
}
78+
4679
template <typename T, typename U, typename F>
4780
U simdReduce(const T* begin, const T* end, U init, F&& f)
4881
{

0 commit comments

Comments
 (0)