Skip to content

Commit 4e23fae

Browse files
committed
optimize branches switching
1 parent 1feefd3 commit 4e23fae

File tree

4 files changed

+119
-113
lines changed

README.md

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,28 @@
22

33
support u8, i8, u16, i16, u32, i32, u64, i64
44

5-
### waiting for Benchmarks
5+
### waiting for Benchmarks
6+
7+
| size | std | sse | avx2 |
8+
|--------------|-----|-----|------|
9+
| 8bit * 4 | | | |
10+
| 8bit * 16 | | | |
11+
| 8bit * 128 | | | |
12+
| 16bit * 4 | | | |
13+
| 16bit * 16 | | | |
14+
| 16bit * 128 | | | |
15+
| 16bit * 512 | | | |
16+
| 16bit * 2048 | | | |
17+
| 16bit * 8192 | | | |
18+
| 32bit * 4 | | | |
19+
| 32bit * 16 | | | |
20+
| 32bit * 128 | | | |
21+
| 32bit * 512 | | | |
22+
| 32bit * 2048 | | | |
23+
| 32bit * 8192 | | | |
24+
| 64bit * 4 | | | |
25+
| 64bit * 16 | | | |
26+
| 64bit * 128 | | | |
27+
| 64bit * 512 | | | |
28+
| 64bit * 2048 | | | |
29+
| 64bit * 8192 | | | |

benches/bench_bst.rs

Lines changed: 87 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
#![allow(non_upper_case_globals)]
2+
13
#[macro_use]
24
extern crate criterion;
35
#[macro_use]
46
extern crate lazy_static;
57

6-
use criterion::{black_box, Bencher, BenchmarkId, Criterion, Throughput};
8+
use criterion::{black_box, Bencher, BenchmarkId, Criterion};
79

810
use bst_rs::*;
911

@@ -54,50 +56,17 @@ fn gen_u64s(size: usize) -> Vec<u64> {
5456
(0..size as u64).into_iter().collect::<Vec<_>>()
5557
}
5658

57-
fn do_8bit_bench(b: &mut Bencher, nums: &[u8]) {
58-
let mut r = 0usize;
59-
b.iter(|| {
60-
r = r.wrapping_mul(1664525).wrapping_add(1013904223);
61-
let i = r % nums.len();
62-
black_box(binary_search_auto(&nums, i as u8).is_some())
63-
});
64-
}
65-
66-
fn do_16_bench(b: &mut Bencher, nums: &[u16]) {
67-
let mut r = 0usize;
68-
b.iter(|| {
69-
r = r.wrapping_mul(1664525).wrapping_add(1013904223);
70-
let i = r % nums.len();
71-
black_box(binary_search_auto(&nums, i as u16).is_some())
72-
});
73-
}
74-
75-
fn do_32_bench(b: &mut Bencher, nums: &[u32]) {
76-
let mut r = 0usize;
59+
fn do_simd_bench<T: SIMDField>(b: &mut Bencher, nums: &[T]) {
60+
let last = nums.last().unwrap();
61+
let last = *last;
7762
b.iter(|| {
78-
r = r.wrapping_mul(1664525).wrapping_add(1013904223);
79-
let i = r % nums.len();
80-
black_box(binary_search_auto(&nums, i as u32).is_some())
81-
});
82-
}
83-
84-
fn do_64_bench(b: &mut Bencher, nums: &[u64]) {
85-
let mut r = 0usize;
86-
b.iter(|| {
87-
r = r.wrapping_mul(1664525).wrapping_add(1013904223);
88-
let i = r % nums.len();
89-
black_box(binary_search_auto(&nums, i as u64).is_some())
63+
black_box(binary_search_auto(&nums, last).is_some());
9064
});
9165
}
9266

9367
fn do_std_bench<T: num::Integer + num::FromPrimitive>(b: &mut Bencher, nums: &[T]) {
94-
let mut r = 0usize;
95-
b.iter(|| {
96-
r = r.wrapping_mul(1664525).wrapping_add(1013904223);
97-
let i = r % nums.len();
98-
let i = T::from_usize(i).unwrap();
99-
black_box(nums.binary_search(&i).is_ok())
100-
});
68+
let last = nums.last().unwrap();
69+
b.iter(|| black_box(nums.binary_search(last).is_ok()));
10170
}
10271

10372
fn optimize_bst_bench(c: &mut Criterion, label: &str) {
@@ -108,110 +77,110 @@ fn optimize_bst_bench(c: &mut Criterion, label: &str) {
10877
group.bench_with_input(
10978
BenchmarkId::new("optimize_on_8bit", 4),
11079
&**U8x4,
111-
do_8bit_bench,
80+
do_simd_bench,
11281
);
11382
group.bench_with_input(
11483
BenchmarkId::new("optimize_on_8bit", 16),
115-
&U8x16,
116-
do_8bit_bench,
84+
&**U8x16,
85+
do_simd_bench,
11786
);
11887
group.bench_with_input(
11988
BenchmarkId::new("optimize_on_8bit", 128),
120-
&U8x128,
121-
do_8bit_bench,
89+
&**U8x128,
90+
do_simd_bench,
12291
);
12392
//
12493
group.bench_with_input(
12594
BenchmarkId::new("optimize_on_16bit", 4),
126-
&U16x4,
127-
do_16_bench,
95+
&**U16x4,
96+
do_simd_bench,
12897
);
12998
group.bench_with_input(
13099
BenchmarkId::new("optimize_on_16bit", 16),
131-
&U16x16,
132-
do_16_bench,
100+
&**U16x16,
101+
do_simd_bench,
133102
);
134103
group.bench_with_input(
135104
BenchmarkId::new("optimize_on_16bit", 128),
136-
&U16x128,
137-
do_16_bench,
105+
&**U16x128,
106+
do_simd_bench,
138107
);
139108
group.bench_with_input(
140109
BenchmarkId::new("optimize_on_16bit", 512),
141-
&U16x512,
142-
do_16_bench,
110+
&**U16x512,
111+
do_simd_bench,
143112
);
144113
group.bench_with_input(
145114
BenchmarkId::new("optimize_on_16bit", 2048),
146-
&U16x2048,
147-
do_16_bench,
115+
&**U16x2048,
116+
do_simd_bench,
148117
);
149118
group.bench_with_input(
150119
BenchmarkId::new("optimize_on_16bit", 8192),
151-
&U16x8192,
152-
do_16_bench,
120+
&**U16x8192,
121+
do_simd_bench,
153122
);
154123
//
155124
group.bench_with_input(
156125
BenchmarkId::new("optimize_on_32bit", 4),
157-
&U32x4,
158-
do_32_bench,
126+
&**U32x4,
127+
do_simd_bench,
159128
);
160129
group.bench_with_input(
161130
BenchmarkId::new("optimize_on_32bit", 16),
162-
&U32x16,
163-
do_32_bench,
131+
&**U32x16,
132+
do_simd_bench,
164133
);
165134
group.bench_with_input(
166135
BenchmarkId::new("optimize_on_32bit", 128),
167-
&U32x128,
168-
do_32_bench,
136+
&**U32x128,
137+
do_simd_bench,
169138
);
170139
group.bench_with_input(
171140
BenchmarkId::new("optimize_on_32bit", 512),
172-
&U32x512,
173-
do_32_bench,
141+
&**U32x512,
142+
do_simd_bench,
174143
);
175144
group.bench_with_input(
176145
BenchmarkId::new("optimize_on_32bit", 2048),
177-
&U32x2048,
178-
do_32_bench,
146+
&**U32x2048,
147+
do_simd_bench,
179148
);
180149
group.bench_with_input(
181150
BenchmarkId::new("optimize_on_32bit", 8192),
182-
&U32x8192,
183-
do_32_bench,
151+
&**U32x8192,
152+
do_simd_bench,
184153
);
185154
//
186155
group.bench_with_input(
187156
BenchmarkId::new("optimize_on_64bit", 4),
188-
&U64x4,
189-
do_64_bench,
157+
&**U64x4,
158+
do_simd_bench,
190159
);
191160
group.bench_with_input(
192161
BenchmarkId::new("optimize_on_64bit", 16),
193-
&U64x16,
194-
do_64_bench,
162+
&**U64x16,
163+
do_simd_bench,
195164
);
196165
group.bench_with_input(
197166
BenchmarkId::new("optimize_on_64bit", 128),
198-
&U64x128,
199-
do_64_bench,
167+
&**U64x128,
168+
do_simd_bench,
200169
);
201170
group.bench_with_input(
202171
BenchmarkId::new("optimize_on_64bit", 512),
203-
&U64x512,
204-
do_64_bench,
172+
&**U64x512,
173+
do_simd_bench,
205174
);
206175
group.bench_with_input(
207176
BenchmarkId::new("optimize_on_64bit", 2048),
208-
&U64x2048,
209-
do_64_bench,
177+
&**U64x2048,
178+
do_simd_bench,
210179
);
211180
group.bench_with_input(
212181
BenchmarkId::new("optimize_on_64bit", 8192),
213-
&U64x8192,
214-
do_64_bench,
182+
&**U64x8192,
183+
do_simd_bench,
215184
);
216185
group.finish();
217186
}
@@ -221,77 +190,93 @@ fn std_bst_bench(c: &mut Criterion, label: &str) {
221190
group
222191
.warm_up_time(std::time::Duration::from_millis(500))
223192
.measurement_time(std::time::Duration::from_secs(10));
224-
group.bench_with_input(BenchmarkId::new("std_on_8bit", 4), &U8x4, do_std_bench);
225-
group.bench_with_input(BenchmarkId::new("std_on_8bit", 16), &U8x16, do_std_bench);
226-
group.bench_with_input(BenchmarkId::new("std_on_8bit", 128), &U8x128, do_std_bench);
193+
group.bench_with_input(BenchmarkId::new("std_on_8bit", 4), &**U8x4, do_std_bench);
194+
group.bench_with_input(BenchmarkId::new("std_on_8bit", 16), &**U8x16, do_std_bench);
195+
group.bench_with_input(
196+
BenchmarkId::new("std_on_8bit", 128),
197+
&**U8x128,
198+
do_std_bench,
199+
);
227200
//
228201
//
229-
group.bench_with_input(BenchmarkId::new("std_on_16bit", 4), &U16x4, do_std_bench);
230-
group.bench_with_input(BenchmarkId::new("std_on_16bit", 16), &U16x16, do_std_bench);
202+
group.bench_with_input(BenchmarkId::new("std_on_16bit", 4), &**U16x4, do_std_bench);
203+
group.bench_with_input(
204+
BenchmarkId::new("std_on_16bit", 16),
205+
&**U16x16,
206+
do_std_bench,
207+
);
231208
group.bench_with_input(
232209
BenchmarkId::new("std_on_16bit", 128),
233-
&U16x128,
210+
&**U16x128,
234211
do_std_bench,
235212
);
236213
group.bench_with_input(
237214
BenchmarkId::new("std_on_16bit", 512),
238-
&U16x512,
215+
&**U16x512,
239216
do_std_bench,
240217
);
241218
group.bench_with_input(
242219
BenchmarkId::new("std_on_16bit", 2048),
243-
&U16x2048,
220+
&**U16x2048,
244221
do_std_bench,
245222
);
246223
group.bench_with_input(
247224
BenchmarkId::new("std_on_16bit", 8192),
248-
&U16x8192,
225+
&**U16x8192,
249226
do_std_bench,
250227
);
251228
//
252-
group.bench_with_input(BenchmarkId::new("std_on_32bit", 4), &U32x4, do_std_bench);
253-
group.bench_with_input(BenchmarkId::new("std_on_32bit", 16), &U32x16, do_std_bench);
229+
group.bench_with_input(BenchmarkId::new("std_on_32bit", 4), &**U32x4, do_std_bench);
230+
group.bench_with_input(
231+
BenchmarkId::new("std_on_32bit", 16),
232+
&**U32x16,
233+
do_std_bench,
234+
);
254235
group.bench_with_input(
255236
BenchmarkId::new("std_on_32bit", 128),
256-
&U32x128,
237+
&**U32x128,
257238
do_std_bench,
258239
);
259240
group.bench_with_input(
260241
BenchmarkId::new("std_on_32bit", 512),
261-
&U32x512,
242+
&**U32x512,
262243
do_std_bench,
263244
);
264245
group.bench_with_input(
265246
BenchmarkId::new("std_on_32bit", 2048),
266-
&U32x2048,
247+
&**U32x2048,
267248
do_std_bench,
268249
);
269250
group.bench_with_input(
270251
BenchmarkId::new("std_on_32bit", 8192),
271-
&U32x8192,
252+
&**U32x8192,
272253
do_std_bench,
273254
);
274255
//
275-
group.bench_with_input(BenchmarkId::new("std_on_64bit", 4), &U64x4, do_std_bench);
276-
group.bench_with_input(BenchmarkId::new("std_on_64bit", 16), &U64x16, do_std_bench);
256+
group.bench_with_input(BenchmarkId::new("std_on_64bit", 4), &**U64x4, do_std_bench);
257+
group.bench_with_input(
258+
BenchmarkId::new("std_on_64bit", 16),
259+
&**U64x16,
260+
do_std_bench,
261+
);
277262
group.bench_with_input(
278263
BenchmarkId::new("std_on_64bit", 128),
279-
&U64x128,
264+
&**U64x128,
280265
do_std_bench,
281266
);
282267
group.bench_with_input(
283268
BenchmarkId::new("std_on_64bit", 512),
284-
&U64x512,
269+
&**U64x512,
285270
do_std_bench,
286271
);
287272
group.bench_with_input(
288273
BenchmarkId::new("std_on_64bit", 2048),
289-
&U64x2048,
274+
&**U64x2048,
290275
do_std_bench,
291276
);
292277
group.bench_with_input(
293278
BenchmarkId::new("std_on_64bit", 8192),
294-
&U64x8192,
279+
&**U64x8192,
295280
do_std_bench,
296281
);
297282
group.finish();

src/lib.rs

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::simd::SIMDField;
1+
pub use crate::simd::SIMDField;
22

33
mod simd;
44

@@ -7,10 +7,7 @@ pub fn binary_search_auto<T: SIMDField>(nums: &[T], target: T) -> Option<usize>
77
let field_size = T::size_in_bits();
88
let total_size = len as u64 * field_size as u64;
99
match total_size {
10-
total_size if total_size == field_size as u64 => {
11-
simd::linear_search_generic(nums, &target, 0)
12-
}
13-
total_size if total_size < 256 * 8 => simd::linear_search(nums, target),
10+
total_size if total_size <= 128 * 1024 => simd::linear_search(nums, target),
1411
_ => simd::binary_search(nums, target),
1512
}
1613
}

0 commit comments

Comments
 (0)