Skip to content

Commit

Permalink
fix(deflate): help the compiler evade two bounds checks to improve co…
Browse files Browse the repository at this point in the history
…mpression performance a little
  • Loading branch information
oyvindln committed Feb 25, 2025
1 parent 743ae50 commit 633e59f
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 6 deletions.
2 changes: 1 addition & 1 deletion miniz_oxide/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ compiler_builtins = { version = '0.1.2', optional = true }
[dev-dependencies]
## Messes with minimum rust version and drags in deps just for running tests
## so just comment out for now and enable manually when needed for enabling benches
#criterion = "0.5"
criterion = "0.5"

[[bench]]
name = "benchmark"
Expand Down
24 changes: 20 additions & 4 deletions miniz_oxide/src/deflate/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1254,7 +1254,20 @@ impl DictOxide {
let next_probe_pos = self.b.next[probe_pos] as usize;

dist = (lookahead_pos - next_probe_pos) & 0xFFFF;
if next_probe_pos == 0 || dist > max_dist {
// Optimization: The last condition should never be hit but helps the compiler by avoiding
// doing the bounds check in the read_u16_le call and adding the extra instructions
// for branching to a panic after that and instead just adds the extra instruction here
// instead saving some instructions and thus improving performance a bit.
// May want to investigate whether we can avoid it entirely but as of now the compiler
// isn't able to deduce that match_len is bounded to [1-257]
// Disable clippy lint as it needs to be written in this specific way
// rather than MAX_MATCH_LEN to work
// because the compiler isn't super smart....
#[allow(clippy::int_plus_one)]
if next_probe_pos == 0
|| dist > max_dist
|| match_len as usize - 1 >= MAX_MATCH_LEN
{
// We reached the end of the hash chain, or the next value is further away
// than the maximum allowed distance, so return the best match we found, if
// any.
Expand All @@ -1265,7 +1278,6 @@ impl DictOxide {
// position to match against.
probe_pos = next_probe_pos & LZ_DICT_SIZE_MASK;

// TODO: This bounds check does not get optimized out
if read_u16_le(&self.b.dict, probe_pos + match_len as usize - 1) == c01 {
break 'found;
}
Expand Down Expand Up @@ -1305,14 +1317,18 @@ impl DictOxide {
if probe_len > match_len as usize {
match_dist = dist as u32;
match_len = cmp::min(max_match_len, probe_len as u32);
if match_len == max_match_len {
if match_len >= max_match_len {
// We found a match that had the maximum allowed length,
// so there is now point searching further.
return (match_dist, match_len);
}
// We found a better match, so save the last two bytes for further match
// comparisons.
c01 = read_u16_le(&self.b.dict, pos + match_len as usize - 1);
// Optimization: use saturating_sub makes the compiler able to evade the bounds check
// at the cost of some extra instructions since it avoids any possibility of wraparound.
// need to see if we can find a better way to do this since this is still a bit costly.
c01 =
read_u16_le(&self.b.dict, (pos + match_len as usize).saturating_sub(1));
}
continue 'outer;
}
Expand Down
2 changes: 1 addition & 1 deletion miniz_oxide/src/inflate/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ fn init_tree(r: &mut DecompressorOxide, l: &mut LocalVars) -> Option<Action> {

let mut total_symbols = [0u16; 16];
let mut next_code = [0u32; 17];
const INVALID_CODE: i16 = 1 << 9 | 286;
const INVALID_CODE: i16 = (1 << 9) | 286;
// Set the values in the fast table to return a
// non-zero length and an invalid symbol instead of zero
// so that we do not have to have a check for a zero
Expand Down

0 comments on commit 633e59f

Please sign in to comment.