From 474393be8d6be7418699e0a9ef8e00fe2fd9cd75 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Wed, 9 Aug 2023 22:22:06 -0400
Subject: [PATCH] nfa,dfa: call shrink_to_vec in places

This commit reduces the *actual* heap memory used by noncontiguous
NFAs, contiguous NFAs and DFAs. Previously, all three would report
memory usage that only included what was actually being used within
each Vec. But each Vec might actually have a lot more heap memory
inside of it in the form of unused capacity. Potentially ~twice as
much. This is because a Vec amortizes its growth.

So when Vecs are used to build NFAs and DFAs, some of them wind up
with potentially a lot of additional capacity that goes unused and
unreported. This commit partially fixes that by calling shrink_to_vec
in a few strategic places. However, there are still some nested Vecs,
notably in the noncontiguous NFA, that remain unshrinked. I tried doing
it, but the perf impact it had on construction was non-trivial.

This is a good example of one of those things where you of course know
that Vecs amortize their growth by reserving potentially extra capacity
that yuo won't end up using, but still remain ignorant of the broader
effects of this strategy in aggregate. Especially at the extremes when
you start trying to build Aho-Corasick automatons with millions of
patterns.

We should think more carefully about using linked lists in the
non-contiguous NFA to reduce memory usage, or perhaps thinking of some
other way to avoid nested Vecs. It turns out that they end up
penalizing us quite a bit.

This commit was inspired by the investigation I did as part of this
issue: https://github.com/G-Research/ahocorasick_rs/issues/62
---
 src/dfa.rs               | 12 ++++++++++++
 src/nfa/contiguous.rs    |  5 +++++
 src/nfa/noncontiguous.rs |  3 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/src/dfa.rs b/src/dfa.rs
index dd79079..c58d149 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -518,6 +518,18 @@ impl Builder {
             dfa.byte_classes.alphabet_len(),
             dfa.byte_classes.stride(),
         );
+        // The vectors can grow ~twice as big during construction because a
+        // Vec amortizes growth. But here, let's shrink things back down to
+        // what we actually need since we're never going to add more to it.
+        dfa.trans.shrink_to_fit();
+        dfa.pattern_lens.shrink_to_fit();
+        dfa.matches.shrink_to_fit();
+        // TODO: We might also want to shrink each Vec inside of `dfa.matches`,
+        // or even better, convert it to one contiguous allocation. But I think
+        // I went with nested allocs for good reason (can't remember), so this
+        // may be tricky to do. I decided not to shrink them here because it
+        // might require a fair bit of work to do. It's unclear whether it's
+        // worth it.
         Ok(dfa)
     }
 
diff --git a/src/nfa/contiguous.rs b/src/nfa/contiguous.rs
index 5d0c82d..6ced6a9 100644
--- a/src/nfa/contiguous.rs
+++ b/src/nfa/contiguous.rs
@@ -992,6 +992,11 @@ impl Builder {
             nfa.memory_usage(),
             nfa.byte_classes.alphabet_len(),
         );
+        // The vectors can grow ~twice as big during construction because a
+        // Vec amortizes growth. But here, let's shrink things back down to
+        // what we actually need since we're never going to add more to it.
+        nfa.repr.shrink_to_fit();
+        nfa.pattern_lens.shrink_to_fit();
         Ok(nfa)
     }
 
diff --git a/src/nfa/noncontiguous.rs b/src/nfa/noncontiguous.rs
index 790a434..49e6eac 100644
--- a/src/nfa/noncontiguous.rs
+++ b/src/nfa/noncontiguous.rs
@@ -1219,7 +1219,8 @@ impl<'a> Compiler<'a> {
     fn calculate_memory_usage(&mut self) {
         use core::mem::size_of;
 
-        for state in self.nfa.states.iter() {
+        self.nfa.states.shrink_to_fit();
+        for state in self.nfa.states.iter_mut() {
             self.nfa.memory_usage += size_of::<State>() + state.memory_usage();
         }
     }