Skip to content

Commit 22a1e7e

Browse files
rsclip and alexfertel
authored
feat: add Bloom Filter data structure (#65)
Co-authored-by: Alexander Gonzalez <[email protected]>
1 parent 1ca5f59 commit 22a1e7e

File tree

3 files changed

+337
-0
lines changed

3 files changed

+337
-0
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ edition = "2018"
77

88
[dependencies]
99
paste = "1.0.12"
10+
bitvec = "1.0.1"

src/data_structures/bloom_filter.rs

Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
//! Bloom Filter is a probabilistic data structure designed to determine whether a given element
2+
//! is a member of a set. The main characteristic of the Bloom Filter is that it may give false
3+
//! positives but never false negatives. In other words, a query returns either "possibly in set"
4+
//! or "definitely not in set".
5+
//!
6+
//! This uses the [BitVec](https://crates.io/crates/bitvec) crate to store the bits.
7+
//!
8+
//! Consider looking into [Fnv](https://crates.io/crates/fnv) crate for more efficient hashing.
9+
10+
use bitvec::prelude::*;
11+
use std::collections::hash_map::DefaultHasher;
12+
use std::hash::{Hash, Hasher};
13+
14+
/// Simple Bloom Filter implementation with a given size and number of hash functions.
15+
/// Multiple hash functions are used to reduce the probability of false positives.
16+
///
17+
/// Example usage:
18+
/// ```
19+
/// use std::collections::hash_map::DefaultHasher;
20+
/// use std::hash::{Hash, Hasher};
21+
/// use rust_algorithms::data_structures::BloomFilter;
22+
///
23+
/// fn main() {
24+
/// // Define hash functions
25+
/// let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
26+
/// Box::new(|data| {
27+
/// let mut hasher = DefaultHasher::new();
28+
/// data.hash(&mut hasher);
29+
/// hasher.finish()
30+
/// }),
31+
/// Box::new(|data| {
32+
/// let mut hasher = DefaultHasher::new();
33+
/// data.hash(&mut hasher);
34+
/// hasher.finish() ^ 0xFFFFFFFFFFFFFFFF // XOR with a constant for diversity
35+
/// }),
36+
/// ];
37+
///
38+
/// // Create a new BloomFilter with a size of 100 bits and the hash functions
39+
/// let mut bloom_filter = BloomFilter::new(100, hash_functions);
40+
///
41+
/// // Insert elements into the BloomFilter
42+
/// bloom_filter.insert(&"apple");
43+
/// bloom_filter.insert(&"banana");
44+
/// bloom_filter.insert(&"cherry");
45+
///
46+
/// // Check if elements are contained in the BloomFilter
47+
/// println!("Contains 'apple': {}", bloom_filter.contains(&"apple")); // Should print true
48+
/// println!("Contains 'orange': {}", bloom_filter.contains(&"orange")); // Should print false
49+
/// println!("Contains 'cherry': {}", bloom_filter.contains(&"cherry")); // Should print true
50+
/// }
51+
/// ```
52+
53+
pub struct BloomFilter {
54+
/// Stores bits to indicate whether an element may be in the set
55+
bit_array: BitVec,
56+
/// Hash functions to use
57+
hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>>,
58+
}
59+
60+
impl BloomFilter {
61+
/// Creates a new Bloom Filter with the given size and hash functions
62+
pub fn new(size: usize, hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>>) -> Self {
63+
BloomFilter {
64+
bit_array: bitvec![0; size],
65+
hash_functions,
66+
}
67+
}
68+
69+
/// Inserts an element into the Bloom Filter
70+
/// Hashes the element using each hash function and sets the corresponding bit to true
71+
///
72+
/// Time Complexity: O(k) where k is the number of hash functions
73+
pub fn insert<T>(&mut self, item: &T)
74+
where
75+
T: AsRef<[u8]> + Hash,
76+
{
77+
for hash_function in &self.hash_functions {
78+
let hash = Self::hash(item, hash_function);
79+
let index = hash % self.bit_array.len() as u64;
80+
self.bit_array.set(index as usize, true);
81+
}
82+
}
83+
84+
/// Checks if an element may be in the Bloom Filter
85+
/// NOTE: `true` implies the element may be in the set, `false` implies the element is not in the set.
86+
/// The output is *not* deterministic.
87+
///
88+
/// Time Complexity: O(k) where k is the number of hash functions
89+
pub fn contains<T>(&self, item: &T) -> bool
90+
where
91+
T: AsRef<[u8]> + Hash,
92+
{
93+
for hash_function in &self.hash_functions {
94+
let hash = Self::hash(item, hash_function);
95+
let index = hash % self.bit_array.len() as u64;
96+
if !self.bit_array[index as usize] {
97+
return false;
98+
}
99+
}
100+
true
101+
}
102+
103+
/// Hashes an element using the given hash function
104+
fn hash<T>(item: &T, hash_function: &Box<dyn Fn(&[u8]) -> u64>) -> u64
105+
where
106+
T: AsRef<[u8]> + Hash,
107+
{
108+
let mut hasher = DefaultHasher::new();
109+
item.hash(&mut hasher);
110+
let hash = hasher.finish();
111+
hash_function(&hash.to_be_bytes())
112+
}
113+
}
114+
115+
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    /// Builds the two hash functions shared by every test in this module.
    ///
    /// The boxed closures cannot be cloned, so each filter gets a freshly built vector.
    fn sample_hash_functions() -> Vec<Box<dyn Fn(&[u8]) -> u64>> {
        vec![
            Box::new(|data| {
                let mut hasher = DefaultHasher::new();
                data.hash(&mut hasher);
                hasher.finish()
            }),
            Box::new(|data| {
                let mut hasher = DefaultHasher::new();
                data.hash(&mut hasher);
                // XOR with all ones (bitwise NOT) to derive a second hash value
                hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
            }),
        ]
    }

    #[test]
    fn test_insert_and_contains() {
        let mut bloom_filter = BloomFilter::new(100, sample_hash_functions());

        // Nothing has been inserted yet
        assert!(!bloom_filter.contains(&"apple"));
        assert!(!bloom_filter.contains(&"banana"));
        assert!(!bloom_filter.contains(&"cherry"));

        bloom_filter.insert(&"apple");
        bloom_filter.insert(&"banana");
        bloom_filter.insert(&"cherry");

        // Inserted elements must always be reported (no false negatives)
        assert!(bloom_filter.contains(&"apple"));
        assert!(bloom_filter.contains(&"banana"));
        assert!(bloom_filter.contains(&"cherry"));

        // These elements were never inserted and must not show up as false positives
        assert!(!bloom_filter.contains(&"orange"));
        assert!(!bloom_filter.contains(&"grape"));
        assert!(!bloom_filter.contains(&"kiwi"));
    }

    #[test]
    fn test_false_positive_probability() {
        // Insert a known set of elements, then check that a disjoint set of elements is
        // not reported as present
        let mut bloom_filter = BloomFilter::new(100, sample_hash_functions());

        // Insert known elements
        let known_elements = ["apple", "banana", "cherry"];
        for element in &known_elements {
            bloom_filter.insert(element);
        }

        // Test false positives with additional elements
        let false_positive_elements = ["orange", "grape", "kiwi"];
        for element in &false_positive_elements {
            assert!(
                !bloom_filter.contains(element),
                "False positive for: {}",
                element
            );
        }
    }

    #[test]
    fn test_hash_function_diversity() {
        // Test that hash functions produce diverse results for different elements
        let bloom_filter = BloomFilter::new(100, sample_hash_functions());

        let element1 = "apple";
        let element2 = "banana";

        let hash1 = BloomFilter::hash(&element1, &bloom_filter.hash_functions[0]);
        let hash2 = BloomFilter::hash(&element2, &bloom_filter.hash_functions[0]);

        assert_ne!(
            hash1, hash2,
            "Hash function 1 produces the same hash for different elements"
        );

        let hash1 = BloomFilter::hash(&element1, &bloom_filter.hash_functions[1]);
        let hash2 = BloomFilter::hash(&element2, &bloom_filter.hash_functions[1]);

        assert_ne!(
            hash1, hash2,
            "Hash function 2 produces the same hash for different elements"
        );
    }

    #[test]
    fn test_hash_function_consistency() {
        // Test that hash functions produce consistent results for the same element
        let bloom_filter = BloomFilter::new(100, sample_hash_functions());

        let element = "apple";

        let hash1 = BloomFilter::hash(&element, &bloom_filter.hash_functions[0]);
        let hash2 = BloomFilter::hash(&element, &bloom_filter.hash_functions[0]);

        assert_eq!(
            hash1, hash2,
            "Hash function 1 produces different hashes for the same element"
        );

        let hash1 = BloomFilter::hash(&element, &bloom_filter.hash_functions[1]);
        let hash2 = BloomFilter::hash(&element, &bloom_filter.hash_functions[1]);

        assert_eq!(
            hash1, hash2,
            "Hash function 2 produces different hashes for the same element"
        );
    }

    // More extensive insert/contains test using two independent filters
    #[test]
    fn test_bloom_filter_extended() {
        let mut bloom_filter = BloomFilter::new(100, sample_hash_functions());

        // Ensure the filter is initially empty
        assert!(!bloom_filter.contains(&"apple"));
        assert!(!bloom_filter.contains(&"banana"));
        assert!(!bloom_filter.contains(&"cherry"));

        // Insert items into the Bloom filter
        bloom_filter.insert(&"apple");
        bloom_filter.insert(&"banana");
        bloom_filter.insert(&"cherry");

        // Check for false positives (items that were not inserted)
        assert!(!bloom_filter.contains(&"orange"));
        assert!(!bloom_filter.contains(&"grape"));
        assert!(!bloom_filter.contains(&"kiwi"));

        // Check for false negatives (items that were inserted)
        assert!(bloom_filter.contains(&"apple"));
        assert!(bloom_filter.contains(&"banana"));
        assert!(bloom_filter.contains(&"cherry"));

        // Create a second, independent Bloom filter of the same size
        let mut bloom_filter_large = BloomFilter::new(100, sample_hash_functions());

        // Insert a different set of items into the second filter
        bloom_filter_large.insert(&"orange");
        bloom_filter_large.insert(&"grape");
        bloom_filter_large.insert(&"kiwi");

        // Check for false negatives in the second filter (items that were inserted)
        assert!(bloom_filter_large.contains(&"orange"));
        assert!(bloom_filter_large.contains(&"grape"));
        assert!(bloom_filter_large.contains(&"kiwi"));

        // Check for false positives in the second filter (items that were not inserted)
        assert!(!bloom_filter_large.contains(&"apple"));
        assert!(!bloom_filter_large.contains(&"banana"));
        assert!(!bloom_filter_large.contains(&"cherry"));

        // The first filter must be unaffected by insertions into the second one
        assert!(!bloom_filter.contains(&"orange"));
        assert!(!bloom_filter.contains(&"grape"));
        assert!(!bloom_filter.contains(&"kiwi"));
    }
}

src/data_structures/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod avl_tree;
22
mod b_tree;
33
mod binary_search_tree;
4+
mod bloom_filter;
45
mod fenwick_tree;
56
mod graph;
67
mod heap;
@@ -13,6 +14,7 @@ mod stack_using_singly_linked_list;
1314
mod trie;
1415
mod union_find;
1516

17+
pub use self::bloom_filter::BloomFilter;
1618
pub use self::heap::MaxHeap;
1719
pub use self::heap::MinHeap;
1820
pub use self::linked_list::LinkedList;

0 commit comments

Comments
 (0)