Skip to content

Commit

Permalink
feat: add Bloom Filter data structure (#65)
Browse files Browse the repository at this point in the history
Co-authored-by: Alexander Gonzalez <[email protected]>
  • Loading branch information
rsclip and alexfertel authored Dec 30, 2023
1 parent 1ca5f59 commit 22a1e7e
Show file tree
Hide file tree
Showing 3 changed files with 337 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ edition = "2018"

[dependencies]
paste = "1.0.12"
bitvec = "1.0.1"
334 changes: 334 additions & 0 deletions src/data_structures/bloom_filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
//! Bloom Filter is a probabilistic data structure designed to determine whether a given element
//! is a member of a set. The main characteristic of the Bloom Filter is it may give false
//! positives but never false negatives. In other words, a query returns either "possibly in set"
//! or "definitely not in set".
//!
//! This uses the [BitVec](https://crates.io/crates/bitvec) crate to store the bits.
//!
//! Consider looking into [Fnv](https://crates.io/crates/fnv) crate for more efficient hashing.
use bitvec::prelude::*;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Simple Bloom Filter implementation with a given size and number of hash functions.
/// Multiple hash functions are used to reduce the probability of false positives.
///
/// Example usage:
/// ```
/// use std::collections::hash_map::DefaultHasher;
/// use std::hash::{Hash, Hasher};
/// use rust_algorithms::data_structures::BloomFilter;
///
/// fn main() {
/// // Define hash functions
/// let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
/// Box::new(|data| {
/// let mut hasher = DefaultHasher::new();
/// data.hash(&mut hasher);
/// hasher.finish()
/// }),
/// Box::new(|data| {
/// let mut hasher = DefaultHasher::new();
/// data.hash(&mut hasher);
/// hasher.finish() ^ 0xFFFFFFFFFFFFFFFF // XOR with a constant for diversity
/// }),
/// ];
///
/// // Create a new BloomFilter with a size of 100 bits and the hash functions
/// let mut bloom_filter = BloomFilter::new(100, hash_functions);
///
/// // Insert elements into the BloomFilter
/// bloom_filter.insert(&"apple");
/// bloom_filter.insert(&"banana");
/// bloom_filter.insert(&"cherry");
///
/// // Check if elements are contained in the BloomFilter
/// println!("Contains 'apple': {}", bloom_filter.contains(&"apple")); // Should print true
/// println!("Contains 'orange': {}", bloom_filter.contains(&"orange")); // Should print false
/// println!("Contains 'cherry': {}", bloom_filter.contains(&"cherry")); // Should print true
/// }
/// ```
pub struct BloomFilter {
/// Stores bits to indicate whether an element may be in the set
bit_array: BitVec,
/// Hash functions to use
hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>>,
}

impl BloomFilter {
/// Creates a new Bloom Filter with the given size and hash functions
pub fn new(size: usize, hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>>) -> Self {
BloomFilter {
bit_array: bitvec![0; size],
hash_functions,
}
}

/// Inserts an element into the Bloom Filter
/// Hashes the element using each hash function and sets the corresponding bit to true
///
/// Time Complexity: O(k) where k is the number of hash functions
pub fn insert<T>(&mut self, item: &T)
where
T: AsRef<[u8]> + Hash,
{
for hash_function in &self.hash_functions {
let hash = Self::hash(item, hash_function);
let index = hash % self.bit_array.len() as u64;
self.bit_array.set(index as usize, true);
}
}

/// Checks if an element may be in the Bloom Filter
/// NOTE: `true` implies the element may be in the set, `false` implies the element is not in the set.
/// The output is *not* deterministic.
///
/// Time Complexity: O(k) where k is the number of hash functions
pub fn contains<T>(&self, item: &T) -> bool
where
T: AsRef<[u8]> + Hash,
{
for hash_function in &self.hash_functions {
let hash = Self::hash(item, hash_function);
let index = hash % self.bit_array.len() as u64;
if !self.bit_array[index as usize] {
return false;
}
}
true
}

/// Hashes an element using the given hash function
fn hash<T>(item: &T, hash_function: &Box<dyn Fn(&[u8]) -> u64>) -> u64
where
T: AsRef<[u8]> + Hash,
{
let mut hasher = DefaultHasher::new();
item.hash(&mut hasher);
let hash = hasher.finish();
hash_function(&hash.to_be_bytes())
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

#[test]
fn test_insert_and_contains() {
let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish()
}),
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
}),
];

let mut bloom_filter = BloomFilter::new(100, hash_functions);

assert!(!bloom_filter.contains(&"apple"));
assert!(!bloom_filter.contains(&"banana"));
assert!(!bloom_filter.contains(&"cherry"));

bloom_filter.insert(&"apple");
bloom_filter.insert(&"banana");
bloom_filter.insert(&"cherry");

assert!(bloom_filter.contains(&"apple"));
assert!(bloom_filter.contains(&"banana"));
assert!(bloom_filter.contains(&"cherry"));

// Check that false positives are within an acceptable range
assert!(!bloom_filter.contains(&"orange"));
assert!(!bloom_filter.contains(&"grape"));
assert!(!bloom_filter.contains(&"kiwi"));
}

#[test]
fn test_false_positive_probability() {
// Test the false positive probability by inserting a known set of elements
// and checking for false positives with additional elements

let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish()
}),
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
}),
];

let mut bloom_filter = BloomFilter::new(100, hash_functions);

// Insert known elements
let known_elements = vec!["apple", "banana", "cherry"];
for element in &known_elements {
bloom_filter.insert(element);
}

// Test false positives with additional elements
let false_positive_elements = vec!["orange", "grape", "kiwi"];
for element in &false_positive_elements {
assert!(
!bloom_filter.contains(element),
"False positive for: {}",
element
);
}
}

#[test]
fn test_hash_function_diversity() {
// Test that hash functions produce diverse results for different elements

let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish()
}),
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
}),
];

let bloom_filter = BloomFilter::new(100, hash_functions);

let element1 = "apple";
let element2 = "banana";

let hash1 = BloomFilter::hash(&element1, &bloom_filter.hash_functions[0]);
let hash2 = BloomFilter::hash(&element2, &bloom_filter.hash_functions[0]);

assert_ne!(
hash1, hash2,
"Hash function 1 produces the same hash for different elements"
);

let hash1 = BloomFilter::hash(&element1, &bloom_filter.hash_functions[1]);
let hash2 = BloomFilter::hash(&element2, &bloom_filter.hash_functions[1]);

assert_ne!(
hash1, hash2,
"Hash function 2 produces the same hash for different elements"
);
}

#[test]
fn test_hash_function_consistency() {
// Test that hash functions produce consistent results for the same element

let hash_functions: Vec<Box<dyn Fn(&[u8]) -> u64>> = vec![
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish()
}),
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
}),
];

let bloom_filter = BloomFilter::new(100, hash_functions);

let element = "apple";

let hash1 = BloomFilter::hash(&element, &bloom_filter.hash_functions[0]);
let hash2 = BloomFilter::hash(&element, &bloom_filter.hash_functions[0]);

assert_eq!(
hash1, hash2,
"Hash function 1 produces different hashes for the same element"
);

let hash1 = BloomFilter::hash(&element, &bloom_filter.hash_functions[1]);
let hash2 = BloomFilter::hash(&element, &bloom_filter.hash_functions[1]);

assert_eq!(
hash1, hash2,
"Hash function 2 produces different hashes for the same element"
);
}

/// more extensive test and contains test
#[test]
fn test_bloom_filter_extended() {
/// Get a vector of hash functions (since they are closures, we can't clone them)
fn get_hash_functions() -> Vec<Box<dyn Fn(&[u8]) -> u64>> {
vec![
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish()
}),
Box::new(|data| {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() ^ 0xFFFFFFFFFFFFFFFF
}),
]
}

let mut bloom_filter = BloomFilter::new(100, get_hash_functions());

// Ensure the filter is initially empty
assert!(!bloom_filter.contains(&"apple"));
assert!(!bloom_filter.contains(&"banana"));
assert!(!bloom_filter.contains(&"cherry"));

// Insert items into the Bloom filter
bloom_filter.insert(&"apple");
bloom_filter.insert(&"banana");
bloom_filter.insert(&"cherry");

// Check for false positives (items that were not inserted)
assert!(!bloom_filter.contains(&"orange"));
assert!(!bloom_filter.contains(&"grape"));
assert!(!bloom_filter.contains(&"kiwi"));

// Check for false negatives (items that were inserted)
assert!(bloom_filter.contains(&"apple"));
assert!(bloom_filter.contains(&"banana"));
assert!(bloom_filter.contains(&"cherry"));

// Create a new Bloom filter with a larger capacity
let mut bloom_filter_large = BloomFilter::new(100, get_hash_functions());

// Insert items into the larger Bloom filter
bloom_filter_large.insert(&"orange");
bloom_filter_large.insert(&"grape");
bloom_filter_large.insert(&"kiwi");

// Check for false positives in the larger filter
assert!(bloom_filter_large.contains(&"orange"));
assert!(bloom_filter_large.contains(&"grape"));
assert!(bloom_filter_large.contains(&"kiwi"));

// Check for false negatives in the larger filter
assert!(!bloom_filter_large.contains(&"apple"));
assert!(!bloom_filter_large.contains(&"banana"));
assert!(!bloom_filter_large.contains(&"cherry"));

// Check the accuracy of the original Bloom filter with new items
assert!(!bloom_filter.contains(&"orange"));
assert!(!bloom_filter.contains(&"grape"));
assert!(!bloom_filter.contains(&"kiwi"));
}
}
2 changes: 2 additions & 0 deletions src/data_structures/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod avl_tree;
mod b_tree;
mod binary_search_tree;
mod bloom_filter;
mod fenwick_tree;
mod graph;
mod heap;
Expand All @@ -13,6 +14,7 @@ mod stack_using_singly_linked_list;
mod trie;
mod union_find;

pub use self::bloom_filter::BloomFilter;
pub use self::heap::MaxHeap;
pub use self::heap::MinHeap;
pub use self::linked_list::LinkedList;
Expand Down

0 comments on commit 22a1e7e

Please sign in to comment.