sui_indexer_alt_schema/blooms/
bloom.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::blooms::hash::DoubleHasher;
5use crate::blooms::hash::set_bit;
6
7/// A standard bloom filter with bits spread across the entire filter.
8#[derive(Debug, Clone)]
9pub struct BloomFilter<const BYTES: usize, const HASHES: u32, const SEED: u128> {
10    bytes: [u8; BYTES],
11}
12
13impl<const BYTES: usize, const HASHES: u32, const SEED: u128> BloomFilter<BYTES, HASHES, SEED> {
14    /// Create a new bloom filter.
15    pub fn new() -> Self {
16        Self {
17            bytes: [0u8; BYTES],
18        }
19    }
20
21    pub fn insert(&mut self, key: &[u8]) {
22        for bit_idx in Self::hash(key) {
23            set_bit(&mut self.bytes, bit_idx);
24        }
25    }
26
27    pub fn popcount(&self) -> usize {
28        self.bytes.iter().map(|b| b.count_ones() as usize).sum()
29    }
30
31    /// Repeatedly halves the filter by ORing the upper half into the lower half
32    /// until density exceeds `max_fold_density` or size reaches `min_fold_bytes`.
33    /// This allows us to allocate a larger bloom filter to handle checkpoints with more items
34    /// while folding to handle sparse filters.
35    ///
36    /// To get the folded positions from the original bit positions,
37    /// folded_idx = `idx % folded_num_bits` where idx is the original bit position.
38    pub fn fold(self, min_fold_bytes: usize, max_fold_density: f64) -> Vec<u8> {
39        let mut bytes = self.bytes.to_vec();
40
41        while bytes.len().is_multiple_of(2) && bytes.len() >= 2 * min_fold_bytes {
42            // Stop if density exceeds threshold
43            let popcount: usize = bytes.iter().map(|b| b.count_ones() as usize).sum();
44            let density = popcount as f64 / (bytes.len() * 8) as f64;
45            if density > max_fold_density {
46                break;
47            }
48
49            // Fold: OR upper half into lower half
50            let half = bytes.len() / 2;
51            for i in 0..half {
52                bytes[i] |= bytes[half + i];
53            }
54            bytes.truncate(half);
55        }
56
57        bytes
58    }
59
60    /// Hashes a value to the bit positions to set/check.
61    pub fn hash(value: &[u8]) -> impl Iterator<Item = usize> {
62        let num_bits = BYTES * 8;
63        DoubleHasher::with_value(value, SEED)
64            .take(HASHES as usize)
65            .map(move |h| (h as usize) % num_bits)
66    }
67}
68
69impl<const BYTES: usize, const HASHES: u32, const SEED: u128> Default
70    for BloomFilter<BYTES, HASHES, SEED>
71{
72    fn default() -> Self {
73        Self::new()
74    }
75}
76
77impl<T: AsRef<[u8]>, const BYTES: usize, const HASHES: u32, const SEED: u128> Extend<T>
78    for BloomFilter<BYTES, HASHES, SEED>
79{
80    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
81        for key in iter {
82            self.insert(key.as_ref());
83        }
84    }
85}
86
87#[cfg(test)]
88mod tests {
89    use crate::blooms::hash;
90    use crate::cp_blooms::MAX_FOLD_DENSITY;
91    use crate::cp_blooms::MIN_FOLD_BYTES;
92
93    use super::*;
94
95    // Test bloom filter with specific dimensions
96    type TestBloomFilter = BloomFilter<1024, 5, 67>;
97    type TestBloomFilter2048 = BloomFilter<2048, 5, 67>;
98
99    impl<const BYTES: usize, const HASHES: u32, const SEED: u128> BloomFilter<BYTES, HASHES, SEED> {
100        /// Check if a key might be in the given bytes (possibly folded).
101        pub fn contains(bytes: &[u8], key: &[u8]) -> bool {
102            let num_bits = bytes.len() * 8;
103            Self::hash(key).all(|pos| hash::check_bit(bytes, pos % num_bits))
104        }
105
106        /// Calculate the current bit density.
107        pub fn density(&self) -> f64 {
108            self.popcount() as f64 / (BYTES * 8) as f64
109        }
110    }
111
112    #[test]
113    fn test_insert_and_contains() {
114        let mut bloom = TestBloomFilter::new();
115
116        let key1 = b"hello";
117        let key2 = b"world";
118
119        bloom.insert(key1);
120        bloom.insert(key2);
121
122        assert!(
123            TestBloomFilter::contains(&bloom.bytes, key1),
124            "Should contain inserted key"
125        );
126        assert!(
127            TestBloomFilter::contains(&bloom.bytes, key2),
128            "Should contain inserted key"
129        );
130    }
131
132    #[test]
133    fn test_fold_preserves_membership() {
134        let mut bloom = TestBloomFilter::new();
135
136        let keys: Vec<Vec<u8>> = (0..50).map(|i| format!("key{}", i).into_bytes()).collect();
137        for key in &keys {
138            bloom.insert(key);
139        }
140
141        let folded_bytes = bloom.fold(MIN_FOLD_BYTES, MAX_FOLD_DENSITY);
142
143        // All original keys must still be found (no false negatives)
144        for key in &keys {
145            assert!(
146                TestBloomFilter::contains(&folded_bytes, key),
147                "Folded filter must contain all original keys"
148            );
149        }
150    }
151
152    #[test]
153    fn test_fold_minimum_size() {
154        let mut bloom = TestBloomFilter2048::new();
155
156        // Insert just one key - should fold down to minimum
157        bloom.insert(b"single");
158
159        let folded_bytes = bloom.fold(MIN_FOLD_BYTES, MAX_FOLD_DENSITY);
160
161        // Should fold down to MIN_FOLD_BYTES (1024 bytes)
162        assert!(
163            folded_bytes.len() >= MIN_FOLD_BYTES,
164            "Should not fold below minimum size"
165        );
166        assert_eq!(
167            folded_bytes.len(),
168            MIN_FOLD_BYTES,
169            "Very sparse filter should fold to minimum"
170        );
171        assert!(
172            TestBloomFilter2048::contains(&folded_bytes, b"single"),
173            "Should still contain the key"
174        );
175    }
176
177    #[test]
178    fn test_density() {
179        let mut bloom = TestBloomFilter::new();
180        assert_eq!(bloom.density(), 0.0, "Empty filter should have 0 density");
181
182        bloom.insert(b"key1");
183        let density = bloom.density();
184        assert!(density > 0.0, "Should have some bits set");
185        assert!(density < 0.01, "Few keys should have low density");
186    }
187}