sui_indexer_alt_schema/blooms/
blocked.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Blocked bloom filter implementation for efficient database queries.
5//!
6//! A blocked bloom filter partitions the filter into fixed-size blocks, where each element
7//! hashes to exactly one block for bits to be set, essentially a hash-indexed array of smaller bloom filters.
8//! Each block is stored as a separate row. This allows us to query specific blocks for membership instead of
9//! scanning the entire filter for improved IO.
10//!
11//! 1. **Targeted queries**: When checking membership, only the specific block for that element
12//!    needs to be loaded from the database, not the entire filter.
13//!
14//! 2. **Incremental updates**: Blocks can be OR'd together independently, allowing bloom filters
15//!    from multiple checkpoints to be merged at the block level without loading entire filters.
16//!    **Note**: Do not use folding with blocked bloom filters. Merging requires fixed-size blocks;
17//!    variable-sized blocks from folding would cause the merge to fail.
18//!
19//! 3. **Sparse storage**: Only blocks with bits set need to be stored. In practice, with well-chosen parameters,
20//!    values should be uniformly distributed across blocks.
21//!
22//! The filter uses double hashing: the first hash selects the block, and subsequent hashes
23//! (with a different seed to prevent correlation) select bit positions within that block.
24//!
25//!
26//! ```text
27//!   Blocked Bloom Filter (128 blocks × 2 KB each)
28//!   ┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
29//!   │  0  │  1  │  2  │  3  │ ... │ 125 │ 126 │ 127 │
30//!   └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘
31//!            ▲
32//!            │
33//!       hash(key, seed) selects block, then subsequent hash(key, seed1) sets bits within that block
34//! ```
35//!
36
37use crate::blooms::hash::DoubleHasher;
38use crate::blooms::hash::set_bit;
39
40/// Probe for checking membership in a blocked bloom filter.
41pub struct BlockedBloomProbe {
42    pub block_idx: usize,
43    pub byte_offsets: Vec<usize>,
44    pub bit_masks: Vec<usize>,
45}
46
47/// A generic blocked bloom filter.
48///
49/// # Type Parameters
50///
51/// - `BYTES`: Number of bytes per block.
52/// - `BLOCKS`: Number of blocks in the filter.
53/// - `HASHES`: Number of hash functions applied per element inserted. Each hash sets one bit within the selected block.
54#[derive(Clone)]
55pub struct BlockedBloomFilter<const BYTES: usize, const BLOCKS: usize, const HASHES: u32> {
56    blocks: Box<[[u8; BYTES]]>,
57    seed: u128,
58}
59
60impl<const BYTES: usize, const BLOCKS: usize, const HASHES: u32>
61    BlockedBloomFilter<BYTES, BLOCKS, HASHES>
62{
63    pub fn new(seed: u128) -> Self {
64        Self {
65            blocks: vec![[0u8; BYTES]; BLOCKS].into_boxed_slice(),
66            seed,
67        }
68    }
69
70    /// Insert a value into the bloom filter by setting the bits in the block that the
71    /// hash function produces. The first hash sets the block index, and the remaining hashes
72    /// set bits within that block. This allows us to query specific blocks for membership instead
73    /// of scanning the entire bloom filter.
74    pub fn insert(&mut self, value: &[u8]) {
75        let (block_idx, bit_positions) = Self::hash(value, self.seed);
76        let block = &mut self.blocks[block_idx];
77        for bit_idx in bit_positions {
78            set_bit(block, bit_idx);
79        }
80    }
81
82    /// Consumes the filter to return non-empty block indexes and corresponding data.
83    pub fn into_sparse_blocks(self) -> impl Iterator<Item = (usize, [u8; BYTES])> {
84        self.blocks
85            .into_iter()
86            .enumerate()
87            .filter(|(_, block)| block.iter().any(|&b| b != 0))
88    }
89
90    /// Compute the block index and bit positions within the block for a value.
91    /// Uses separate hashers for block selection and bit indexes to prevent correlated patterns.
92    pub fn hash(value: &[u8], seed: u128) -> (usize, impl Iterator<Item = usize>) {
93        let bits_per_block = BYTES * 8;
94        let block_idx = {
95            let mut hasher = DoubleHasher::with_value(value, seed);
96            (hasher.next_hash() as usize) % BLOCKS
97        };
98        let bit_iter = DoubleHasher::with_value(value, seed.wrapping_add(1))
99            .take(HASHES as usize)
100            .map(move |h| (h as usize) % bits_per_block);
101        (block_idx, bit_iter)
102    }
103}
104
105impl<T: AsRef<[u8]>, const BYTES: usize, const BLOCKS: usize, const HASHES: u32> Extend<T>
106    for BlockedBloomFilter<BYTES, BLOCKS, HASHES>
107{
108    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
109        for key in iter {
110            self.insert(key.as_ref());
111        }
112    }
113}
114
115#[cfg(test)]
116mod tests {
117    use crate::blooms::hash::check_bit;
118    use crate::cp_bloom_blocks::BLOOM_BLOCK_BYTES;
119    use crate::cp_bloom_blocks::NUM_BLOOM_BLOCKS;
120    use crate::cp_bloom_blocks::NUM_HASHES;
121
122    use super::*;
123
124    type TestBloomFilter = BlockedBloomFilter<BLOOM_BLOCK_BYTES, NUM_BLOOM_BLOCKS, NUM_HASHES>;
125
126    impl<const BYTES: usize, const BLOCKS: usize, const HASHES: u32>
127        BlockedBloomFilter<BYTES, BLOCKS, HASHES>
128    {
129        /// Get a specific block by index.
130        pub fn block(&self, block_idx: usize) -> Option<&[u8; BYTES]> {
131            self.blocks.get(block_idx)
132        }
133
134        /// Check if a block contains any set bits.
135        pub fn is_block_nonzero(&self, block_idx: usize) -> bool {
136            self.blocks
137                .get(block_idx)
138                .is_some_and(|b| b.iter().any(|&x| x != 0))
139        }
140
141        /// Check if a key might be in the bloom filter.
142        pub fn contains(&self, key: &[u8]) -> bool {
143            let (block_idx, mut bit_idxs) = Self::hash(key, self.seed);
144            bit_idxs.all(|bit_idx| check_bit(&self.blocks[block_idx], bit_idx))
145        }
146    }
147
148    #[test]
149    fn test_blocked_bloom_basic() {
150        let mut bloom = TestBloomFilter::new(42);
151
152        let key1 = b"test_key_1";
153        let key2 = b"test_key_2";
154        let key3 = b"test_key_3";
155
156        // Initially empty
157        assert!(!bloom.contains(key1));
158
159        // Insert and check
160        bloom.insert(key1);
161        assert!(bloom.contains(key1));
162        assert!(!bloom.contains(key2));
163
164        // Insert more
165        bloom.insert(key2);
166        assert!(bloom.contains(key1));
167        assert!(bloom.contains(key2));
168        assert!(!bloom.contains(key3));
169    }
170
171    #[test]
172    fn test_sparse_blocks() {
173        let mut bloom = TestBloomFilter::new(42);
174
175        // Insert a few items
176        bloom.insert(b"key1");
177        bloom.insert(b"key2");
178        bloom.insert(b"key3");
179
180        let bloomed = bloom.clone();
181
182        let sparse: Vec<_> = bloom.into_sparse_blocks().collect();
183
184        // Should have far fewer than 128 blocks
185        assert!(sparse.len() <= 3);
186        assert!(!sparse.is_empty());
187
188        // All sparse blocks should be non-zero
189        for (idx, block) in &sparse {
190            assert!(bloomed.is_block_nonzero(*idx));
191            assert!(block.iter().any(|&b| b != 0));
192        }
193    }
194
195    #[test]
196    fn test_no_oversaturation() {
197        // Reproduce cp_block 9995 scenario: 1,292 items with seed 9995
198        let seed: u128 = 9995;
199        let num_items = 1292;
200        let mut bloom = TestBloomFilter::new(seed);
201
202        // Generate realistic items (32-byte addresses)
203        for i in 0..num_items {
204            let mut addr = [0u8; 32];
205            addr[0..8].copy_from_slice(&(i as u64).to_le_bytes());
206            bloom.insert(&addr);
207        }
208
209        let bloomed = bloom.clone();
210
211        // Analyze saturation across all non-zero blocks
212        let sparse_blocks: Vec<_> = bloom.into_sparse_blocks().collect();
213
214        let mut total_saturated_bytes = 0;
215        let mut max_saturated_bytes = 0;
216
217        for (block_idx, block_data) in &sparse_blocks {
218            let mut saturated_bytes = 0;
219            let mut nonzero_bytes = 0;
220
221            for &byte in block_data {
222                if byte != 0 {
223                    nonzero_bytes += 1;
224                }
225                if byte == 255 {
226                    saturated_bytes += 1;
227                }
228            }
229
230            total_saturated_bytes += saturated_bytes;
231            max_saturated_bytes = max_saturated_bytes.max(saturated_bytes);
232
233            // With ~10 items per block (1292/128), we expect ~7-15 bytes with at least one bit
234            // and 0-3 fully saturated bytes (healthy threshold)
235            assert!(
236                saturated_bytes <= 5,
237                "Block {} has {} saturated bytes (should be ≤5). \
238                 This indicates clustering! nonzero_bytes={}, block_data={:?}",
239                block_idx,
240                saturated_bytes,
241                nonzero_bytes,
242                &block_data[0..16] // Show first 16 bytes for debugging
243            );
244        }
245
246        let avg_saturated_bytes = total_saturated_bytes as f64 / sparse_blocks.len() as f64;
247
248        // Healthy thresholds for 1,292 items across 128 blocks (~10 items/block)
249        assert!(
250            avg_saturated_bytes < 3.0,
251            "Average saturation too high: {:.2} (should be <3.0)",
252            avg_saturated_bytes
253        );
254        assert!(
255            max_saturated_bytes <= 5,
256            "Max saturation too high: {} (should be ≤5)",
257            max_saturated_bytes
258        );
259
260        // Verify all items can be found (no false negatives)
261        for i in 0..num_items {
262            let mut addr = [0u8; 32];
263            addr[0..8].copy_from_slice(&(i as u64).to_le_bytes());
264            assert!(
265                bloomed.contains(&addr),
266                "False negative for item {} (bloom filter should never have false negatives)",
267                i
268            );
269        }
270    }
271
272    #[test]
273    fn test_single_item_sets_exactly_k_bits() {
274        // Verify that inserting ONE item sets exactly k bits (one per hash function)
275        let seed: u128 = 10001;
276        let mut bloom = TestBloomFilter::new(seed);
277
278        // Create a test key
279        let key = b"test_single_item";
280
281        // Count bits before insert
282        let mut bits_before = 0;
283        for block_idx in 0..NUM_BLOOM_BLOCKS {
284            if let Some(block) = bloom.block(block_idx) {
285                for &byte in block {
286                    bits_before += byte.count_ones();
287                }
288            }
289        }
290        assert_eq!(bits_before, 0, "Bloom should start with 0 bits set");
291
292        // Insert one item
293        bloom.insert(key);
294
295        // Count bits after insert
296        let mut bits_after = 0;
297        let mut bits_per_block = vec![0u32; NUM_BLOOM_BLOCKS];
298        for (block_idx, bits) in bits_per_block.iter_mut().enumerate() {
299            if let Some(block) = bloom.block(block_idx) {
300                for &byte in block {
301                    *bits += byte.count_ones();
302                    bits_after += byte.count_ones();
303                }
304            }
305        }
306
307        // Find which block has bits set
308        let non_zero_blocks: Vec<_> = bits_per_block
309            .iter()
310            .enumerate()
311            .filter(|&(_, count)| *count > 0)
312            .collect();
313
314        // Should be exactly NUM_HASHES bits, all in ONE block
315        assert_eq!(
316            bits_after, NUM_HASHES,
317            "Should set exactly {} bits (NUM_HASHES), got {}",
318            NUM_HASHES, bits_after
319        );
320        assert_eq!(
321            non_zero_blocks.len(),
322            1,
323            "Should affect only 1 block, got {}",
324            non_zero_blocks.len()
325        );
326    }
327}