sui_indexer_alt_schema/blooms/
blocked.rs1use crate::blooms::hash::DoubleHasher;
38use crate::blooms::hash::set_bit;
39
40pub struct BlockedBloomProbe {
42 pub block_idx: usize,
43 pub byte_offsets: Vec<usize>,
44 pub bit_masks: Vec<usize>,
45}
46
47#[derive(Clone)]
55pub struct BlockedBloomFilter<const BYTES: usize, const BLOCKS: usize, const HASHES: u32> {
56 blocks: Box<[[u8; BYTES]]>,
57 seed: u128,
58}
59
60impl<const BYTES: usize, const BLOCKS: usize, const HASHES: u32>
61 BlockedBloomFilter<BYTES, BLOCKS, HASHES>
62{
63 pub fn new(seed: u128) -> Self {
64 Self {
65 blocks: vec![[0u8; BYTES]; BLOCKS].into_boxed_slice(),
66 seed,
67 }
68 }
69
70 pub fn insert(&mut self, value: &[u8]) {
75 let (block_idx, bit_positions) = Self::hash(value, self.seed);
76 let block = &mut self.blocks[block_idx];
77 for bit_idx in bit_positions {
78 set_bit(block, bit_idx);
79 }
80 }
81
82 pub fn into_sparse_blocks(self) -> impl Iterator<Item = (usize, [u8; BYTES])> {
84 self.blocks
85 .into_iter()
86 .enumerate()
87 .filter(|(_, block)| block.iter().any(|&b| b != 0))
88 }
89
90 pub fn hash(value: &[u8], seed: u128) -> (usize, impl Iterator<Item = usize>) {
93 let bits_per_block = BYTES * 8;
94 let block_idx = {
95 let mut hasher = DoubleHasher::with_value(value, seed);
96 (hasher.next_hash() as usize) % BLOCKS
97 };
98 let bit_iter = DoubleHasher::with_value(value, seed.wrapping_add(1))
99 .take(HASHES as usize)
100 .map(move |h| (h as usize) % bits_per_block);
101 (block_idx, bit_iter)
102 }
103}
104
105impl<T: AsRef<[u8]>, const BYTES: usize, const BLOCKS: usize, const HASHES: u32> Extend<T>
106 for BlockedBloomFilter<BYTES, BLOCKS, HASHES>
107{
108 fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
109 for key in iter {
110 self.insert(key.as_ref());
111 }
112 }
113}
114
115#[cfg(test)]
116mod tests {
117 use crate::blooms::hash::check_bit;
118 use crate::cp_bloom_blocks::BLOOM_BLOCK_BYTES;
119 use crate::cp_bloom_blocks::NUM_BLOOM_BLOCKS;
120 use crate::cp_bloom_blocks::NUM_HASHES;
121
122 use super::*;
123
124 type TestBloomFilter = BlockedBloomFilter<BLOOM_BLOCK_BYTES, NUM_BLOOM_BLOCKS, NUM_HASHES>;
125
126 impl<const BYTES: usize, const BLOCKS: usize, const HASHES: u32>
127 BlockedBloomFilter<BYTES, BLOCKS, HASHES>
128 {
129 pub fn block(&self, block_idx: usize) -> Option<&[u8; BYTES]> {
131 self.blocks.get(block_idx)
132 }
133
134 pub fn is_block_nonzero(&self, block_idx: usize) -> bool {
136 self.blocks
137 .get(block_idx)
138 .is_some_and(|b| b.iter().any(|&x| x != 0))
139 }
140
141 pub fn contains(&self, key: &[u8]) -> bool {
143 let (block_idx, mut bit_idxs) = Self::hash(key, self.seed);
144 bit_idxs.all(|bit_idx| check_bit(&self.blocks[block_idx], bit_idx))
145 }
146 }
147
148 #[test]
149 fn test_blocked_bloom_basic() {
150 let mut bloom = TestBloomFilter::new(42);
151
152 let key1 = b"test_key_1";
153 let key2 = b"test_key_2";
154 let key3 = b"test_key_3";
155
156 assert!(!bloom.contains(key1));
158
159 bloom.insert(key1);
161 assert!(bloom.contains(key1));
162 assert!(!bloom.contains(key2));
163
164 bloom.insert(key2);
166 assert!(bloom.contains(key1));
167 assert!(bloom.contains(key2));
168 assert!(!bloom.contains(key3));
169 }
170
171 #[test]
172 fn test_sparse_blocks() {
173 let mut bloom = TestBloomFilter::new(42);
174
175 bloom.insert(b"key1");
177 bloom.insert(b"key2");
178 bloom.insert(b"key3");
179
180 let bloomed = bloom.clone();
181
182 let sparse: Vec<_> = bloom.into_sparse_blocks().collect();
183
184 assert!(sparse.len() <= 3);
186 assert!(!sparse.is_empty());
187
188 for (idx, block) in &sparse {
190 assert!(bloomed.is_block_nonzero(*idx));
191 assert!(block.iter().any(|&b| b != 0));
192 }
193 }
194
195 #[test]
196 fn test_no_oversaturation() {
197 let seed: u128 = 9995;
199 let num_items = 1292;
200 let mut bloom = TestBloomFilter::new(seed);
201
202 for i in 0..num_items {
204 let mut addr = [0u8; 32];
205 addr[0..8].copy_from_slice(&(i as u64).to_le_bytes());
206 bloom.insert(&addr);
207 }
208
209 let bloomed = bloom.clone();
210
211 let sparse_blocks: Vec<_> = bloom.into_sparse_blocks().collect();
213
214 let mut total_saturated_bytes = 0;
215 let mut max_saturated_bytes = 0;
216
217 for (block_idx, block_data) in &sparse_blocks {
218 let mut saturated_bytes = 0;
219 let mut nonzero_bytes = 0;
220
221 for &byte in block_data {
222 if byte != 0 {
223 nonzero_bytes += 1;
224 }
225 if byte == 255 {
226 saturated_bytes += 1;
227 }
228 }
229
230 total_saturated_bytes += saturated_bytes;
231 max_saturated_bytes = max_saturated_bytes.max(saturated_bytes);
232
233 assert!(
236 saturated_bytes <= 5,
237 "Block {} has {} saturated bytes (should be ≤5). \
238 This indicates clustering! nonzero_bytes={}, block_data={:?}",
239 block_idx,
240 saturated_bytes,
241 nonzero_bytes,
242 &block_data[0..16] );
244 }
245
246 let avg_saturated_bytes = total_saturated_bytes as f64 / sparse_blocks.len() as f64;
247
248 assert!(
250 avg_saturated_bytes < 3.0,
251 "Average saturation too high: {:.2} (should be <3.0)",
252 avg_saturated_bytes
253 );
254 assert!(
255 max_saturated_bytes <= 5,
256 "Max saturation too high: {} (should be ≤5)",
257 max_saturated_bytes
258 );
259
260 for i in 0..num_items {
262 let mut addr = [0u8; 32];
263 addr[0..8].copy_from_slice(&(i as u64).to_le_bytes());
264 assert!(
265 bloomed.contains(&addr),
266 "False negative for item {} (bloom filter should never have false negatives)",
267 i
268 );
269 }
270 }
271
272 #[test]
273 fn test_single_item_sets_exactly_k_bits() {
274 let seed: u128 = 10001;
276 let mut bloom = TestBloomFilter::new(seed);
277
278 let key = b"test_single_item";
280
281 let mut bits_before = 0;
283 for block_idx in 0..NUM_BLOOM_BLOCKS {
284 if let Some(block) = bloom.block(block_idx) {
285 for &byte in block {
286 bits_before += byte.count_ones();
287 }
288 }
289 }
290 assert_eq!(bits_before, 0, "Bloom should start with 0 bits set");
291
292 bloom.insert(key);
294
295 let mut bits_after = 0;
297 let mut bits_per_block = vec![0u32; NUM_BLOOM_BLOCKS];
298 for (block_idx, bits) in bits_per_block.iter_mut().enumerate() {
299 if let Some(block) = bloom.block(block_idx) {
300 for &byte in block {
301 *bits += byte.count_ones();
302 bits_after += byte.count_ones();
303 }
304 }
305 }
306
307 let non_zero_blocks: Vec<_> = bits_per_block
309 .iter()
310 .enumerate()
311 .filter(|&(_, count)| *count > 0)
312 .collect();
313
314 assert_eq!(
316 bits_after, NUM_HASHES,
317 "Should set exactly {} bits (NUM_HASHES), got {}",
318 NUM_HASHES, bits_after
319 );
320 assert_eq!(
321 non_zero_blocks.len(),
322 1,
323 "Should affect only 1 block, got {}",
324 non_zero_blocks.len()
325 );
326 }
327}