sui_indexer_alt_schema/blooms/
hash.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Double hashing for bloom filters.
5//!
6//! Bloom filters need k independent hash functions to set k bit positions per element.
7//! Rather than computing k separate cryptographic hashes, double hashing
8//! generates k positions from just two hash values without any loss to
9//! asymptotic false positive rate. [Kirsch-Mitzenmacher 2006]
10//!
11//! This hasher takes a single SipHash-1-3 call to produce one 64-bit hash, which is split into h1 and h2
12//! components. Subsequent positions use enhanced double hashing with rotation. [fastbloom]
13//!
14//! [Kirsch-Mitzenmacher 2006]: https://www.eecs.harvard.edu/~michaelm/postscripts/esa2006a.pdf
15//! [fastbloom]: https://github.com/tomtomwombat/fastbloom
16use std::hash::Hasher;
17
18use siphasher::sip::SipHasher13;
19
20/// Constant for deriving h2 from upper bits of hash.
21///
22/// Chosen as 2^64 / π for a large number with mixed bits for good distribution.
23const H2_MULTIPLIER: u64 = 0x517c_c1b7_2722_0a95;
24
25/// Double hasher for bloom filter position generation.
26///
27/// Produces an infinite sequence of hash values derived from a single
28/// source hash using the double hashing technique.
29#[derive(Clone, Copy)]
30pub struct DoubleHasher {
31    h1: u64,
32    h2: u64,
33}
34
35impl DoubleHasher {
36    /// Creates a DoubleHasher with two 64-bit hashes (h1, h2) from h1. Derives h2 by taking the upper 32 bits of h1 and multiplying it with
37    ///  H2_MULTIPLIER, a large number with mixed bits, to distribute the bits across h2.
38    pub fn new(h1: u64) -> Self {
39        let h2 = h1.wrapping_shr(32).wrapping_mul(H2_MULTIPLIER);
40        Self { h1, h2 }
41    }
42
43    /// Creates a DoubleHasher from a byte value we want to insert or check in the bloom filter (e.g., an object ID).
44    ///
45    /// Entry point for bloom filter operations. Iterate through the DoubleHasher to produce the index of the block and the bit positions set in that block for a value and seed so we can
46    /// - set bit positions in a bloom filter for a value
47    /// - check if a value is in the bloom filter
48    pub fn with_value(value: &[u8], seed: u128) -> Self {
49        let mut hasher = SipHasher13::new_with_keys(seed as u64, (seed >> 64) as u64);
50        hasher.write(value);
51        Self::new(hasher.finish())
52    }
53
54    /// Generate a new hash value from the current h1 and h2 with:
55    /// - modulo addition of two independent hashes with well distributed bits to ensure inputs diverge into uncorrelated bits.
56    /// - circular left shift by 5 (coprime to 64) to avoid evenly spaced distributions by mixing high and low bits after the addition.
57    pub fn next_hash(&mut self) -> u64 {
58        self.h1 = self.h1.wrapping_add(self.h2).rotate_left(5);
59        self.h1
60    }
61}
62
63impl Iterator for DoubleHasher {
64    type Item = u64;
65
66    fn next(&mut self) -> Option<u64> {
67        Some(self.next_hash())
68    }
69}
70
71pub fn set_bit(bits: &mut [u8], pos: usize) {
72    bits[pos / 8] |= 1 << (pos % 8);
73}
74
75pub fn check_bit(bits: &[u8], pos: usize) -> bool {
76    bits[pos / 8] & (1 << (pos % 8)) != 0
77}
78
79#[cfg(test)]
80mod tests {
81    use super::*;
82
83    #[test]
84    fn test_double_hasher_deterministic() {
85        let value = b"test_key";
86        let seed = 42u128;
87
88        let mut hasher_a = DoubleHasher::with_value(value, seed);
89        let mut hasher_b = DoubleHasher::with_value(value, seed);
90
91        for _ in 0..10 {
92            assert_eq!(hasher_a.next_hash(), hasher_b.next_hash());
93        }
94    }
95
96    #[test]
97    fn test_different_seeds_produce_different_hashes() {
98        let value = b"test_value";
99
100        let mut hasher_a = DoubleHasher::with_value(value, 1);
101        let mut hasher_b = DoubleHasher::with_value(value, 2);
102
103        assert_ne!(hasher_a.next_hash(), hasher_b.next_hash());
104    }
105
106    #[test]
107    fn test_double_hasher_iterator() {
108        let value = b"test_value";
109        let num_bits = 8192;
110        let num_hashes = 5;
111
112        let positions: Vec<usize> = DoubleHasher::with_value(value, 42)
113            .take(num_hashes)
114            .map(|h| (h as usize) % num_bits)
115            .collect();
116
117        assert_eq!(positions.len(), num_hashes);
118        for pos in positions {
119            assert!(pos < num_bits);
120        }
121    }
122
123    #[test]
124    fn test_first_hash_as_block_index() {
125        let value = b"test_value";
126        let num_blocks = 128;
127
128        for seed in 0..100 {
129            let mut hasher = DoubleHasher::with_value(value, seed as u128);
130            let block_idx = (hasher.next_hash() as usize) % num_blocks;
131            assert!(block_idx < num_blocks);
132        }
133    }
134
135    #[test]
136    fn test_hash_distribution() {
137        let num_samples = 10000;
138        let seed = 9995u128;
139
140        let mut hash_values = Vec::new();
141        for i in 0..num_samples {
142            let value = format!("test_value_{}", i).into_bytes();
143            let mut hasher = DoubleHasher::with_value(&value, seed);
144            hash_values.push(hasher.next_hash());
145        }
146
147        let min_h = hash_values.iter().min().unwrap();
148        let max_h = hash_values.iter().max().unwrap();
149        let range = max_h.wrapping_sub(*min_h);
150
151        // Range should be large (good distribution across 64-bit space)
152        assert!(
153            range > u64::MAX / 2,
154            "hash range too small: {} (expected > {})",
155            range,
156            u64::MAX / 2
157        );
158    }
159
160    #[test]
161    fn test_set_and_check_bit() {
162        let mut bits = vec![0u8; 16];
163
164        // Set some bits
165        set_bit(&mut bits, 0);
166        set_bit(&mut bits, 7);
167        set_bit(&mut bits, 8);
168        set_bit(&mut bits, 127);
169
170        // Check they're set
171        assert!(check_bit(&bits, 0));
172        assert!(check_bit(&bits, 7));
173        assert!(check_bit(&bits, 8));
174        assert!(check_bit(&bits, 127));
175
176        // Check others are not set
177        assert!(!check_bit(&bits, 1));
178        assert!(!check_bit(&bits, 64));
179        assert!(!check_bit(&bits, 126));
180
181        // Verify byte values
182        assert_eq!(bits[0], 0b1000_0001); // bits 0 and 7
183        assert_eq!(bits[1], 0b0000_0001); // bit 8
184        assert_eq!(bits[15], 0b1000_0000); // bit 127
185    }
186}