typed_store/rocks/
options.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{collections::BTreeMap, env};
5
6use rocksdb::{BlockBasedOptions, Cache, MergeOperands, ReadOptions, compaction_filter::Decision};
7use tap::TapFallible;
8use tracing::{info, warn};
9
// Write buffer size per RocksDB instance can be set via the env var below.
// If the env var is not set, use the default value in MiB.
const ENV_VAR_DB_WRITE_BUFFER_SIZE: &str = "DB_WRITE_BUFFER_SIZE_MB";
const DEFAULT_DB_WRITE_BUFFER_SIZE: usize = 1024;

// Write ahead log size per RocksDB instance can be set via the env var below.
// If the env var is not set, use the default value in MiB.
const ENV_VAR_DB_WAL_SIZE: &str = "DB_WAL_SIZE_MB";
const DEFAULT_DB_WAL_SIZE: usize = 1024;

// Environment variable to control behavior of write throughput optimized tables.
// Number of level-0 files that triggers a compaction; slowdown/stop triggers are
// derived from this value (12x and 16x respectively).
const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 4;
// Higher default trigger used with universal and FIFO compaction presets,
// which tolerate many more L0 files than leveled compaction.
const DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 80;
// Per-memtable (write buffer) size in MiB for write-throughput presets.
const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
// Number of memtables to accumulate before writes are slowed down.
const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
const DEFAULT_MAX_WRITE_BUFFER_NUMBER: usize = 6;
// Target SST file size in MiB for write-throughput presets.
const ENV_VAR_TARGET_FILE_SIZE_BASE_MB: &str = "TARGET_FILE_SIZE_BASE_MB";
const DEFAULT_TARGET_FILE_SIZE_BASE_MB: usize = 128;

// Set to 1 to disable blob storage for transactions and effects.
const ENV_VAR_DISABLE_BLOB_STORAGE: &str = "DISABLE_BLOB_STORAGE";
// Number of threads RocksDB uses for compactions and flushes (default 8).
const ENV_VAR_DB_PARALLELISM: &str = "DB_PARALLELISM";
34
/// Per-table read/write behavior options, applied on top of the RocksDB
/// options when reading from or writing to a typed-store table.
#[derive(Clone, Debug)]
pub struct ReadWriteOptions {
    /// Passed through to RocksDB `ReadOptions::set_ignore_range_deletions`
    /// by [`ReadWriteOptions::readopts`]: when true, reads skip checking
    /// range tombstones.
    pub ignore_range_deletions: bool,
    /// When set, debug log the hash of the key and value bytes when inserting to
    /// this table.
    pub log_value_hash: bool,
}
42
43impl ReadWriteOptions {
44    pub fn readopts(&self) -> ReadOptions {
45        let mut readopts = ReadOptions::default();
46        readopts.set_ignore_range_deletions(self.ignore_range_deletions);
47        readopts
48    }
49
50    pub fn set_ignore_range_deletions(mut self, ignore: bool) -> Self {
51        self.ignore_range_deletions = ignore;
52        self
53    }
54
55    pub fn set_log_value_hash(mut self, log_value_hash: bool) -> Self {
56        self.log_value_hash = log_value_hash;
57        self
58    }
59}
60
61impl Default for ReadWriteOptions {
62    fn default() -> Self {
63        Self {
64            ignore_range_deletions: true,
65            log_value_hash: false,
66        }
67    }
68}
69
/// Bundle of the native RocksDB options and the typed-store read/write
/// options configured for a table or database.
#[derive(Default, Clone)]
pub struct DBOptions {
    /// Native RocksDB options (compaction, write buffers, block cache, ...).
    pub options: rocksdb::Options,
    /// Typed-store level read/write behavior options.
    pub rw_options: ReadWriteOptions,
}
75
76#[derive(Clone)]
77pub struct DBMapTableConfigMap(BTreeMap<String, DBOptions>);
78impl DBMapTableConfigMap {
79    pub fn new(map: BTreeMap<String, DBOptions>) -> Self {
80        Self(map)
81    }
82
83    pub fn to_map(&self) -> BTreeMap<String, DBOptions> {
84        self.0.clone()
85    }
86}
87
88impl DBOptions {
89    // Optimize lookup perf for tables where no scans are performed.
90    // If non-trivial number of values can be > 512B in size, it is beneficial to also
91    // specify optimize_for_large_values_no_scan().
92    pub fn optimize_for_point_lookup(mut self, block_cache_size_mb: usize) -> DBOptions {
93        // NOTE: this overwrites the block options.
94        self.options
95            .optimize_for_point_lookup(block_cache_size_mb as u64);
96        self
97    }
98
99    // Optimize write and lookup perf for tables which are rarely scanned, and have large values.
100    // https://rocksdb.org/blog/2021/05/26/integrated-blob-db.html
101    pub fn optimize_for_large_values_no_scan(mut self, min_blob_size: u64) -> DBOptions {
102        if env::var(ENV_VAR_DISABLE_BLOB_STORAGE).is_ok() {
103            info!("Large value blob storage optimization is disabled via env var.");
104            return self;
105        }
106
107        // Blob settings.
108        self.options.set_enable_blob_files(true);
109        self.options
110            .set_blob_compression_type(rocksdb::DBCompressionType::Lz4);
111        self.options.set_enable_blob_gc(true);
112        // Since each blob can have non-trivial size overhead, and compression does not work across blobs,
113        // set a min blob size in bytes to so small transactions and effects are kept in sst files.
114        self.options.set_min_blob_size(min_blob_size);
115
116        // Increase write buffer size to 256MiB.
117        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
118            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
119            * 1024
120            * 1024;
121        self.options.set_write_buffer_size(write_buffer_size);
122        // Since large blobs are not in sst files, reduce the target file size and base level
123        // target size.
124        let target_file_size_base = 64 << 20;
125        self.options
126            .set_target_file_size_base(target_file_size_base);
127        // Level 1 default to 64MiB * 4 ~ 256MiB.
128        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
129            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
130        self.options
131            .set_max_bytes_for_level_base(target_file_size_base * max_level_zero_file_num as u64);
132
133        self
134    }
135
136    // Optimize tables with a mix of lookup and scan workloads.
137    pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions {
138        self.options
139            .set_block_based_table_factory(&get_block_options(block_cache_size_mb, 16 << 10));
140        self
141    }
142
143    // Optimize DB receiving significant insertions.
144    pub fn optimize_db_for_write_throughput(
145        mut self,
146        db_max_write_buffer_gb: u64,
147        unlimited_open_files: bool,
148    ) -> DBOptions {
149        self.options
150            .set_db_write_buffer_size(db_max_write_buffer_gb as usize * 1024 * 1024 * 1024);
151        self.options
152            .set_max_total_wal_size(db_max_write_buffer_gb * 1024 * 1024 * 1024);
153        if unlimited_open_files {
154            self.options.set_max_open_files(-1);
155        }
156        self
157    }
158
159    // Optimize tables receiving significant insertions.
160    pub fn optimize_for_write_throughput(mut self) -> DBOptions {
161        // Increase write buffer size to 256MiB.
162        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
163            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
164            * 1024
165            * 1024;
166        self.options.set_write_buffer_size(write_buffer_size);
167        // Increase write buffers to keep to 6 before slowing down writes.
168        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
169            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
170        self.options
171            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
172        // Keep 1 write buffer so recent writes can be read from memory.
173        self.options
174            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
175
176        // Increase compaction trigger for level 0 to 6.
177        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
178            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
179        self.options.set_level_zero_file_num_compaction_trigger(
180            max_level_zero_file_num.try_into().unwrap(),
181        );
182        self.options.set_level_zero_slowdown_writes_trigger(
183            (max_level_zero_file_num * 12).try_into().unwrap(),
184        );
185        self.options
186            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());
187
188        // Increase sst file size to 128MiB.
189        self.options.set_target_file_size_base(
190            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
191                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
192                * 1024
193                * 1024,
194        );
195
196        // Increase level 1 target size to 256MiB * 6 ~ 1.5GiB.
197        self.options
198            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
199
200        self
201    }
202
203    // Optimize tables receiving significant insertions, without any deletions.
204    // Uses Universal compaction which is better suited for write-heavy workloads
205    // where data is never deleted (only dropped with the DB at epoch boundary).
206    pub fn optimize_for_write_throughput_no_deletion(mut self) -> DBOptions {
207        // Increase write buffer size to 256MiB.
208        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
209            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
210            * 1024
211            * 1024;
212        self.options.set_write_buffer_size(write_buffer_size);
213        // Increase write buffers to keep to 6 before slowing down writes.
214        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
215            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
216        self.options
217            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
218        // Keep 1 write buffer so recent writes can be read from memory.
219        self.options
220            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
221
222        // Switch to universal compactions.
223        self.options
224            .set_compaction_style(rocksdb::DBCompactionStyle::Universal);
225        let mut compaction_options = rocksdb::UniversalCompactOptions::default();
226        compaction_options.set_max_size_amplification_percent(10000);
227        compaction_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Similar);
228        self.options
229            .set_universal_compaction_options(&compaction_options);
230
231        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
232            .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER);
233        self.options.set_level_zero_file_num_compaction_trigger(
234            max_level_zero_file_num.try_into().unwrap(),
235        );
236        self.options.set_level_zero_slowdown_writes_trigger(
237            (max_level_zero_file_num * 12).try_into().unwrap(),
238        );
239        self.options
240            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());
241
242        // Increase sst file size to 128MiB.
243        self.options.set_target_file_size_base(
244            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
245                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
246                * 1024
247                * 1024,
248        );
249
250        // This should be a no-op for universal compaction but increasing it to be safe.
251        self.options
252            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
253
254        self
255    }
256
257    // Optimize tables receiving significant insertions without any deletions, using FIFO compaction.
258    // These tables are dropped with the DBs, for example the epoch and consensus DBs.
259    pub fn optimize_for_no_deletion(mut self) -> DBOptions {
260        // Increase write buffer size to 256MiB.
261        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
262            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
263            * 1024
264            * 1024;
265        self.options.set_write_buffer_size(write_buffer_size);
266        // Increase write buffers to keep to 6 before slowing down writes.
267        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
268            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
269        self.options
270            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
271        // Keep 1 write buffer so recent writes can be read from memory.
272        self.options
273            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());
274
275        // Switch to FIFO compaction.
276        self.options
277            .set_compaction_style(rocksdb::DBCompactionStyle::Fifo);
278        let mut compaction_options = rocksdb::FifoCompactOptions::default();
279        // Allow each consensus DB column family to grow unlimited, and never drop data because of size limits.
280        compaction_options.set_max_table_files_size(u64::MAX);
281        self.options
282            .set_fifo_compaction_options(&compaction_options);
283
284        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
285            .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER);
286        self.options.set_level_zero_file_num_compaction_trigger(
287            max_level_zero_file_num.try_into().unwrap(),
288        );
289        self.options.set_level_zero_slowdown_writes_trigger(
290            (max_level_zero_file_num * 12).try_into().unwrap(),
291        );
292        self.options
293            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());
294        self.options
295            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);
296
297        // Increase sst file size to 128MiB.
298        self.options.set_target_file_size_base(
299            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
300                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
301                * 1024
302                * 1024,
303        );
304
305        self
306    }
307
308    // Overrides the block options with different block cache size and block size.
309    pub fn set_block_options(
310        mut self,
311        block_cache_size_mb: usize,
312        block_size_bytes: usize,
313    ) -> DBOptions {
314        self.options
315            .set_block_based_table_factory(&get_block_options(
316                block_cache_size_mb,
317                block_size_bytes,
318            ));
319        self
320    }
321
322    // Disables write stalling and stopping based on pending compaction bytes.
323    pub fn disable_write_throttling(mut self) -> DBOptions {
324        self.options.set_soft_pending_compaction_bytes_limit(0);
325        self.options.set_hard_pending_compaction_bytes_limit(0);
326        self.options.set_level_zero_slowdown_writes_trigger(512);
327        self.options.set_level_zero_stop_writes_trigger(1024);
328        self
329    }
330
331    pub fn set_merge_operator_associative<F>(mut self, name: &str, merge_fn: F) -> DBOptions
332    where
333        F: Fn(&[u8], Option<&[u8]>, &MergeOperands) -> Option<Vec<u8>>
334            + Send
335            + Sync
336            + Clone
337            + 'static,
338    {
339        self.options.set_merge_operator_associative(name, merge_fn);
340        self
341    }
342
343    pub fn set_compaction_filter<F>(mut self, name: &str, filter_fn: F) -> DBOptions
344    where
345        F: FnMut(u32, &[u8], &[u8]) -> Decision + Send + 'static,
346    {
347        self.options.set_compaction_filter(name, filter_fn);
348        self
349    }
350}
351
352/// Creates a default RocksDB option, to be used when RocksDB option is unspecified.
353pub fn default_db_options() -> DBOptions {
354    let mut opt = rocksdb::Options::default();
355
356    // One common issue when running tests on Mac is that the default ulimit is too low,
357    // leading to I/O errors such as "Too many open files". Raising fdlimit to bypass it.
358    if let Some(limit) = fdlimit::raise_fd_limit() {
359        // on windows raise_fd_limit return None
360        opt.set_max_open_files((limit / 8) as i32);
361    }
362
363    // The table cache is locked for updates and this determines the number
364    // of shards, ie 2^10. Increase in case of lock contentions.
365    opt.set_table_cache_num_shard_bits(10);
366
367    // LSM compression settings
368    opt.set_compression_type(rocksdb::DBCompressionType::Lz4);
369    opt.set_bottommost_compression_type(rocksdb::DBCompressionType::Zstd);
370    opt.set_bottommost_zstd_max_train_bytes(1024 * 1024, true);
371
372    // Sui uses multiple RocksDB in a node, so total sizes of write buffers and WAL can be higher
373    // than the limits below.
374    //
375    // RocksDB also exposes the option to configure total write buffer size across multiple instances
376    // via `write_buffer_manager`. But the write buffer flush policy (flushing the buffer receiving
377    // the next write) may not work well. So sticking to per-db write buffer size limit for now.
378    //
379    // The environment variables are only meant to be emergency overrides. They may go away in future.
380    // It is preferable to update the default value, or override the option in code.
381    opt.set_db_write_buffer_size(
382        read_size_from_env(ENV_VAR_DB_WRITE_BUFFER_SIZE).unwrap_or(DEFAULT_DB_WRITE_BUFFER_SIZE)
383            * 1024
384            * 1024,
385    );
386    opt.set_max_total_wal_size(
387        read_size_from_env(ENV_VAR_DB_WAL_SIZE).unwrap_or(DEFAULT_DB_WAL_SIZE) as u64 * 1024 * 1024,
388    );
389
390    // Num threads for compactions and memtable flushes.
391    opt.increase_parallelism(read_size_from_env(ENV_VAR_DB_PARALLELISM).unwrap_or(8) as i32);
392
393    opt.set_enable_pipelined_write(true);
394
395    // Increase block size to 16KiB.
396    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
397    opt.set_block_based_table_factory(&get_block_options(128, 16 << 10));
398
399    // Set memtable bloomfilter.
400    opt.set_memtable_prefix_bloom_ratio(0.02);
401
402    DBOptions {
403        options: opt,
404        rw_options: ReadWriteOptions::default(),
405    }
406}
407
408fn get_block_options(block_cache_size_mb: usize, block_size_bytes: usize) -> BlockBasedOptions {
409    // Set options mostly similar to those used in optimize_for_point_lookup(),
410    // except non-default binary and hash index, to hopefully reduce lookup latencies
411    // without causing any regression for scanning, with slightly more memory usages.
412    // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
413    let mut block_options = BlockBasedOptions::default();
414    // Overrides block size.
415    block_options.set_block_size(block_size_bytes);
416    // Configure a block cache.
417    block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20));
418    // Set a bloomfilter with 1% false positive rate.
419    block_options.set_bloom_filter(10.0, false);
420    // From https://github.com/EighteenZi/rocksdb_wiki/blob/master/Block-Cache.md#caching-index-and-filter-blocks
421    block_options.set_pin_l0_filter_and_index_blocks_in_cache(true);
422    block_options
423}
424
425pub fn read_size_from_env(var_name: &str) -> Option<usize> {
426    env::var(var_name)
427        .ok()?
428        .parse::<usize>()
429        .tap_err(|e| {
430            warn!(
431                "Env var {} does not contain valid usize integer: {}",
432                var_name, e
433            )
434        })
435        .ok()
436}