typed_store/rocks/options.rs

// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use rocksdb::{BlockBasedOptions, Cache, MergeOperands, ReadOptions, compaction_filter::Decision};
use std::collections::BTreeMap;
use std::env;
use tap::TapFallible;
use tracing::{info, warn};

// Write buffer size per RocksDB instance can be set via the env var below.
// If the env var is not set, the default value below (in MiB) is used.
const ENV_VAR_DB_WRITE_BUFFER_SIZE: &str = "DB_WRITE_BUFFER_SIZE_MB";
const DEFAULT_DB_WRITE_BUFFER_SIZE: usize = 1024;

// Write ahead log size per RocksDB instance can be set via the env var below.
// If the env var is not set, the default value below (in MiB) is used.
const ENV_VAR_DB_WAL_SIZE: &str = "DB_WAL_SIZE_MB";
const DEFAULT_DB_WAL_SIZE: usize = 1024;

// Environment variables to control the behavior of write-throughput-optimized tables.
const ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER: &str = "L0_NUM_FILES_COMPACTION_TRIGGER";
const DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 4;
const DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER: usize = 80;
const ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB: &str = "MAX_WRITE_BUFFER_SIZE_MB";
const DEFAULT_MAX_WRITE_BUFFER_SIZE_MB: usize = 256;
const ENV_VAR_MAX_WRITE_BUFFER_NUMBER: &str = "MAX_WRITE_BUFFER_NUMBER";
const DEFAULT_MAX_WRITE_BUFFER_NUMBER: usize = 6;
const ENV_VAR_TARGET_FILE_SIZE_BASE_MB: &str = "TARGET_FILE_SIZE_BASE_MB";
const DEFAULT_TARGET_FILE_SIZE_BASE_MB: usize = 128;

// Set to any value to disable blob storage for transactions and effects.
const ENV_VAR_DISABLE_BLOB_STORAGE: &str = "DISABLE_BLOB_STORAGE";
const ENV_VAR_DB_PARALLELISM: &str = "DB_PARALLELISM";

#[derive(Clone, Debug)]
pub struct ReadWriteOptions {
    pub ignore_range_deletions: bool,
    /// When set, debug log the hash of the key and value bytes when inserting to
    /// this table.
    pub log_value_hash: bool,
    /// Whether to sync to disk on every write.
    pub sync_writes: bool,
}

impl ReadWriteOptions {
    pub fn readopts(&self) -> ReadOptions {
        let mut readopts = ReadOptions::default();
        readopts.set_ignore_range_deletions(self.ignore_range_deletions);
        readopts
    }

    pub fn set_ignore_range_deletions(mut self, ignore: bool) -> Self {
        self.ignore_range_deletions = ignore;
        self
    }

    pub fn set_log_value_hash(mut self, log_value_hash: bool) -> Self {
        self.log_value_hash = log_value_hash;
        self
    }

    pub fn set_sync_writes(mut self, sync_writes: bool) -> Self {
        self.sync_writes = sync_writes;
        self
    }
}

impl Default for ReadWriteOptions {
    fn default() -> Self {
        Self {
            ignore_range_deletions: true,
            log_value_hash: false,
            sync_writes: false,
        }
    }
}
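
// A minimal usage sketch (hypothetical caller code): chain the setters above
// and derive a ReadOptions from the result.
//
//     let rw = ReadWriteOptions::default()
//         .set_ignore_range_deletions(false)
//         .set_sync_writes(true);
//     let read_opts: ReadOptions = rw.readopts();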

#[derive(Default, Clone)]
pub struct DBOptions {
    pub options: rocksdb::Options,
    pub rw_options: ReadWriteOptions,
}

#[derive(Clone)]
pub struct DBMapTableConfigMap(BTreeMap<String, DBOptions>);
impl DBMapTableConfigMap {
    pub fn new(map: BTreeMap<String, DBOptions>) -> Self {
        Self(map)
    }

    pub fn to_map(&self) -> BTreeMap<String, DBOptions> {
        self.0.clone()
    }
}
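
// Hypothetical example of building a per-table config map; the table names
// ("transactions", "events") are illustrative, and default_db_options() is
// defined below in this file.
//
//     let mut tables = BTreeMap::new();
//     tables.insert(
//         "transactions".to_string(),
//         default_db_options().optimize_for_write_throughput(),
//     );
//     tables.insert(
//         "events".to_string(),
//         default_db_options().optimize_for_point_lookup(64),
//     );
//     let config = DBMapTableConfigMap::new(tables);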

impl DBOptions {
    // Optimize lookup perf for tables where no scans are performed.
    // If a non-trivial number of values can be > 512B in size, it is beneficial to also
    // specify optimize_for_large_values_no_scan().
    pub fn optimize_for_point_lookup(mut self, block_cache_size_mb: usize) -> DBOptions {
        // NOTE: this overwrites the block options.
        self.options
            .optimize_for_point_lookup(block_cache_size_mb as u64);
        self
    }
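
    // Hypothetical usage, assuming a table that is only ever read by key:
    //
    //     let opts = default_db_options().optimize_for_point_lookup(64); // 64MiB block cache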

    // Optimize write and lookup perf for tables which are rarely scanned, and have large values.
    // https://rocksdb.org/blog/2021/05/26/integrated-blob-db.html
    pub fn optimize_for_large_values_no_scan(mut self, min_blob_size: u64) -> DBOptions {
        if env::var(ENV_VAR_DISABLE_BLOB_STORAGE).is_ok() {
            info!("Large value blob storage optimization is disabled via env var.");
            return self;
        }

        // Blob settings.
        self.options.set_enable_blob_files(true);
        self.options
            .set_blob_compression_type(rocksdb::DBCompressionType::Lz4);
        self.options.set_enable_blob_gc(true);
        // Since each blob has non-trivial size overhead, and compression does not work across
        // blobs, set a min blob size in bytes so that small transactions and effects are kept
        // in sst files.
        self.options.set_min_blob_size(min_blob_size);

        // Increase write buffer size to 256MiB.
        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
            * 1024
            * 1024;
        self.options.set_write_buffer_size(write_buffer_size);
        // Since large blobs are not in sst files, reduce the target file size and base level
        // target size.
        let target_file_size_base = 64 << 20;
        self.options
            .set_target_file_size_base(target_file_size_base);
        // Level 1 defaults to 64MiB * 4 ~ 256MiB.
        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
        self.options
            .set_max_bytes_for_level_base(target_file_size_base * max_level_zero_file_num as u64);

        self
    }
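
    // Hypothetical usage; the 1KiB threshold here is illustrative. Values smaller
    // than min_blob_size stay inline in sst files.
    //
    //     let opts = default_db_options().optimize_for_large_values_no_scan(1 << 10);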

    // Optimize tables with a mix of lookup and scan workloads.
    pub fn optimize_for_read(mut self, block_cache_size_mb: usize) -> DBOptions {
        self.options
            .set_block_based_table_factory(&get_block_options(block_cache_size_mb, 16 << 10));
        self
    }

    // Optimize DB receiving significant insertions.
    pub fn optimize_db_for_write_throughput(mut self, db_max_write_buffer_gb: u64) -> DBOptions {
        self.options
            .set_db_write_buffer_size(db_max_write_buffer_gb as usize * 1024 * 1024 * 1024);
        self.options
            .set_max_total_wal_size(db_max_write_buffer_gb * 1024 * 1024 * 1024);
        self
    }
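
    // Hypothetical usage: cap the whole DB at 4GiB of memtables and 4GiB of WAL.
    //
    //     let opts = default_db_options().optimize_db_for_write_throughput(4);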

    // Optimize tables receiving significant insertions.
    pub fn optimize_for_write_throughput(mut self) -> DBOptions {
        // Increase write buffer size to 256MiB.
        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
            * 1024
            * 1024;
        self.options.set_write_buffer_size(write_buffer_size);
        // Keep up to 6 write buffers before slowing down writes.
        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
        self.options
            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
        // Keep 1 write buffer so recent writes can be read from memory.
        self.options
            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());

        // Set the level 0 file number compaction trigger (4 files by default).
        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
            .unwrap_or(DEFAULT_L0_NUM_FILES_COMPACTION_TRIGGER);
        self.options.set_level_zero_file_num_compaction_trigger(
            max_level_zero_file_num.try_into().unwrap(),
        );
        self.options.set_level_zero_slowdown_writes_trigger(
            (max_level_zero_file_num * 12).try_into().unwrap(),
        );
        self.options
            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());

        // Increase sst file size to 128MiB.
        self.options.set_target_file_size_base(
            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
                * 1024
                * 1024,
        );

        // Increase level 1 target size to 256MiB * 4 ~ 1GiB.
        self.options
            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);

        self
    }
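
    // With no env overrides, optimize_for_write_throughput() above works out to:
    // 256MiB memtables, up to 6 of them, an L0 compaction trigger of 4 files,
    // write slowdown at 4 * 12 = 48 files, write stop at 4 * 16 = 64 files, and
    // a level 1 target of 256MiB * 4 = 1GiB.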

    // Optimize tables receiving significant insertions, without any deletions.
    // TODO: merge this function with optimize_for_write_throughput(), and use a flag to
    // indicate whether deletions are received.
    pub fn optimize_for_write_throughput_no_deletion(mut self) -> DBOptions {
        // Increase write buffer size to 256MiB.
        let write_buffer_size = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_SIZE_MB)
            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_SIZE_MB)
            * 1024
            * 1024;
        self.options.set_write_buffer_size(write_buffer_size);
        // Keep up to 6 write buffers before slowing down writes.
        let max_write_buffer_number = read_size_from_env(ENV_VAR_MAX_WRITE_BUFFER_NUMBER)
            .unwrap_or(DEFAULT_MAX_WRITE_BUFFER_NUMBER);
        self.options
            .set_max_write_buffer_number(max_write_buffer_number.try_into().unwrap());
        // Keep 1 write buffer so recent writes can be read from memory.
        self.options
            .set_max_write_buffer_size_to_maintain((write_buffer_size).try_into().unwrap());

        // Switch to universal compaction.
        self.options
            .set_compaction_style(rocksdb::DBCompactionStyle::Universal);
        let mut compaction_options = rocksdb::UniversalCompactOptions::default();
        compaction_options.set_max_size_amplification_percent(10000);
        compaction_options.set_stop_style(rocksdb::UniversalCompactionStopStyle::Similar);
        self.options
            .set_universal_compaction_options(&compaction_options);

        let max_level_zero_file_num = read_size_from_env(ENV_VAR_L0_NUM_FILES_COMPACTION_TRIGGER)
            .unwrap_or(DEFAULT_UNIVERSAL_COMPACTION_L0_NUM_FILES_COMPACTION_TRIGGER);
        self.options.set_level_zero_file_num_compaction_trigger(
            max_level_zero_file_num.try_into().unwrap(),
        );
        self.options.set_level_zero_slowdown_writes_trigger(
            (max_level_zero_file_num * 12).try_into().unwrap(),
        );
        self.options
            .set_level_zero_stop_writes_trigger((max_level_zero_file_num * 16).try_into().unwrap());

        // Increase sst file size to 128MiB.
        self.options.set_target_file_size_base(
            read_size_from_env(ENV_VAR_TARGET_FILE_SIZE_BASE_MB)
                .unwrap_or(DEFAULT_TARGET_FILE_SIZE_BASE_MB) as u64
                * 1024
                * 1024,
        );

        // This should be a no-op for universal compaction, but increase it to be safe.
        self.options
            .set_max_bytes_for_level_base((write_buffer_size * max_level_zero_file_num) as u64);

        self
    }
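
    // Note: the 10000% max size amplification set above effectively disables
    // space-amplification-triggered compactions, which is a reasonable trade-off
    // when a table never sees deletions. Hypothetical usage:
    //
    //     let opts = default_db_options().optimize_for_write_throughput_no_deletion();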

    // Overrides the block options with different block cache size and block size.
    pub fn set_block_options(
        mut self,
        block_cache_size_mb: usize,
        block_size_bytes: usize,
    ) -> DBOptions {
        self.options
            .set_block_based_table_factory(&get_block_options(
                block_cache_size_mb,
                block_size_bytes,
            ));
        self
    }

    // Disables write stalling and stopping based on pending compaction bytes,
    // and raises the level 0 slowdown/stop triggers.
    pub fn disable_write_throttling(mut self) -> DBOptions {
        self.options.set_soft_pending_compaction_bytes_limit(0);
        self.options.set_hard_pending_compaction_bytes_limit(0);
        self.options.set_level_zero_slowdown_writes_trigger(512);
        self.options.set_level_zero_stop_writes_trigger(1024);
        self
    }

    pub fn set_sync_writes(mut self, sync_writes: bool) -> DBOptions {
        self.rw_options.sync_writes = sync_writes;
        self
    }

    pub fn set_merge_operator_associative<F>(mut self, name: &str, merge_fn: F) -> DBOptions
    where
        F: Fn(&[u8], Option<&[u8]>, &MergeOperands) -> Option<Vec<u8>>
            + Send
            + Sync
            + Clone
            + 'static,
    {
        self.options.set_merge_operator_associative(name, merge_fn);
        self
    }
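
    // A sketch of an associative merge operator (hypothetical): a little-endian
    // u64 counter where each operand is an increment. Assumes every stored value
    // and operand is exactly 8 bytes.
    //
    //     fn add_u64(_key: &[u8], existing: Option<&[u8]>, operands: &MergeOperands) -> Option<Vec<u8>> {
    //         let mut sum = existing.map_or(0u64, |v| u64::from_le_bytes(v.try_into().unwrap()));
    //         for op in operands.iter() {
    //             sum += u64::from_le_bytes(op.try_into().unwrap());
    //         }
    //         Some(sum.to_le_bytes().to_vec())
    //     }
    //
    //     let opts = default_db_options().set_merge_operator_associative("add_u64", add_u64);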

    pub fn set_compaction_filter<F>(mut self, name: &str, filter_fn: F) -> DBOptions
    where
        F: FnMut(u32, &[u8], &[u8]) -> Decision + Send + 'static,
    {
        self.options.set_compaction_filter(name, filter_fn);
        self
    }
}
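
// A sketch of a compaction filter (hypothetical): drop entries under an
// illustrative "expired/" key prefix during compaction and keep everything else.
//
//     let opts = default_db_options().set_compaction_filter("drop_expired", |_level, key, _value| {
//         if key.starts_with(b"expired/") {
//             Decision::Remove
//         } else {
//             Decision::Keep
//         }
//     });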

/// Creates a default RocksDB option, to be used when a RocksDB option is unspecified.
pub fn default_db_options() -> DBOptions {
    let mut opt = rocksdb::Options::default();

    // One common issue when running tests on Mac is that the default ulimit is too low,
    // leading to I/O errors such as "Too many open files". Raise the fd limit to work
    // around it.
    if let Some(limit) = fdlimit::raise_fd_limit() {
        // On Windows, raise_fd_limit() returns None.
        opt.set_max_open_files((limit / 8) as i32);
    }

    // The table cache is locked for updates, and this determines the number of
    // shards, i.e. 2^10 = 1024. Increase in case of lock contention.
    opt.set_table_cache_num_shard_bits(10);

    // LSM compression settings
    opt.set_compression_type(rocksdb::DBCompressionType::Lz4);
    opt.set_bottommost_compression_type(rocksdb::DBCompressionType::Zstd);
    opt.set_bottommost_zstd_max_train_bytes(1024 * 1024, true);

    // Sui uses multiple RocksDB instances in a node, so the total sizes of write buffers
    // and WALs can be higher than the limits below.
    //
    // RocksDB also exposes an option to configure the total write buffer size across multiple
    // instances via `write_buffer_manager`. But the write buffer flush policy (flushing the
    // buffer receiving the next write) may not work well, so we stick to a per-db write
    // buffer size limit for now.
    //
    // The environment variables are only meant to be emergency overrides. They may go away
    // in the future. It is preferable to update the default value, or override the option
    // in code.
    opt.set_db_write_buffer_size(
        read_size_from_env(ENV_VAR_DB_WRITE_BUFFER_SIZE).unwrap_or(DEFAULT_DB_WRITE_BUFFER_SIZE)
            * 1024
            * 1024,
    );
    opt.set_max_total_wal_size(
        read_size_from_env(ENV_VAR_DB_WAL_SIZE).unwrap_or(DEFAULT_DB_WAL_SIZE) as u64 * 1024 * 1024,
    );

    // Num threads for compactions and memtable flushes.
    opt.increase_parallelism(read_size_from_env(ENV_VAR_DB_PARALLELISM).unwrap_or(8) as i32);

    opt.set_enable_pipelined_write(true);

    // Increase block size to 16KiB.
    // https://github.com/EighteenZi/rocksdb_wiki/blob/master/Memory-usage-in-RocksDB.md#indexes-and-filter-blocks
    opt.set_block_based_table_factory(&get_block_options(128, 16 << 10));

    // Set memtable bloomfilter.
    opt.set_memtable_prefix_bloom_ratio(0.02);

    DBOptions {
        options: opt,
        rw_options: ReadWriteOptions::default(),
    }
}
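
// Hypothetical usage: start from the defaults and layer table-specific tuning
// on top.
//
//     let opts = default_db_options()
//         .optimize_for_write_throughput()
//         .set_block_options(128, 16 << 10); // 128MiB block cache, 16KiB blocks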

fn get_block_options(block_cache_size_mb: usize, block_size_bytes: usize) -> BlockBasedOptions {
    // Set options mostly similar to those used in optimize_for_point_lookup(),
    // except a non-default binary and hash index, to hopefully reduce lookup latencies
    // without causing any regression for scanning, at slightly higher memory usage.
    // https://github.com/facebook/rocksdb/blob/11cb6af6e5009c51794641905ca40ce5beec7fee/options/options.cc#L611-L621
    let mut block_options = BlockBasedOptions::default();
    // Overrides block size.
    block_options.set_block_size(block_size_bytes);
    // Configure a block cache.
    block_options.set_block_cache(&Cache::new_lru_cache(block_cache_size_mb << 20));
    // Set a bloomfilter with ~1% false positive rate.
    block_options.set_bloom_filter(10.0, false);
    // From https://github.com/EighteenZi/rocksdb_wiki/blob/master/Block-Cache.md#caching-index-and-filter-blocks
    block_options.set_pin_l0_filter_and_index_blocks_in_cache(true);
    block_options
}

pub fn read_size_from_env(var_name: &str) -> Option<usize> {
    env::var(var_name)
        .ok()?
        .parse::<usize>()
        .tap_err(|e| {
            warn!(
                "Env var {} does not contain valid usize integer: {}",
                var_name, e
            )
        })
        .ok()
}
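
// Hypothetical usage of read_size_from_env() (the variable name is illustrative):
//
//     env::set_var("MY_BUFFER_MB", "512");
//     assert_eq!(read_size_from_env("MY_BUFFER_MB"), Some(512));
//     assert_eq!(read_size_from_env("NOT_SET"), None);
//     // A non-numeric value logs a warning and yields None.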