typed_store/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::cell::RefCell;
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::time::Duration;
8
9use mysten_metrics::RegistryService;
10use once_cell::sync::OnceCell;
11use prometheus::{
12    HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
13    register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
14};
15use rocksdb::perf::set_perf_stats;
16use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel};
17use tap::TapFallible;
18use tracing::warn;
19
20thread_local! {
21    static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext>  = RefCell::new(PerfContext::default());
22}
23
/// Histogram bucket upper bounds, expressed in seconds, shared by every
/// latency histogram registered in this file. The values are strictly
/// increasing and span 10 microseconds to 10 seconds.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    // 10 and 50 microseconds
    0.00001,
    0.00005,
    // 100 to 500 microseconds, in 100 microsecond steps
    0.0001,
    0.0002,
    0.0003,
    0.0004,
    0.0005,
    // 1 to 5 milliseconds, in 1 millisecond steps
    0.001,
    0.002,
    0.003,
    0.004,
    0.005,
    // 10 milliseconds up to 10 seconds
    0.01,
    0.025,
    0.05,
    0.1,
    0.25,
    0.5,
    1.0,
    2.5,
    5.0,
    10.0,
];
30
/// Decides how often an operation should be sampled, either by elapsed time
/// or by number of operations.
///
/// With a non-zero `once_every_duration` sampling is time-driven: a sample is
/// taken on the first operation after the counter has been reset to zero.
/// With a zero duration sampling is count-driven and governed by
/// `after_num_ops` alone.
///
/// Cloning shares the same `counter` between the clones, because the counter
/// lives behind an `Arc`.
#[derive(Debug, Clone)]
pub struct SamplingInterval {
    /// Time between samples; a zero duration selects count-based sampling.
    pub once_every_duration: Duration,
    /// Number of operations that must be observed between samples.
    pub after_num_ops: u64,
    /// Shared operation counter tracking progress since the previous sample.
    pub counter: Arc<AtomicU64>,
}
42
43impl Default for SamplingInterval {
44    fn default() -> Self {
45        // Enabled with 60 second interval
46        SamplingInterval::new(Duration::from_secs(60), 0)
47    }
48}
49
50impl SamplingInterval {
51    pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
52        let counter = Arc::new(AtomicU64::new(1));
53        if !once_every_duration.is_zero() {
54            let counter = counter.clone();
55            tokio::task::spawn(async move {
56                loop {
57                    if counter.load(Ordering::SeqCst) > after_num_ops {
58                        counter.store(0, Ordering::SeqCst);
59                    }
60                    tokio::time::sleep(once_every_duration).await;
61                }
62            });
63        }
64        SamplingInterval {
65            once_every_duration,
66            after_num_ops,
67            counter,
68        }
69    }
70    pub fn new_from_self(&self) -> SamplingInterval {
71        SamplingInterval::new(self.once_every_duration, self.after_num_ops)
72    }
73    pub fn sample(&self) -> bool {
74        if self.once_every_duration.is_zero() {
75            self.counter
76                .fetch_add(1, Ordering::Relaxed)
77                .is_multiple_of(self.after_num_ops + 1)
78        } else {
79            self.counter.fetch_add(1, Ordering::Relaxed) == 0
80        }
81    }
82}
83
84#[derive(Debug)]
85pub struct ColumnFamilyMetrics {
86    pub rocksdb_total_sst_files_size: IntGaugeVec,
87    pub rocksdb_total_blob_files_size: IntGaugeVec,
88    pub rocksdb_total_num_files: IntGaugeVec,
89    pub rocksdb_num_level0_files: IntGaugeVec,
90    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
91    pub rocksdb_size_all_mem_tables: IntGaugeVec,
92    pub rocksdb_num_snapshots: IntGaugeVec,
93    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
94    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
95    pub rocksdb_is_write_stopped: IntGaugeVec,
96    pub rocksdb_block_cache_capacity: IntGaugeVec,
97    pub rocksdb_block_cache_usage: IntGaugeVec,
98    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
99    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
100    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
101    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
102    pub rocksdb_compaction_pending: IntGaugeVec,
103    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
104    pub rocksdb_num_running_compactions: IntGaugeVec,
105    pub rocksdb_num_running_flushes: IntGaugeVec,
106    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
107    pub rocksdb_background_errors: IntGaugeVec,
108    pub rocksdb_estimated_num_keys: IntGaugeVec,
109    pub rocksdb_base_level: IntGaugeVec,
110}
111
112impl ColumnFamilyMetrics {
113    pub(crate) fn new(registry: &Registry) -> Self {
114        ColumnFamilyMetrics {
115            rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
116                "rocksdb_total_sst_files_size",
117                "The storage size occupied by the sst files in the column family",
118                &["cf_name"],
119                registry,
120            )
121            .unwrap(),
122            rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
123                "rocksdb_total_blob_files_size",
124                "The storage size occupied by the blob files in the column family",
125                &["cf_name"],
126                registry,
127            )
128            .unwrap(),
129            rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
130                "rocksdb_total_num_files",
131                "Total number of files used in the column family",
132                &["cf_name"],
133                registry,
134            )
135            .unwrap(),
136            rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
137                "rocksdb_num_level0_files",
138                "Number of level 0 files in the column family",
139                &["cf_name"],
140                registry,
141            )
142            .unwrap(),
143            rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
144                "rocksdb_current_size_active_mem_tables",
145                "The current approximate size of active memtable (bytes).",
146                &["cf_name"],
147                registry,
148            )
149            .unwrap(),
150            rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
151                "rocksdb_size_all_mem_tables",
152                "The memory size occupied by the column family's in-memory buffer",
153                &["cf_name"],
154                registry,
155            )
156            .unwrap(),
157            rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
158                "rocksdb_num_snapshots",
159                "Number of snapshots held for the column family",
160                &["cf_name"],
161                registry,
162            )
163            .unwrap(),
164            rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
165                "rocksdb_oldest_snapshot_time",
166                "Unit timestamp of the oldest unreleased snapshot",
167                &["cf_name"],
168                registry,
169            )
170            .unwrap(),
171            rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
172                "rocksdb_actual_delayed_write_rate",
173                "The current actual delayed write rate. 0 means no delay",
174                &["cf_name"],
175                registry,
176            )
177            .unwrap(),
178            rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
179                "rocksdb_is_write_stopped",
180                "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
181                &["cf_name"],
182                registry,
183            )
184            .unwrap(),
185            rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
186                "rocksdb_block_cache_capacity",
187                "The block cache capacity of the column family.",
188                &["cf_name"],
189                registry,
190            )
191            .unwrap(),
192            rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
193                "rocksdb_block_cache_usage",
194                "The memory size used by the column family in the block cache.",
195                &["cf_name"],
196                registry,
197            )
198            .unwrap(),
199            rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
200                "rocksdb_block_cache_pinned_usage",
201                "The memory size used by the column family in the block cache where entries are pinned",
202                &["cf_name"],
203                registry,
204            )
205            .unwrap(),
206            rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
207                "rocksdb_estimate_table_readers_mem",
208                "The estimated memory size used for reading SST tables in this column
209                family such as filters and index blocks. Note that this number does not
210                include the memory used in block cache.",
211                &["cf_name"],
212                registry,
213            )
214            .unwrap(),
215            rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
216                "rocksdb_num_immutable_mem_tables",
217                "The number of immutable memtables that have not yet been flushed.",
218                &["cf_name"],
219                registry,
220            )
221            .unwrap(),
222            rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
223                "rocksdb_mem_table_flush_pending",
224                "A 1 or 0 flag indicating whether a memtable flush is pending.
225                If this number is 1, it means a memtable is waiting for being flushed,
226                but there might be too many L0 files that prevents it from being flushed.",
227                &["cf_name"],
228                registry,
229            )
230            .unwrap(),
231            rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
232                "rocksdb_compaction_pending",
233                "A 1 or 0 flag indicating whether a compaction job is pending.
234                If this number is 1, it means some part of the column family requires
235                compaction in order to maintain shape of LSM tree, but the compaction
236                is pending because the desired compaction job is either waiting for
237                other dependent compactions to be finished or waiting for an available
238                compaction thread.",
239                &["cf_name"],
240                registry,
241            )
242            .unwrap(),
243            rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
244                "rocksdb_estimate_pending_compaction_bytes",
245                "Estimated total number of bytes compaction needs to rewrite to get all levels down
246                to under target size. Not valid for other compactions than level-based.",
247                &["cf_name"],
248                registry,
249            )
250            .unwrap(),
251            rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
252                "rocksdb_num_running_compactions",
253                "The number of compactions that are currently running for the column family.",
254                &["cf_name"],
255                registry,
256            )
257            .unwrap(),
258            rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
259                "rocksdb_num_running_flushes",
260                "The number of flushes that are currently running for the column family.",
261                &["cf_name"],
262                registry,
263            )
264            .unwrap(),
265            rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
266                "rocksdb_estimate_oldest_key_time",
267                "Estimation of the oldest key timestamp in the DB. Only available
268                for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
269                &["cf_name"],
270                registry,
271            )
272            .unwrap(),
273            rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
274                "rocksdb_estimated_num_keys",
275                "The estimated number of keys in the table",
276                &["cf_name"],
277                registry,
278            )
279            .unwrap(),
280            rocksdb_background_errors: register_int_gauge_vec_with_registry!(
281                "rocksdb_background_errors",
282                "The accumulated number of RocksDB background errors.",
283                &["cf_name"],
284                registry,
285            )
286            .unwrap(),
287            rocksdb_base_level: register_int_gauge_vec_with_registry!(
288                "rocksdb_base_level",
289                "The number of level to which L0 data will be compacted.",
290                &["cf_name"],
291                registry,
292            )
293            .unwrap(),
294        }
295    }
296}
297
298#[derive(Debug)]
299pub struct OperationMetrics {
300    pub rocksdb_iter_latency_seconds: HistogramVec,
301    pub rocksdb_iter_bytes: HistogramVec,
302    pub rocksdb_iter_keys: HistogramVec,
303    pub rocksdb_get_latency_seconds: HistogramVec,
304    pub rocksdb_get_bytes: HistogramVec,
305    pub rocksdb_multiget_latency_seconds: HistogramVec,
306    pub rocksdb_multiget_bytes: HistogramVec,
307    pub rocksdb_put_latency_seconds: HistogramVec,
308    pub rocksdb_put_bytes: HistogramVec,
309    pub rocksdb_batch_put_bytes: HistogramVec,
310    pub rocksdb_delete_latency_seconds: HistogramVec,
311    pub rocksdb_deletes: IntCounterVec,
312    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
313    pub rocksdb_batch_commit_bytes: HistogramVec,
314    pub rocksdb_num_active_db_handles: IntGaugeVec,
315    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
316    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
317    pub rocksdb_very_slow_puts_count: IntCounterVec,
318    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
319}
320
321impl OperationMetrics {
322    pub(crate) fn new(registry: &Registry) -> Self {
323        OperationMetrics {
324            rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
325                "rocksdb_iter_latency_seconds",
326                "Rocksdb iter latency in seconds",
327                &["cf_name"],
328                LATENCY_SEC_BUCKETS.to_vec(),
329                registry,
330            )
331            .unwrap(),
332            rocksdb_iter_bytes: register_histogram_vec_with_registry!(
333                "rocksdb_iter_bytes",
334                "Rocksdb iter size in bytes",
335                &["cf_name"],
336                prometheus::exponential_buckets(1.0, 4.0, 15)
337                    .unwrap()
338                    .to_vec(),
339                registry,
340            )
341            .unwrap(),
342            rocksdb_iter_keys: register_histogram_vec_with_registry!(
343                "rocksdb_iter_keys",
344                "Rocksdb iter num keys",
345                &["cf_name"],
346                registry,
347            )
348            .unwrap(),
349            rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
350                "rocksdb_get_latency_seconds",
351                "Rocksdb get latency in seconds",
352                &["cf_name"],
353                LATENCY_SEC_BUCKETS.to_vec(),
354                registry,
355            )
356            .unwrap(),
357            rocksdb_get_bytes: register_histogram_vec_with_registry!(
358                "rocksdb_get_bytes",
359                "Rocksdb get call returned data size in bytes",
360                &["cf_name"],
361                prometheus::exponential_buckets(1.0, 4.0, 15)
362                    .unwrap()
363                    .to_vec(),
364                registry
365            )
366            .unwrap(),
367            rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
368                "rocksdb_multiget_latency_seconds",
369                "Rocksdb multiget latency in seconds",
370                &["cf_name"],
371                LATENCY_SEC_BUCKETS.to_vec(),
372                registry,
373            )
374            .unwrap(),
375            rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
376                "rocksdb_multiget_bytes",
377                "Rocksdb multiget call returned data size in bytes",
378                &["cf_name"],
379                prometheus::exponential_buckets(1.0, 4.0, 15)
380                    .unwrap()
381                    .to_vec(),
382                registry,
383            )
384            .unwrap(),
385            rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
386                "rocksdb_put_latency_seconds",
387                "Rocksdb put latency in seconds",
388                &["cf_name"],
389                LATENCY_SEC_BUCKETS.to_vec(),
390                registry,
391            )
392            .unwrap(),
393            rocksdb_put_bytes: register_histogram_vec_with_registry!(
394                "rocksdb_put_bytes",
395                "Rocksdb put call puts data size in bytes",
396                &["cf_name"],
397                prometheus::exponential_buckets(1.0, 4.0, 15)
398                    .unwrap()
399                    .to_vec(),
400                registry,
401            )
402            .unwrap(),
403            rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
404                "rocksdb_batch_put_bytes",
405                "Rocksdb batch put call puts data size in bytes",
406                &["cf_name"],
407                prometheus::exponential_buckets(1.0, 4.0, 15)
408                    .unwrap()
409                    .to_vec(),
410                registry,
411            )
412            .unwrap(),
413            rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
414                "rocksdb_delete_latency_seconds",
415                "Rocksdb delete latency in seconds",
416                &["cf_name"],
417                LATENCY_SEC_BUCKETS.to_vec(),
418                registry,
419            )
420            .unwrap(),
421            rocksdb_deletes: register_int_counter_vec_with_registry!(
422                "rocksdb_deletes",
423                "Rocksdb delete calls",
424                &["cf_name"],
425                registry
426            )
427            .unwrap(),
428            rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
429                "rocksdb_write_batch_commit_latency_seconds",
430                "Rocksdb schema batch commit latency in seconds",
431                &["db_name"],
432                LATENCY_SEC_BUCKETS.to_vec(),
433                registry,
434            )
435            .unwrap(),
436            rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
437                "rocksdb_batch_commit_bytes",
438                "Rocksdb schema batch commit size in bytes",
439                &["db_name"],
440                prometheus::exponential_buckets(1.0, 4.0, 15)
441                    .unwrap()
442                    .to_vec(),
443                registry,
444            )
445            .unwrap(),
446            rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
447                "rocksdb_num_active_db_handles",
448                "Number of active db handles",
449                &["db_name"],
450                registry,
451            )
452            .unwrap(),
453            rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
454                "rocksdb_num_very_slow_batch_writes",
455                "Number of batch writes that took more than 1 second",
456                &["db_name"],
457                registry,
458            )
459            .unwrap(),
460            rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
461                "rocksdb_very_slow_batch_writes_duration",
462                "Total duration of batch writes that took more than 1 second",
463                &["db_name"],
464                registry,
465            )
466            .unwrap(),
467            rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
468                "rocksdb_num_very_slow_puts",
469                "Number of puts that took more than 1 second",
470                &["cf_name"],
471                registry,
472            )
473            .unwrap(),
474            rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
475                "rocksdb_very_slow_puts_duration",
476                "Total duration of puts that took more than 1 second",
477                &["cf_name"],
478                registry,
479            )
480            .unwrap(),
481        }
482    }
483}
484
/// Zero-sized marker whose construction through `Default` switches RocksDB
/// perf stats to `EnableTime` and resets the calling thread's perf context,
/// and whose `Drop` switches perf stats back to `Disable`.
pub struct RocksDBPerfContext;
486
487impl Default for RocksDBPerfContext {
488    fn default() -> Self {
489        set_perf_stats(PerfStatsLevel::EnableTime);
490        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
491            perf_context.borrow_mut().reset();
492        });
493        RocksDBPerfContext {}
494    }
495}
496
497impl Drop for RocksDBPerfContext {
498    fn drop(&mut self) {
499        set_perf_stats(PerfStatsLevel::Disable);
500    }
501}
502
503#[derive(Debug)]
504pub struct ReadPerfContextMetrics {
505    pub user_key_comparison_count: IntCounterVec,
506    pub block_cache_hit_count: IntCounterVec,
507    pub block_read_count: IntCounterVec,
508    pub block_read_byte: IntCounterVec,
509    pub block_read_nanos: IntCounterVec,
510    pub block_checksum_nanos: IntCounterVec,
511    pub block_decompress_nanos: IntCounterVec,
512    pub get_read_bytes: IntCounterVec,
513    pub multiget_read_bytes: IntCounterVec,
514    pub get_snapshot_nanos: IntCounterVec,
515    pub get_from_memtable_nanos: IntCounterVec,
516    pub get_from_memtable_count: IntCounterVec,
517    pub get_post_process_nanos: IntCounterVec,
518    pub get_from_output_files_nanos: IntCounterVec,
519    pub db_mutex_lock_nanos: IntCounterVec,
520    pub db_condition_wait_nanos: IntCounterVec,
521    pub merge_operator_nanos: IntCounterVec,
522    pub read_index_block_nanos: IntCounterVec,
523    pub read_filter_block_nanos: IntCounterVec,
524    pub new_table_block_iter_nanos: IntCounterVec,
525    pub block_seek_nanos: IntCounterVec,
526    pub find_table_nanos: IntCounterVec,
527    pub bloom_memtable_hit_count: IntCounterVec,
528    pub bloom_memtable_miss_count: IntCounterVec,
529    pub bloom_sst_hit_count: IntCounterVec,
530    pub bloom_sst_miss_count: IntCounterVec,
531    pub key_lock_wait_time: IntCounterVec,
532    pub key_lock_wait_count: IntCounterVec,
533    pub internal_delete_skipped_count: IntCounterVec,
534    pub internal_skipped_count: IntCounterVec,
535}
536
537impl ReadPerfContextMetrics {
538    pub(crate) fn new(registry: &Registry) -> Self {
539        ReadPerfContextMetrics {
540            user_key_comparison_count: register_int_counter_vec_with_registry!(
541                "user_key_comparison_count",
542                "Helps us figure out whether too many comparisons in binary search can be a problem,
543                especially when a more expensive comparator is used. Moreover, since number of comparisons
544                is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
545                levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
546                You may want to check whether flush/compaction can keep up with the write speed",
547                &["cf_name"],
548                registry,
549            )
550            .unwrap(),
551            block_cache_hit_count: register_int_counter_vec_with_registry!(
552                "block_cache_hit_count",
553                "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
554                times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
555                We can evaluate the block cache efficiency by looking at the two counters over time.",
556                &["cf_name"],
557                registry,
558            )
559            .unwrap(),
560            block_read_count: register_int_counter_vec_with_registry!(
561                "block_read_count",
562                "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
563                &["cf_name"],
564                registry,
565            )
566            .unwrap(),
567            block_read_byte: register_int_counter_vec_with_registry!(
568                "block_read_byte",
569                "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
570                large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
571                of a very large key or value",
572                &["cf_name"],
573                registry,
574            )
575            .unwrap(),
576            block_read_nanos: register_int_counter_vec_with_registry!(
577                "block_read_nanos",
578                "Total nanos spent on block reads",
579                &["cf_name"],
580                registry,
581            )
582            .unwrap(),
583            block_checksum_nanos: register_int_counter_vec_with_registry!(
584                "block_checksum_nanos",
585                "Total nanos spent on verifying block checksum",
586                &["cf_name"],
587                registry,
588            )
589            .unwrap(),
590            block_decompress_nanos: register_int_counter_vec_with_registry!(
591                "block_decompress_nanos",
592                "Total nanos spent on decompressing a block",
593                &["cf_name"],
594                registry,
595            )
596            .unwrap(),
597            get_read_bytes: register_int_counter_vec_with_registry!(
598                "get_read_bytes",
599                "Total bytes for values returned by Get",
600                &["cf_name"],
601                registry,
602            )
603            .unwrap(),
604            multiget_read_bytes: register_int_counter_vec_with_registry!(
605                "multiget_read_bytes",
606                "Total bytes for values returned by MultiGet.",
607                &["cf_name"],
608                registry,
609            )
610            .unwrap(),
611            get_snapshot_nanos: register_int_counter_vec_with_registry!(
612                "get_snapshot_nanos",
613                "Time spent in getting snapshot.",
614                &["cf_name"],
615                registry,
616            )
617            .unwrap(),
618            get_from_memtable_nanos: register_int_counter_vec_with_registry!(
619                "get_from_memtable_nanos",
620                "Time spent on reading data from memtable.",
621                &["cf_name"],
622                registry,
623            )
624            .unwrap(),
625            get_from_memtable_count: register_int_counter_vec_with_registry!(
626                "get_from_memtable_count",
627                "Number of memtables queried",
628                &["cf_name"],
629                registry,
630            )
631            .unwrap(),
632            get_post_process_nanos: register_int_counter_vec_with_registry!(
633                "get_post_process_nanos",
634                "Total nanos spent after Get() finds a key",
635                &["cf_name"],
636                registry,
637            )
638            .unwrap(),
639            get_from_output_files_nanos: register_int_counter_vec_with_registry!(
640                "get_from_output_files_nanos",
641                "Total nanos reading from output files",
642                &["cf_name"],
643                registry,
644            )
645            .unwrap(),
646            db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
647                "db_mutex_lock_nanos",
648                "Time spent on acquiring db mutex",
649                &["cf_name"],
650                registry,
651            )
652            .unwrap(),
653            db_condition_wait_nanos: register_int_counter_vec_with_registry!(
654                "db_condition_wait_nanos",
655                "Time spent waiting with a condition variable created with DB Mutex.",
656                &["cf_name"],
657                registry,
658            )
659            .unwrap(),
660            merge_operator_nanos: register_int_counter_vec_with_registry!(
661                "merge_operator_nanos",
662                "Time spent on merge operator.",
663                &["cf_name"],
664                registry,
665            )
666            .unwrap(),
667            read_index_block_nanos: register_int_counter_vec_with_registry!(
668                "read_index_block_nanos",
669                "Time spent on reading index block from block cache or SST file",
670                &["cf_name"],
671                registry,
672            )
673            .unwrap(),
674            read_filter_block_nanos: register_int_counter_vec_with_registry!(
675                "read_filter_block_nanos",
676                "Time spent on reading filter block from block cache or SST file",
677                &["cf_name"],
678                registry,
679            )
680            .unwrap(),
681            new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
682                "new_table_block_iter_nanos",
683                "Time spent on creating data block iterator",
684                &["cf_name"],
685                registry,
686            )
687            .unwrap(),
688            block_seek_nanos: register_int_counter_vec_with_registry!(
689                "block_seek_nanos",
690                "Time spent on seeking a key in data/index blocks",
691                &["cf_name"],
692                registry,
693            )
694            .unwrap(),
695            find_table_nanos: register_int_counter_vec_with_registry!(
696                "find_table_nanos",
697                "Time spent on finding or creating a table reader",
698                &["cf_name"],
699                registry,
700            )
701            .unwrap(),
702            bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
703                "bloom_memtable_hit_count",
704                "Total number of mem table bloom hits",
705                &["cf_name"],
706                registry,
707            )
708            .unwrap(),
709            bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
710                "bloom_memtable_miss_count",
711                "Total number of mem table bloom misses",
712                &["cf_name"],
713                registry,
714            )
715            .unwrap(),
716            bloom_sst_hit_count: register_int_counter_vec_with_registry!(
717                "bloom_sst_hit_count",
718                "Total number of SST table bloom hits",
719                &["cf_name"],
720                registry,
721            )
722            .unwrap(),
723            bloom_sst_miss_count: register_int_counter_vec_with_registry!(
724                "bloom_sst_miss_count",
725                "Total number of SST table bloom misses",
726                &["cf_name"],
727                registry,
728            )
729            .unwrap(),
730            key_lock_wait_time: register_int_counter_vec_with_registry!(
731                "key_lock_wait_time",
732                "Time spent waiting on key locks in transaction lock manager",
733                &["cf_name"],
734                registry,
735            )
736            .unwrap(),
737            key_lock_wait_count: register_int_counter_vec_with_registry!(
738                "key_lock_wait_count",
739                "Number of times acquiring a lock was blocked by another transaction",
740                &["cf_name"],
741                registry,
742            )
743            .unwrap(),
744            internal_delete_skipped_count: register_int_counter_vec_with_registry!(
745                "internal_delete_skipped_count",
746                "Total number of deleted keys skipped during iteration",
747                &["cf_name"],
748                registry,
749            )
750                .unwrap(),
751            internal_skipped_count: register_int_counter_vec_with_registry!(
752                "internal_skipped_count",
753                "Totall number of internal keys skipped during iteration",
754                &["cf_name"],
755                registry,
756            )
757                .unwrap(),
758        }
759    }
760
761    pub fn report_metrics(&self, cf_name: &str) {
762        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
763            set_perf_stats(PerfStatsLevel::Disable);
764            let perf_context = perf_context_cell.borrow();
765            self.user_key_comparison_count
766                .with_label_values(&[cf_name])
767                .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
768            self.block_cache_hit_count
769                .with_label_values(&[cf_name])
770                .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
771            self.block_read_count
772                .with_label_values(&[cf_name])
773                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
774            self.block_read_byte
775                .with_label_values(&[cf_name])
776                .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
777            self.block_read_nanos
778                .with_label_values(&[cf_name])
779                .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
780            self.block_read_count
781                .with_label_values(&[cf_name])
782                .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
783            self.block_checksum_nanos
784                .with_label_values(&[cf_name])
785                .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
786            self.block_decompress_nanos
787                .with_label_values(&[cf_name])
788                .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
789            self.get_read_bytes
790                .with_label_values(&[cf_name])
791                .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
792            self.multiget_read_bytes
793                .with_label_values(&[cf_name])
794                .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
795            self.get_snapshot_nanos
796                .with_label_values(&[cf_name])
797                .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
798            self.get_from_memtable_nanos
799                .with_label_values(&[cf_name])
800                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
801            self.get_from_memtable_count
802                .with_label_values(&[cf_name])
803                .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
804            self.get_post_process_nanos
805                .with_label_values(&[cf_name])
806                .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
807            self.get_from_output_files_nanos
808                .with_label_values(&[cf_name])
809                .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
810            self.db_mutex_lock_nanos
811                .with_label_values(&[cf_name])
812                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
813            self.db_condition_wait_nanos
814                .with_label_values(&[cf_name])
815                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
816            self.merge_operator_nanos
817                .with_label_values(&[cf_name])
818                .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
819            self.read_index_block_nanos
820                .with_label_values(&[cf_name])
821                .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
822            self.read_filter_block_nanos
823                .with_label_values(&[cf_name])
824                .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
825            self.new_table_block_iter_nanos
826                .with_label_values(&[cf_name])
827                .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
828            self.block_seek_nanos
829                .with_label_values(&[cf_name])
830                .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
831            self.find_table_nanos
832                .with_label_values(&[cf_name])
833                .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
834            self.bloom_memtable_hit_count
835                .with_label_values(&[cf_name])
836                .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
837            self.bloom_memtable_miss_count
838                .with_label_values(&[cf_name])
839                .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
840            self.bloom_sst_hit_count
841                .with_label_values(&[cf_name])
842                .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
843            self.bloom_sst_miss_count
844                .with_label_values(&[cf_name])
845                .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
846            self.key_lock_wait_time
847                .with_label_values(&[cf_name])
848                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
849            self.key_lock_wait_count
850                .with_label_values(&[cf_name])
851                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
852            self.internal_delete_skipped_count
853                .with_label_values(&[cf_name])
854                .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
855            self.internal_skipped_count
856                .with_label_values(&[cf_name])
857                .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
858        });
859    }
860}
861
/// Prometheus counters for RocksDB write-path perf-context measurements.
///
/// Every field is an `IntCounterVec`. The counters are registered in
/// `WritePerfContextMetrics::new` and fed from the thread-local RocksDB perf
/// context in `WritePerfContextMetrics::report_metrics`.
#[derive(Debug)]
pub struct WritePerfContextMetrics {
    /// Total nanos spent on writing to WAL.
    pub write_wal_nanos: IntCounterVec,
    /// Total nanos spent on writing to memtable.
    pub write_memtable_nanos: IntCounterVec,
    /// Total nanos spent on delaying or throttling write.
    pub write_delay_nanos: IntCounterVec,
    /// Total nanos spent on writing a record, excluding the WAL, memtable and
    /// delay portions tracked by the other counters.
    pub write_pre_and_post_process_nanos: IntCounterVec,
    /// Time spent on acquiring the db mutex.
    pub write_db_mutex_lock_nanos: IntCounterVec,
    /// Time spent waiting with a condition variable created with the DB mutex.
    pub write_db_condition_wait_nanos: IntCounterVec,
    /// Time spent waiting on key locks in the transaction lock manager.
    /// Note: registered under the metric name `write_key_lock_wait_time`.
    pub write_key_lock_wait_nanos: IntCounterVec,
    /// Number of times acquiring a lock was blocked by another transaction.
    pub write_key_lock_wait_count: IntCounterVec,
}
873
874impl WritePerfContextMetrics {
875    pub(crate) fn new(registry: &Registry) -> Self {
876        WritePerfContextMetrics {
877            write_wal_nanos: register_int_counter_vec_with_registry!(
878                "write_wal_nanos",
879                "Total nanos spent on writing to WAL",
880                &["cf_name"],
881                registry,
882            )
883            .unwrap(),
884            write_memtable_nanos: register_int_counter_vec_with_registry!(
885                "write_memtable_nanos",
886                "Total nanos spent on writing to memtable",
887                &["cf_name"],
888                registry,
889            )
890            .unwrap(),
891            write_delay_nanos: register_int_counter_vec_with_registry!(
892                "write_delay_nanos",
893                "Total nanos spent on delaying or throttling write",
894                &["cf_name"],
895                registry,
896            )
897            .unwrap(),
898            write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
899                "write_pre_and_post_process_nanos",
900                "Total nanos spent on writing a record, excluding the above four things",
901                &["cf_name"],
902                registry,
903            )
904            .unwrap(),
905            write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
906                "write_db_mutex_lock_nanos",
907                "Time spent on acquiring db mutex",
908                &["cf_name"],
909                registry,
910            )
911            .unwrap(),
912            write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
913                "write_db_condition_wait_nanos",
914                "Time spent waiting with a condition variable created with DB Mutex.",
915                &["cf_name"],
916                registry,
917            )
918            .unwrap(),
919            write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
920                "write_key_lock_wait_time",
921                "Time spent waiting on key locks in transaction lock manager",
922                &["cf_name"],
923                registry,
924            )
925            .unwrap(),
926            write_key_lock_wait_count: register_int_counter_vec_with_registry!(
927                "write_key_lock_wait_count",
928                "Number of times acquiring a lock was blocked by another transaction",
929                &["cf_name"],
930                registry,
931            )
932            .unwrap(),
933        }
934    }
935    pub fn report_metrics(&self, db_name: &str) {
936        PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
937            set_perf_stats(PerfStatsLevel::Disable);
938            let perf_context = perf_context_cell.borrow();
939            self.write_wal_nanos
940                .with_label_values(&[db_name])
941                .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
942            self.write_memtable_nanos
943                .with_label_values(&[db_name])
944                .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
945            self.write_delay_nanos
946                .with_label_values(&[db_name])
947                .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
948            self.write_pre_and_post_process_nanos
949                .with_label_values(&[db_name])
950                .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
951            self.write_db_mutex_lock_nanos
952                .with_label_values(&[db_name])
953                .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
954            self.write_db_condition_wait_nanos
955                .with_label_values(&[db_name])
956                .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
957            self.write_key_lock_wait_nanos
958                .with_label_values(&[db_name])
959                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
960            self.write_key_lock_wait_count
961                .with_label_values(&[db_name])
962                .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
963        });
964    }
965}
966
/// Bundle of the metric groups built by `DBMetrics::new`, together with the
/// registry service whose default registry they were registered on.
#[derive(Debug)]
pub struct DBMetrics {
    /// Operation-level metrics; includes the `rocksdb_num_active_db_handles`
    /// gauge adjusted by `increment_num_active_dbs` / `decrement_num_active_dbs`.
    pub op_metrics: OperationMetrics,
    /// Column-family metrics group.
    pub cf_metrics: ColumnFamilyMetrics,
    /// Counters fed from the RocksDB perf context for reads.
    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
    /// Counters fed from the RocksDB perf context for writes.
    pub write_perf_ctx_metrics: WritePerfContextMetrics,
    /// Registry service passed to `DBMetrics::new`.
    /// The field name's spelling ("serivce") is part of the public interface.
    pub registry_serivce: RegistryService,
}
975
// Process-wide `DBMetrics` instance; set by `DBMetrics::init` or lazily by `DBMetrics::get`.
static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
977
978impl DBMetrics {
979    fn new(registry_service: RegistryService) -> Self {
980        let registry = registry_service.default_registry();
981        DBMetrics {
982            op_metrics: OperationMetrics::new(&registry),
983            cf_metrics: ColumnFamilyMetrics::new(&registry),
984            read_perf_ctx_metrics: ReadPerfContextMetrics::new(&registry),
985            write_perf_ctx_metrics: WritePerfContextMetrics::new(&registry),
986            registry_serivce: registry_service,
987        }
988    }
989
990    // TODO: Remove static initialization (init() and get()) by constructing DBMetrics
991    // and accessing it without static variables.
992    pub fn init(registry_service: RegistryService) {
993        // Initialize this before creating any instance of DBMap
994        let _ = ONCE
995            .set(Arc::new(DBMetrics::new(registry_service)))
996            // this happens many times during tests
997            .tap_err(|_| warn!("DBMetrics registry overwritten"));
998    }
999
1000    pub fn increment_num_active_dbs(&self, db_name: &str) {
1001        self.op_metrics
1002            .rocksdb_num_active_db_handles
1003            .with_label_values(&[db_name])
1004            .inc();
1005    }
1006
1007    pub fn decrement_num_active_dbs(&self, db_name: &str) {
1008        self.op_metrics
1009            .rocksdb_num_active_db_handles
1010            .with_label_values(&[db_name])
1011            .dec();
1012    }
1013
1014    pub fn get() -> &'static Arc<DBMetrics> {
1015        ONCE.get_or_init(|| {
1016            Arc::new(DBMetrics::new(RegistryService::new(
1017                prometheus::default_registry().clone(),
1018            )))
1019        })
1020    }
1021}