1use std::cell::RefCell;
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7use std::time::Duration;
8
9use mysten_metrics::RegistryService;
10use once_cell::sync::OnceCell;
11use prometheus::{
12 HistogramVec, IntCounterVec, IntGaugeVec, Registry, register_histogram_vec_with_registry,
13 register_int_counter_vec_with_registry, register_int_gauge_vec_with_registry,
14};
15use rocksdb::perf::set_perf_stats;
16use rocksdb::{PerfContext, PerfMetric, PerfStatsLevel};
17use tap::TapFallible;
18use tracing::warn;
19
20thread_local! {
21 static PER_THREAD_ROCKS_PERF_CONTEXT: std::cell::RefCell<rocksdb::PerfContext> = RefCell::new(PerfContext::default());
22}
23
/// Upper bounds, in seconds, of the buckets shared by every latency histogram
/// in this file. Values are strictly increasing from 10 microseconds to 10 seconds.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    // 10 and 50 microseconds
    0.00001, 0.00005,
    // 100 to 500 microseconds
    0.0001, 0.0002, 0.0003, 0.0004, 0.0005,
    // 1 to 5 milliseconds
    0.001, 0.002, 0.003, 0.004, 0.005,
    // 10 to 500 milliseconds
    0.01, 0.025, 0.05, 0.1, 0.25, 0.5,
    // 1 to 10 seconds
    1.0, 2.5, 5.0, 10.0,
];
30
/// Configuration and shared counter used to decide which operations are sampled.
///
/// `SamplingInterval::sample` combines the three fields: a zero
/// `once_every_duration` selects purely count-based sampling, any other value
/// selects sampling driven by the periodic reset task started in
/// `SamplingInterval::new`.
#[derive(Debug, Clone)]
pub struct SamplingInterval {
    /// Period of the background task that resets `counter`; zero disables that
    /// task and switches `sample` to count-based mode.
    pub once_every_duration: Duration,
    /// Operation-count threshold. In count-based mode `sample` fires when the
    /// counter is a multiple of `after_num_ops + 1`; in time-based mode the
    /// reset task only resets the counter once it exceeds this value.
    pub after_num_ops: u64,
    /// Operation counter, incremented by every `sample` call. It lives behind
    /// an `Arc`, so clones of this struct share the same counter.
    pub counter: Arc<AtomicU64>,
}
42
43impl Default for SamplingInterval {
44 fn default() -> Self {
45 SamplingInterval::new(Duration::from_secs(60), 0)
47 }
48}
49
50impl SamplingInterval {
51 pub fn new(once_every_duration: Duration, after_num_ops: u64) -> Self {
52 let counter = Arc::new(AtomicU64::new(1));
53 if !once_every_duration.is_zero() {
54 let counter = counter.clone();
55 tokio::task::spawn(async move {
56 loop {
57 if counter.load(Ordering::SeqCst) > after_num_ops {
58 counter.store(0, Ordering::SeqCst);
59 }
60 tokio::time::sleep(once_every_duration).await;
61 }
62 });
63 }
64 SamplingInterval {
65 once_every_duration,
66 after_num_ops,
67 counter,
68 }
69 }
70 pub fn new_from_self(&self) -> SamplingInterval {
71 SamplingInterval::new(self.once_every_duration, self.after_num_ops)
72 }
73 pub fn sample(&self) -> bool {
74 if self.once_every_duration.is_zero() {
75 self.counter
76 .fetch_add(1, Ordering::Relaxed)
77 .is_multiple_of(self.after_num_ops + 1)
78 } else {
79 self.counter.fetch_add(1, Ordering::Relaxed) == 0
80 }
81 }
82}
83
/// Prometheus gauges describing the state of a RocksDB column family.
///
/// `ColumnFamilyMetrics::new` registers one gauge vector per field, each under
/// a metric name equal to the field name and with a single `cf_name` label.
/// The help text attached to each gauge in `new` describes its meaning.
#[derive(Debug)]
pub struct ColumnFamilyMetrics {
    pub rocksdb_total_sst_files_size: IntGaugeVec,
    pub rocksdb_total_blob_files_size: IntGaugeVec,
    pub rocksdb_total_num_files: IntGaugeVec,
    pub rocksdb_num_level0_files: IntGaugeVec,
    pub rocksdb_current_size_active_mem_tables: IntGaugeVec,
    pub rocksdb_size_all_mem_tables: IntGaugeVec,
    pub rocksdb_num_snapshots: IntGaugeVec,
    pub rocksdb_oldest_snapshot_time: IntGaugeVec,
    pub rocksdb_actual_delayed_write_rate: IntGaugeVec,
    pub rocksdb_is_write_stopped: IntGaugeVec,
    pub rocksdb_block_cache_capacity: IntGaugeVec,
    pub rocksdb_block_cache_usage: IntGaugeVec,
    pub rocksdb_block_cache_pinned_usage: IntGaugeVec,
    pub rocksdb_estimate_table_readers_mem: IntGaugeVec,
    pub rocksdb_num_immutable_mem_tables: IntGaugeVec,
    pub rocksdb_mem_table_flush_pending: IntGaugeVec,
    pub rocksdb_compaction_pending: IntGaugeVec,
    pub rocksdb_estimate_pending_compaction_bytes: IntGaugeVec,
    pub rocksdb_num_running_compactions: IntGaugeVec,
    pub rocksdb_num_running_flushes: IntGaugeVec,
    pub rocksdb_estimate_oldest_key_time: IntGaugeVec,
    pub rocksdb_background_errors: IntGaugeVec,
    pub rocksdb_estimated_num_keys: IntGaugeVec,
    pub rocksdb_base_level: IntGaugeVec,
}
111
112impl ColumnFamilyMetrics {
113 pub(crate) fn new(registry: &Registry) -> Self {
114 ColumnFamilyMetrics {
115 rocksdb_total_sst_files_size: register_int_gauge_vec_with_registry!(
116 "rocksdb_total_sst_files_size",
117 "The storage size occupied by the sst files in the column family",
118 &["cf_name"],
119 registry,
120 )
121 .unwrap(),
122 rocksdb_total_blob_files_size: register_int_gauge_vec_with_registry!(
123 "rocksdb_total_blob_files_size",
124 "The storage size occupied by the blob files in the column family",
125 &["cf_name"],
126 registry,
127 )
128 .unwrap(),
129 rocksdb_total_num_files: register_int_gauge_vec_with_registry!(
130 "rocksdb_total_num_files",
131 "Total number of files used in the column family",
132 &["cf_name"],
133 registry,
134 )
135 .unwrap(),
136 rocksdb_num_level0_files: register_int_gauge_vec_with_registry!(
137 "rocksdb_num_level0_files",
138 "Number of level 0 files in the column family",
139 &["cf_name"],
140 registry,
141 )
142 .unwrap(),
143 rocksdb_current_size_active_mem_tables: register_int_gauge_vec_with_registry!(
144 "rocksdb_current_size_active_mem_tables",
145 "The current approximate size of active memtable (bytes).",
146 &["cf_name"],
147 registry,
148 )
149 .unwrap(),
150 rocksdb_size_all_mem_tables: register_int_gauge_vec_with_registry!(
151 "rocksdb_size_all_mem_tables",
152 "The memory size occupied by the column family's in-memory buffer",
153 &["cf_name"],
154 registry,
155 )
156 .unwrap(),
157 rocksdb_num_snapshots: register_int_gauge_vec_with_registry!(
158 "rocksdb_num_snapshots",
159 "Number of snapshots held for the column family",
160 &["cf_name"],
161 registry,
162 )
163 .unwrap(),
164 rocksdb_oldest_snapshot_time: register_int_gauge_vec_with_registry!(
165 "rocksdb_oldest_snapshot_time",
166 "Unit timestamp of the oldest unreleased snapshot",
167 &["cf_name"],
168 registry,
169 )
170 .unwrap(),
171 rocksdb_actual_delayed_write_rate: register_int_gauge_vec_with_registry!(
172 "rocksdb_actual_delayed_write_rate",
173 "The current actual delayed write rate. 0 means no delay",
174 &["cf_name"],
175 registry,
176 )
177 .unwrap(),
178 rocksdb_is_write_stopped: register_int_gauge_vec_with_registry!(
179 "rocksdb_is_write_stopped",
180 "A flag indicating whether writes are stopped on this column family. 1 indicates writes have been stopped.",
181 &["cf_name"],
182 registry,
183 )
184 .unwrap(),
185 rocksdb_block_cache_capacity: register_int_gauge_vec_with_registry!(
186 "rocksdb_block_cache_capacity",
187 "The block cache capacity of the column family.",
188 &["cf_name"],
189 registry,
190 )
191 .unwrap(),
192 rocksdb_block_cache_usage: register_int_gauge_vec_with_registry!(
193 "rocksdb_block_cache_usage",
194 "The memory size used by the column family in the block cache.",
195 &["cf_name"],
196 registry,
197 )
198 .unwrap(),
199 rocksdb_block_cache_pinned_usage: register_int_gauge_vec_with_registry!(
200 "rocksdb_block_cache_pinned_usage",
201 "The memory size used by the column family in the block cache where entries are pinned",
202 &["cf_name"],
203 registry,
204 )
205 .unwrap(),
206 rocksdb_estimate_table_readers_mem: register_int_gauge_vec_with_registry!(
207 "rocksdb_estimate_table_readers_mem",
208 "The estimated memory size used for reading SST tables in this column
209 family such as filters and index blocks. Note that this number does not
210 include the memory used in block cache.",
211 &["cf_name"],
212 registry,
213 )
214 .unwrap(),
215 rocksdb_num_immutable_mem_tables: register_int_gauge_vec_with_registry!(
216 "rocksdb_num_immutable_mem_tables",
217 "The number of immutable memtables that have not yet been flushed.",
218 &["cf_name"],
219 registry,
220 )
221 .unwrap(),
222 rocksdb_mem_table_flush_pending: register_int_gauge_vec_with_registry!(
223 "rocksdb_mem_table_flush_pending",
224 "A 1 or 0 flag indicating whether a memtable flush is pending.
225 If this number is 1, it means a memtable is waiting for being flushed,
226 but there might be too many L0 files that prevents it from being flushed.",
227 &["cf_name"],
228 registry,
229 )
230 .unwrap(),
231 rocksdb_compaction_pending: register_int_gauge_vec_with_registry!(
232 "rocksdb_compaction_pending",
233 "A 1 or 0 flag indicating whether a compaction job is pending.
234 If this number is 1, it means some part of the column family requires
235 compaction in order to maintain shape of LSM tree, but the compaction
236 is pending because the desired compaction job is either waiting for
237 other dependent compactions to be finished or waiting for an available
238 compaction thread.",
239 &["cf_name"],
240 registry,
241 )
242 .unwrap(),
243 rocksdb_estimate_pending_compaction_bytes: register_int_gauge_vec_with_registry!(
244 "rocksdb_estimate_pending_compaction_bytes",
245 "Estimated total number of bytes compaction needs to rewrite to get all levels down
246 to under target size. Not valid for other compactions than level-based.",
247 &["cf_name"],
248 registry,
249 )
250 .unwrap(),
251 rocksdb_num_running_compactions: register_int_gauge_vec_with_registry!(
252 "rocksdb_num_running_compactions",
253 "The number of compactions that are currently running for the column family.",
254 &["cf_name"],
255 registry,
256 )
257 .unwrap(),
258 rocksdb_num_running_flushes: register_int_gauge_vec_with_registry!(
259 "rocksdb_num_running_flushes",
260 "The number of flushes that are currently running for the column family.",
261 &["cf_name"],
262 registry,
263 )
264 .unwrap(),
265 rocksdb_estimate_oldest_key_time: register_int_gauge_vec_with_registry!(
266 "rocksdb_estimate_oldest_key_time",
267 "Estimation of the oldest key timestamp in the DB. Only available
268 for FIFO compaction with compaction_options_fifo.allow_compaction = false.",
269 &["cf_name"],
270 registry,
271 )
272 .unwrap(),
273 rocksdb_estimated_num_keys: register_int_gauge_vec_with_registry!(
274 "rocksdb_estimated_num_keys",
275 "The estimated number of keys in the table",
276 &["cf_name"],
277 registry,
278 )
279 .unwrap(),
280 rocksdb_background_errors: register_int_gauge_vec_with_registry!(
281 "rocksdb_background_errors",
282 "The accumulated number of RocksDB background errors.",
283 &["cf_name"],
284 registry,
285 )
286 .unwrap(),
287 rocksdb_base_level: register_int_gauge_vec_with_registry!(
288 "rocksdb_base_level",
289 "The number of level to which L0 data will be compacted.",
290 &["cf_name"],
291 registry,
292 )
293 .unwrap(),
294 }
295 }
296}
297
/// Prometheus metrics recorded around individual RocksDB operations.
///
/// `OperationMetrics::new` registers each field. Per-operation metrics (iter,
/// get, multiget, put, delete, very slow puts) carry a `cf_name` label, while
/// the batch-commit, active-handle and very-slow-batch-write metrics carry a
/// `db_name` label.
#[derive(Debug)]
pub struct OperationMetrics {
    pub rocksdb_iter_latency_seconds: HistogramVec,
    pub rocksdb_iter_bytes: HistogramVec,
    pub rocksdb_iter_keys: HistogramVec,
    pub rocksdb_get_latency_seconds: HistogramVec,
    pub rocksdb_get_bytes: HistogramVec,
    pub rocksdb_multiget_latency_seconds: HistogramVec,
    pub rocksdb_multiget_bytes: HistogramVec,
    pub rocksdb_put_latency_seconds: HistogramVec,
    pub rocksdb_put_bytes: HistogramVec,
    pub rocksdb_batch_put_bytes: HistogramVec,
    pub rocksdb_delete_latency_seconds: HistogramVec,
    pub rocksdb_deletes: IntCounterVec,
    // Registered under the name "rocksdb_write_batch_commit_latency_seconds".
    pub rocksdb_batch_commit_latency_seconds: HistogramVec,
    pub rocksdb_batch_commit_bytes: HistogramVec,
    pub rocksdb_num_active_db_handles: IntGaugeVec,
    // Registered under the name "rocksdb_num_very_slow_batch_writes".
    pub rocksdb_very_slow_batch_writes_count: IntCounterVec,
    // NOTE(review): the field name suggests milliseconds, but the registered
    // metric name and help text carry no unit - confirm against the callers.
    pub rocksdb_very_slow_batch_writes_duration_ms: IntCounterVec,
    // Registered under the name "rocksdb_num_very_slow_puts".
    pub rocksdb_very_slow_puts_count: IntCounterVec,
    // NOTE(review): same unit caveat as the batch-write duration above.
    pub rocksdb_very_slow_puts_duration_ms: IntCounterVec,
}
320
321impl OperationMetrics {
322 pub(crate) fn new(registry: &Registry) -> Self {
323 OperationMetrics {
324 rocksdb_iter_latency_seconds: register_histogram_vec_with_registry!(
325 "rocksdb_iter_latency_seconds",
326 "Rocksdb iter latency in seconds",
327 &["cf_name"],
328 LATENCY_SEC_BUCKETS.to_vec(),
329 registry,
330 )
331 .unwrap(),
332 rocksdb_iter_bytes: register_histogram_vec_with_registry!(
333 "rocksdb_iter_bytes",
334 "Rocksdb iter size in bytes",
335 &["cf_name"],
336 prometheus::exponential_buckets(1.0, 4.0, 15)
337 .unwrap()
338 .to_vec(),
339 registry,
340 )
341 .unwrap(),
342 rocksdb_iter_keys: register_histogram_vec_with_registry!(
343 "rocksdb_iter_keys",
344 "Rocksdb iter num keys",
345 &["cf_name"],
346 registry,
347 )
348 .unwrap(),
349 rocksdb_get_latency_seconds: register_histogram_vec_with_registry!(
350 "rocksdb_get_latency_seconds",
351 "Rocksdb get latency in seconds",
352 &["cf_name"],
353 LATENCY_SEC_BUCKETS.to_vec(),
354 registry,
355 )
356 .unwrap(),
357 rocksdb_get_bytes: register_histogram_vec_with_registry!(
358 "rocksdb_get_bytes",
359 "Rocksdb get call returned data size in bytes",
360 &["cf_name"],
361 prometheus::exponential_buckets(1.0, 4.0, 15)
362 .unwrap()
363 .to_vec(),
364 registry
365 )
366 .unwrap(),
367 rocksdb_multiget_latency_seconds: register_histogram_vec_with_registry!(
368 "rocksdb_multiget_latency_seconds",
369 "Rocksdb multiget latency in seconds",
370 &["cf_name"],
371 LATENCY_SEC_BUCKETS.to_vec(),
372 registry,
373 )
374 .unwrap(),
375 rocksdb_multiget_bytes: register_histogram_vec_with_registry!(
376 "rocksdb_multiget_bytes",
377 "Rocksdb multiget call returned data size in bytes",
378 &["cf_name"],
379 prometheus::exponential_buckets(1.0, 4.0, 15)
380 .unwrap()
381 .to_vec(),
382 registry,
383 )
384 .unwrap(),
385 rocksdb_put_latency_seconds: register_histogram_vec_with_registry!(
386 "rocksdb_put_latency_seconds",
387 "Rocksdb put latency in seconds",
388 &["cf_name"],
389 LATENCY_SEC_BUCKETS.to_vec(),
390 registry,
391 )
392 .unwrap(),
393 rocksdb_put_bytes: register_histogram_vec_with_registry!(
394 "rocksdb_put_bytes",
395 "Rocksdb put call puts data size in bytes",
396 &["cf_name"],
397 prometheus::exponential_buckets(1.0, 4.0, 15)
398 .unwrap()
399 .to_vec(),
400 registry,
401 )
402 .unwrap(),
403 rocksdb_batch_put_bytes: register_histogram_vec_with_registry!(
404 "rocksdb_batch_put_bytes",
405 "Rocksdb batch put call puts data size in bytes",
406 &["cf_name"],
407 prometheus::exponential_buckets(1.0, 4.0, 15)
408 .unwrap()
409 .to_vec(),
410 registry,
411 )
412 .unwrap(),
413 rocksdb_delete_latency_seconds: register_histogram_vec_with_registry!(
414 "rocksdb_delete_latency_seconds",
415 "Rocksdb delete latency in seconds",
416 &["cf_name"],
417 LATENCY_SEC_BUCKETS.to_vec(),
418 registry,
419 )
420 .unwrap(),
421 rocksdb_deletes: register_int_counter_vec_with_registry!(
422 "rocksdb_deletes",
423 "Rocksdb delete calls",
424 &["cf_name"],
425 registry
426 )
427 .unwrap(),
428 rocksdb_batch_commit_latency_seconds: register_histogram_vec_with_registry!(
429 "rocksdb_write_batch_commit_latency_seconds",
430 "Rocksdb schema batch commit latency in seconds",
431 &["db_name"],
432 LATENCY_SEC_BUCKETS.to_vec(),
433 registry,
434 )
435 .unwrap(),
436 rocksdb_batch_commit_bytes: register_histogram_vec_with_registry!(
437 "rocksdb_batch_commit_bytes",
438 "Rocksdb schema batch commit size in bytes",
439 &["db_name"],
440 prometheus::exponential_buckets(1.0, 4.0, 15)
441 .unwrap()
442 .to_vec(),
443 registry,
444 )
445 .unwrap(),
446 rocksdb_num_active_db_handles: register_int_gauge_vec_with_registry!(
447 "rocksdb_num_active_db_handles",
448 "Number of active db handles",
449 &["db_name"],
450 registry,
451 )
452 .unwrap(),
453 rocksdb_very_slow_batch_writes_count: register_int_counter_vec_with_registry!(
454 "rocksdb_num_very_slow_batch_writes",
455 "Number of batch writes that took more than 1 second",
456 &["db_name"],
457 registry,
458 )
459 .unwrap(),
460 rocksdb_very_slow_batch_writes_duration_ms: register_int_counter_vec_with_registry!(
461 "rocksdb_very_slow_batch_writes_duration",
462 "Total duration of batch writes that took more than 1 second",
463 &["db_name"],
464 registry,
465 )
466 .unwrap(),
467 rocksdb_very_slow_puts_count: register_int_counter_vec_with_registry!(
468 "rocksdb_num_very_slow_puts",
469 "Number of puts that took more than 1 second",
470 &["cf_name"],
471 registry,
472 )
473 .unwrap(),
474 rocksdb_very_slow_puts_duration_ms: register_int_counter_vec_with_registry!(
475 "rocksdb_very_slow_puts_duration",
476 "Total duration of puts that took more than 1 second",
477 &["cf_name"],
478 registry,
479 )
480 .unwrap(),
481 }
482 }
483}
484
/// Scope marker for RocksDB perf-stat collection on the current thread.
///
/// Its `Default` impl turns perf stats on and resets the thread-local perf
/// context; its `Drop` impl turns perf stats off again.
pub struct RocksDBPerfContext;
486
487impl Default for RocksDBPerfContext {
488 fn default() -> Self {
489 set_perf_stats(PerfStatsLevel::EnableTime);
490 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context| {
491 perf_context.borrow_mut().reset();
492 });
493 RocksDBPerfContext {}
494 }
495}
496
/// Ends the perf-stat collection scope.
impl Drop for RocksDBPerfContext {
    /// Switches perf-stat collection off when the marker goes out of scope.
    fn drop(&mut self) {
        set_perf_stats(PerfStatsLevel::Disable);
    }
}
502
/// Prometheus counters fed from the thread-local RocksDB perf context after
/// read operations.
///
/// `ReadPerfContextMetrics::new` registers one counter vector per field, each
/// under a metric name equal to the field name and with a single `cf_name`
/// label. `report_metrics` adds the matching `PerfMetric` values to them.
#[derive(Debug)]
pub struct ReadPerfContextMetrics {
    pub user_key_comparison_count: IntCounterVec,
    pub block_cache_hit_count: IntCounterVec,
    pub block_read_count: IntCounterVec,
    pub block_read_byte: IntCounterVec,
    pub block_read_nanos: IntCounterVec,
    pub block_checksum_nanos: IntCounterVec,
    pub block_decompress_nanos: IntCounterVec,
    pub get_read_bytes: IntCounterVec,
    pub multiget_read_bytes: IntCounterVec,
    pub get_snapshot_nanos: IntCounterVec,
    pub get_from_memtable_nanos: IntCounterVec,
    pub get_from_memtable_count: IntCounterVec,
    pub get_post_process_nanos: IntCounterVec,
    pub get_from_output_files_nanos: IntCounterVec,
    pub db_mutex_lock_nanos: IntCounterVec,
    pub db_condition_wait_nanos: IntCounterVec,
    pub merge_operator_nanos: IntCounterVec,
    pub read_index_block_nanos: IntCounterVec,
    pub read_filter_block_nanos: IntCounterVec,
    pub new_table_block_iter_nanos: IntCounterVec,
    pub block_seek_nanos: IntCounterVec,
    pub find_table_nanos: IntCounterVec,
    pub bloom_memtable_hit_count: IntCounterVec,
    pub bloom_memtable_miss_count: IntCounterVec,
    pub bloom_sst_hit_count: IntCounterVec,
    pub bloom_sst_miss_count: IntCounterVec,
    pub key_lock_wait_time: IntCounterVec,
    pub key_lock_wait_count: IntCounterVec,
    pub internal_delete_skipped_count: IntCounterVec,
    pub internal_skipped_count: IntCounterVec,
}
536
537impl ReadPerfContextMetrics {
538 pub(crate) fn new(registry: &Registry) -> Self {
539 ReadPerfContextMetrics {
540 user_key_comparison_count: register_int_counter_vec_with_registry!(
541 "user_key_comparison_count",
542 "Helps us figure out whether too many comparisons in binary search can be a problem,
543 especially when a more expensive comparator is used. Moreover, since number of comparisons
544 is usually uniform based on the memtable size, the SST file size for Level 0 and size of other
545 levels, an significant increase of the counter can indicate unexpected LSM-tree shape.
546 You may want to check whether flush/compaction can keep up with the write speed",
547 &["cf_name"],
548 registry,
549 )
550 .unwrap(),
551 block_cache_hit_count: register_int_counter_vec_with_registry!(
552 "block_cache_hit_count",
553 "Tells us how many times we read data blocks from block cache, and block_read_count tells us how many
554 times we have to read blocks from the file system (either block cache is disabled or it is a cache miss).
555 We can evaluate the block cache efficiency by looking at the two counters over time.",
556 &["cf_name"],
557 registry,
558 )
559 .unwrap(),
560 block_read_count: register_int_counter_vec_with_registry!(
561 "block_read_count",
562 "Tells us how many times we have to read blocks from the file system (either block cache is disabled or it is a cache miss)",
563 &["cf_name"],
564 registry,
565 )
566 .unwrap(),
567 block_read_byte: register_int_counter_vec_with_registry!(
568 "block_read_byte",
569 "Tells us how many total bytes we read from the file system. It can tell us whether a slow query can be caused by reading
570 large blocks from the file system. Index and bloom filter blocks are usually large blocks. A large block can also be the result
571 of a very large key or value",
572 &["cf_name"],
573 registry,
574 )
575 .unwrap(),
576 block_read_nanos: register_int_counter_vec_with_registry!(
577 "block_read_nanos",
578 "Total nanos spent on block reads",
579 &["cf_name"],
580 registry,
581 )
582 .unwrap(),
583 block_checksum_nanos: register_int_counter_vec_with_registry!(
584 "block_checksum_nanos",
585 "Total nanos spent on verifying block checksum",
586 &["cf_name"],
587 registry,
588 )
589 .unwrap(),
590 block_decompress_nanos: register_int_counter_vec_with_registry!(
591 "block_decompress_nanos",
592 "Total nanos spent on decompressing a block",
593 &["cf_name"],
594 registry,
595 )
596 .unwrap(),
597 get_read_bytes: register_int_counter_vec_with_registry!(
598 "get_read_bytes",
599 "Total bytes for values returned by Get",
600 &["cf_name"],
601 registry,
602 )
603 .unwrap(),
604 multiget_read_bytes: register_int_counter_vec_with_registry!(
605 "multiget_read_bytes",
606 "Total bytes for values returned by MultiGet.",
607 &["cf_name"],
608 registry,
609 )
610 .unwrap(),
611 get_snapshot_nanos: register_int_counter_vec_with_registry!(
612 "get_snapshot_nanos",
613 "Time spent in getting snapshot.",
614 &["cf_name"],
615 registry,
616 )
617 .unwrap(),
618 get_from_memtable_nanos: register_int_counter_vec_with_registry!(
619 "get_from_memtable_nanos",
620 "Time spent on reading data from memtable.",
621 &["cf_name"],
622 registry,
623 )
624 .unwrap(),
625 get_from_memtable_count: register_int_counter_vec_with_registry!(
626 "get_from_memtable_count",
627 "Number of memtables queried",
628 &["cf_name"],
629 registry,
630 )
631 .unwrap(),
632 get_post_process_nanos: register_int_counter_vec_with_registry!(
633 "get_post_process_nanos",
634 "Total nanos spent after Get() finds a key",
635 &["cf_name"],
636 registry,
637 )
638 .unwrap(),
639 get_from_output_files_nanos: register_int_counter_vec_with_registry!(
640 "get_from_output_files_nanos",
641 "Total nanos reading from output files",
642 &["cf_name"],
643 registry,
644 )
645 .unwrap(),
646 db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
647 "db_mutex_lock_nanos",
648 "Time spent on acquiring db mutex",
649 &["cf_name"],
650 registry,
651 )
652 .unwrap(),
653 db_condition_wait_nanos: register_int_counter_vec_with_registry!(
654 "db_condition_wait_nanos",
655 "Time spent waiting with a condition variable created with DB Mutex.",
656 &["cf_name"],
657 registry,
658 )
659 .unwrap(),
660 merge_operator_nanos: register_int_counter_vec_with_registry!(
661 "merge_operator_nanos",
662 "Time spent on merge operator.",
663 &["cf_name"],
664 registry,
665 )
666 .unwrap(),
667 read_index_block_nanos: register_int_counter_vec_with_registry!(
668 "read_index_block_nanos",
669 "Time spent on reading index block from block cache or SST file",
670 &["cf_name"],
671 registry,
672 )
673 .unwrap(),
674 read_filter_block_nanos: register_int_counter_vec_with_registry!(
675 "read_filter_block_nanos",
676 "Time spent on reading filter block from block cache or SST file",
677 &["cf_name"],
678 registry,
679 )
680 .unwrap(),
681 new_table_block_iter_nanos: register_int_counter_vec_with_registry!(
682 "new_table_block_iter_nanos",
683 "Time spent on creating data block iterator",
684 &["cf_name"],
685 registry,
686 )
687 .unwrap(),
688 block_seek_nanos: register_int_counter_vec_with_registry!(
689 "block_seek_nanos",
690 "Time spent on seeking a key in data/index blocks",
691 &["cf_name"],
692 registry,
693 )
694 .unwrap(),
695 find_table_nanos: register_int_counter_vec_with_registry!(
696 "find_table_nanos",
697 "Time spent on finding or creating a table reader",
698 &["cf_name"],
699 registry,
700 )
701 .unwrap(),
702 bloom_memtable_hit_count: register_int_counter_vec_with_registry!(
703 "bloom_memtable_hit_count",
704 "Total number of mem table bloom hits",
705 &["cf_name"],
706 registry,
707 )
708 .unwrap(),
709 bloom_memtable_miss_count: register_int_counter_vec_with_registry!(
710 "bloom_memtable_miss_count",
711 "Total number of mem table bloom misses",
712 &["cf_name"],
713 registry,
714 )
715 .unwrap(),
716 bloom_sst_hit_count: register_int_counter_vec_with_registry!(
717 "bloom_sst_hit_count",
718 "Total number of SST table bloom hits",
719 &["cf_name"],
720 registry,
721 )
722 .unwrap(),
723 bloom_sst_miss_count: register_int_counter_vec_with_registry!(
724 "bloom_sst_miss_count",
725 "Total number of SST table bloom misses",
726 &["cf_name"],
727 registry,
728 )
729 .unwrap(),
730 key_lock_wait_time: register_int_counter_vec_with_registry!(
731 "key_lock_wait_time",
732 "Time spent waiting on key locks in transaction lock manager",
733 &["cf_name"],
734 registry,
735 )
736 .unwrap(),
737 key_lock_wait_count: register_int_counter_vec_with_registry!(
738 "key_lock_wait_count",
739 "Number of times acquiring a lock was blocked by another transaction",
740 &["cf_name"],
741 registry,
742 )
743 .unwrap(),
744 internal_delete_skipped_count: register_int_counter_vec_with_registry!(
745 "internal_delete_skipped_count",
746 "Total number of deleted keys skipped during iteration",
747 &["cf_name"],
748 registry,
749 )
750 .unwrap(),
751 internal_skipped_count: register_int_counter_vec_with_registry!(
752 "internal_skipped_count",
753 "Totall number of internal keys skipped during iteration",
754 &["cf_name"],
755 registry,
756 )
757 .unwrap(),
758 }
759 }
760
761 pub fn report_metrics(&self, cf_name: &str) {
762 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
763 set_perf_stats(PerfStatsLevel::Disable);
764 let perf_context = perf_context_cell.borrow();
765 self.user_key_comparison_count
766 .with_label_values(&[cf_name])
767 .inc_by(perf_context.metric(PerfMetric::UserKeyComparisonCount));
768 self.block_cache_hit_count
769 .with_label_values(&[cf_name])
770 .inc_by(perf_context.metric(PerfMetric::BlockCacheHitCount));
771 self.block_read_count
772 .with_label_values(&[cf_name])
773 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
774 self.block_read_byte
775 .with_label_values(&[cf_name])
776 .inc_by(perf_context.metric(PerfMetric::BlockReadByte));
777 self.block_read_nanos
778 .with_label_values(&[cf_name])
779 .inc_by(perf_context.metric(PerfMetric::BlockReadTime));
780 self.block_read_count
781 .with_label_values(&[cf_name])
782 .inc_by(perf_context.metric(PerfMetric::BlockReadCount));
783 self.block_checksum_nanos
784 .with_label_values(&[cf_name])
785 .inc_by(perf_context.metric(PerfMetric::BlockChecksumTime));
786 self.block_decompress_nanos
787 .with_label_values(&[cf_name])
788 .inc_by(perf_context.metric(PerfMetric::BlockDecompressTime));
789 self.get_read_bytes
790 .with_label_values(&[cf_name])
791 .inc_by(perf_context.metric(PerfMetric::GetReadBytes));
792 self.multiget_read_bytes
793 .with_label_values(&[cf_name])
794 .inc_by(perf_context.metric(PerfMetric::MultigetReadBytes));
795 self.get_snapshot_nanos
796 .with_label_values(&[cf_name])
797 .inc_by(perf_context.metric(PerfMetric::GetSnapshotTime));
798 self.get_from_memtable_nanos
799 .with_label_values(&[cf_name])
800 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableTime));
801 self.get_from_memtable_count
802 .with_label_values(&[cf_name])
803 .inc_by(perf_context.metric(PerfMetric::GetFromMemtableCount));
804 self.get_post_process_nanos
805 .with_label_values(&[cf_name])
806 .inc_by(perf_context.metric(PerfMetric::GetPostProcessTime));
807 self.get_from_output_files_nanos
808 .with_label_values(&[cf_name])
809 .inc_by(perf_context.metric(PerfMetric::GetFromOutputFilesTime));
810 self.db_mutex_lock_nanos
811 .with_label_values(&[cf_name])
812 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
813 self.db_condition_wait_nanos
814 .with_label_values(&[cf_name])
815 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
816 self.merge_operator_nanos
817 .with_label_values(&[cf_name])
818 .inc_by(perf_context.metric(PerfMetric::MergeOperatorTimeNanos));
819 self.read_index_block_nanos
820 .with_label_values(&[cf_name])
821 .inc_by(perf_context.metric(PerfMetric::ReadIndexBlockNanos));
822 self.read_filter_block_nanos
823 .with_label_values(&[cf_name])
824 .inc_by(perf_context.metric(PerfMetric::ReadFilterBlockNanos));
825 self.new_table_block_iter_nanos
826 .with_label_values(&[cf_name])
827 .inc_by(perf_context.metric(PerfMetric::NewTableBlockIterNanos));
828 self.block_seek_nanos
829 .with_label_values(&[cf_name])
830 .inc_by(perf_context.metric(PerfMetric::BlockSeekNanos));
831 self.find_table_nanos
832 .with_label_values(&[cf_name])
833 .inc_by(perf_context.metric(PerfMetric::FindTableNanos));
834 self.bloom_memtable_hit_count
835 .with_label_values(&[cf_name])
836 .inc_by(perf_context.metric(PerfMetric::BloomMemtableHitCount));
837 self.bloom_memtable_miss_count
838 .with_label_values(&[cf_name])
839 .inc_by(perf_context.metric(PerfMetric::BloomMemtableMissCount));
840 self.bloom_sst_hit_count
841 .with_label_values(&[cf_name])
842 .inc_by(perf_context.metric(PerfMetric::BloomSstHitCount));
843 self.bloom_sst_miss_count
844 .with_label_values(&[cf_name])
845 .inc_by(perf_context.metric(PerfMetric::BloomSstMissCount));
846 self.key_lock_wait_time
847 .with_label_values(&[cf_name])
848 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
849 self.key_lock_wait_count
850 .with_label_values(&[cf_name])
851 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
852 self.internal_delete_skipped_count
853 .with_label_values(&[cf_name])
854 .inc_by(perf_context.metric(PerfMetric::InternalDeleteSkippedCount));
855 self.internal_skipped_count
856 .with_label_values(&[cf_name])
857 .inc_by(perf_context.metric(PerfMetric::InternalKeySkippedCount));
858 });
859 }
860}
861
/// Prometheus counters fed from the thread-local RocksDB perf context after
/// write operations.
///
/// NOTE(review): `new` declares the label as `cf_name`, while
/// `report_metrics` fills it with its `db_name` argument - confirm which of
/// the two is intended before relying on the label name.
#[derive(Debug)]
pub struct WritePerfContextMetrics {
    pub write_wal_nanos: IntCounterVec,
    pub write_memtable_nanos: IntCounterVec,
    pub write_delay_nanos: IntCounterVec,
    pub write_pre_and_post_process_nanos: IntCounterVec,
    pub write_db_mutex_lock_nanos: IntCounterVec,
    pub write_db_condition_wait_nanos: IntCounterVec,
    // Registered under the name "write_key_lock_wait_time".
    pub write_key_lock_wait_nanos: IntCounterVec,
    pub write_key_lock_wait_count: IntCounterVec,
}
873
874impl WritePerfContextMetrics {
875 pub(crate) fn new(registry: &Registry) -> Self {
876 WritePerfContextMetrics {
877 write_wal_nanos: register_int_counter_vec_with_registry!(
878 "write_wal_nanos",
879 "Total nanos spent on writing to WAL",
880 &["cf_name"],
881 registry,
882 )
883 .unwrap(),
884 write_memtable_nanos: register_int_counter_vec_with_registry!(
885 "write_memtable_nanos",
886 "Total nanos spent on writing to memtable",
887 &["cf_name"],
888 registry,
889 )
890 .unwrap(),
891 write_delay_nanos: register_int_counter_vec_with_registry!(
892 "write_delay_nanos",
893 "Total nanos spent on delaying or throttling write",
894 &["cf_name"],
895 registry,
896 )
897 .unwrap(),
898 write_pre_and_post_process_nanos: register_int_counter_vec_with_registry!(
899 "write_pre_and_post_process_nanos",
900 "Total nanos spent on writing a record, excluding the above four things",
901 &["cf_name"],
902 registry,
903 )
904 .unwrap(),
905 write_db_mutex_lock_nanos: register_int_counter_vec_with_registry!(
906 "write_db_mutex_lock_nanos",
907 "Time spent on acquiring db mutex",
908 &["cf_name"],
909 registry,
910 )
911 .unwrap(),
912 write_db_condition_wait_nanos: register_int_counter_vec_with_registry!(
913 "write_db_condition_wait_nanos",
914 "Time spent waiting with a condition variable created with DB Mutex.",
915 &["cf_name"],
916 registry,
917 )
918 .unwrap(),
919 write_key_lock_wait_nanos: register_int_counter_vec_with_registry!(
920 "write_key_lock_wait_time",
921 "Time spent waiting on key locks in transaction lock manager",
922 &["cf_name"],
923 registry,
924 )
925 .unwrap(),
926 write_key_lock_wait_count: register_int_counter_vec_with_registry!(
927 "write_key_lock_wait_count",
928 "Number of times acquiring a lock was blocked by another transaction",
929 &["cf_name"],
930 registry,
931 )
932 .unwrap(),
933 }
934 }
935 pub fn report_metrics(&self, db_name: &str) {
936 PER_THREAD_ROCKS_PERF_CONTEXT.with(|perf_context_cell| {
937 set_perf_stats(PerfStatsLevel::Disable);
938 let perf_context = perf_context_cell.borrow();
939 self.write_wal_nanos
940 .with_label_values(&[db_name])
941 .inc_by(perf_context.metric(PerfMetric::WriteWalTime));
942 self.write_memtable_nanos
943 .with_label_values(&[db_name])
944 .inc_by(perf_context.metric(PerfMetric::WriteMemtableTime));
945 self.write_delay_nanos
946 .with_label_values(&[db_name])
947 .inc_by(perf_context.metric(PerfMetric::WriteDelayTime));
948 self.write_pre_and_post_process_nanos
949 .with_label_values(&[db_name])
950 .inc_by(perf_context.metric(PerfMetric::WritePreAndPostProcessTime));
951 self.write_db_mutex_lock_nanos
952 .with_label_values(&[db_name])
953 .inc_by(perf_context.metric(PerfMetric::DbMutexLockNanos));
954 self.write_db_condition_wait_nanos
955 .with_label_values(&[db_name])
956 .inc_by(perf_context.metric(PerfMetric::DbConditionWaitNanos));
957 self.write_key_lock_wait_nanos
958 .with_label_values(&[db_name])
959 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitTime));
960 self.write_key_lock_wait_count
961 .with_label_values(&[db_name])
962 .inc_by(perf_context.metric(PerfMetric::KeyLockWaitCount));
963 });
964 }
965}
966
/// Bundle of all RocksDB metric groups together with the registry service
/// they were registered through.
#[derive(Debug)]
pub struct DBMetrics {
    pub op_metrics: OperationMetrics,
    pub cf_metrics: ColumnFamilyMetrics,
    pub read_perf_ctx_metrics: ReadPerfContextMetrics,
    pub write_perf_ctx_metrics: WritePerfContextMetrics,
    // The spelling "serivce" is part of the public field name; renaming it
    // would break code that accesses the field.
    pub registry_serivce: RegistryService,
}
975
// Process-wide `DBMetrics` instance; filled by `DBMetrics::init` or lazily by
// `DBMetrics::get`, whichever runs first.
static ONCE: OnceCell<Arc<DBMetrics>> = OnceCell::new();
977
978impl DBMetrics {
979 fn new(registry_service: RegistryService) -> Self {
980 let registry = registry_service.default_registry();
981 DBMetrics {
982 op_metrics: OperationMetrics::new(®istry),
983 cf_metrics: ColumnFamilyMetrics::new(®istry),
984 read_perf_ctx_metrics: ReadPerfContextMetrics::new(®istry),
985 write_perf_ctx_metrics: WritePerfContextMetrics::new(®istry),
986 registry_serivce: registry_service,
987 }
988 }
989
990 pub fn init(registry_service: RegistryService) {
993 let _ = ONCE
995 .set(Arc::new(DBMetrics::new(registry_service)))
996 .tap_err(|_| warn!("DBMetrics registry overwritten"));
998 }
999
1000 pub fn increment_num_active_dbs(&self, db_name: &str) {
1001 self.op_metrics
1002 .rocksdb_num_active_db_handles
1003 .with_label_values(&[db_name])
1004 .inc();
1005 }
1006
1007 pub fn decrement_num_active_dbs(&self, db_name: &str) {
1008 self.op_metrics
1009 .rocksdb_num_active_db_handles
1010 .with_label_values(&[db_name])
1011 .dec();
1012 }
1013
1014 pub fn get() -> &'static Arc<DBMetrics> {
1015 ONCE.get_or_init(|| {
1016 Arc::new(DBMetrics::new(RegistryService::new(
1017 prometheus::default_registry().clone(),
1018 )))
1019 })
1020 }
1021}