telemetry_subscribers/
span_latency_prom.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! This is a module that records Tokio-tracing [span](https://docs.rs/tracing/latest/tracing/span/index.html)
5//! latencies into Prometheus histograms directly.
6//! The name of the Prometheus histogram is "tracing_span_latencies[_sum/count/bucket]"
7//!
8//! There is also the tracing-timing crate, from which this differs significantly:
9//! - tracing-timing records latencies between events (logs).  We just want to record the latencies of spans.
10//! - tracing-timing does not output to Prometheus, and extracting data from its histograms takes extra CPU
11//! - tracing-timing records latencies using HDRHistogram, which is great, but uses extra memory when one
12//!   is already using Prometheus
13//!
14//! Thus this is a much smaller and more focused module.
15//!
16//! ## Making spans visible
17//! This module can only record latencies for spans that get created.  By default, this is controlled by
18//! env_filter and logging levels.
19
20use std::time::Instant;
21
22use prometheus::{Registry, exponential_buckets, register_histogram_vec_with_registry};
23use tracing::{Subscriber, span};
24
25/// A tokio_tracing Layer that records span latencies into Prometheus histograms
26pub struct PrometheusSpanLatencyLayer {
27    span_latencies: prometheus::HistogramVec,
28}
29
30#[derive(Debug)]
31pub enum PrometheusSpanError {
32    /// num_buckets must be positive >= 1
33    ZeroOrNegativeNumBuckets,
34    PromError(prometheus::Error),
35}
36
37impl From<prometheus::Error> for PrometheusSpanError {
38    fn from(err: prometheus::Error) -> Self {
39        Self::PromError(err)
40    }
41}
42
43const TOP_LATENCY_IN_NS: f64 = 300.0 * 1.0e9;
44const LOWEST_LATENCY_IN_NS: f64 = 500.0;
45
46impl PrometheusSpanLatencyLayer {
47    /// Create a new layer, injecting latencies into the given registry.
48    /// The num_buckets controls how many buckets thus how much memory and time series one
49    /// uses up in Prometheus (and in the application).  10 is probably a minimum.
50    pub fn try_new(registry: &Registry, num_buckets: usize) -> Result<Self, PrometheusSpanError> {
51        if num_buckets < 1 {
52            return Err(PrometheusSpanError::ZeroOrNegativeNumBuckets);
53        }
54
55        // Histogram for span latencies must accommodate a wide range of possible latencies, so
56        // don't use the default Prometheus buckets.  Latencies in NS.  Calculate the multiplier
57        // to go from LOWEST to TOP in num_bucket steps, step n+1 = step n * factor.
58        let factor = (TOP_LATENCY_IN_NS / LOWEST_LATENCY_IN_NS).powf(1.0 / (num_buckets as f64));
59        let buckets = exponential_buckets(LOWEST_LATENCY_IN_NS, factor, num_buckets)?;
60        let span_latencies = register_histogram_vec_with_registry!(
61            "tracing_span_latencies",
62            "Latencies from tokio-tracing spans",
63            &["span_name"],
64            buckets,
65            registry
66        )?;
67        Ok(Self { span_latencies })
68    }
69}
70
71struct PromSpanTimestamp(Instant);
72
73impl<S> tracing_subscriber::Layer<S> for PrometheusSpanLatencyLayer
74where
75    S: Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
76{
77    fn on_new_span(
78        &self,
79        _attrs: &span::Attributes,
80        id: &span::Id,
81        ctx: tracing_subscriber::layer::Context<S>,
82    ) {
83        let span = ctx.span(id).unwrap();
84        // NOTE: there are other extensions that insert timings.  For example,
85        // tracing_subscriber's with_span_events() inserts events at open and close that contain timings.
86        // However, we cannot be guaranteed that those events would be turned on.
87        span.extensions_mut()
88            .insert(PromSpanTimestamp(Instant::now()));
89    }
90
91    fn on_close(&self, id: span::Id, ctx: tracing_subscriber::layer::Context<'_, S>) {
92        let span = ctx.span(&id).unwrap();
93        let start_time = span
94            .extensions()
95            .get::<PromSpanTimestamp>()
96            .expect("Could not find saved timestamp on span")
97            .0;
98        let elapsed_ns = start_time.elapsed().as_nanos() as u64;
99        self.span_latencies
100            .with_label_values(&[span.name()])
101            .observe(elapsed_ns as f64);
102    }
103}
104
105#[cfg(test)]
106mod tests {
107    use super::*;
108
109    #[test]
110    fn test_prom_span_latency_init() {
111        let registry = prometheus::Registry::new();
112
113        let res = PrometheusSpanLatencyLayer::try_new(&registry, 0);
114        assert!(matches!(
115            res,
116            Err(PrometheusSpanError::ZeroOrNegativeNumBuckets)
117        ));
118    }
119}