sui_node/
metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use mysten_network::metrics::MetricsCallbackProvider;
5use prometheus::{
6    HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, Registry,
7    register_histogram_vec_with_registry, register_int_counter_vec_with_registry,
8    register_int_gauge_vec_with_registry, register_int_gauge_with_registry,
9};
10
11use std::time::Duration;
12use sui_network::tonic::Code;
13
14pub struct SuiNodeMetrics {
15    pub jwk_requests: IntCounterVec,
16    pub jwk_request_errors: IntCounterVec,
17
18    pub total_jwks: IntCounterVec,
19    pub invalid_jwks: IntCounterVec,
20    pub unique_jwks: IntCounterVec,
21
22    pub current_protocol_version: IntGauge,
23    pub binary_max_protocol_version: IntGauge,
24    pub configured_max_protocol_version: IntGauge,
25}
26
27impl SuiNodeMetrics {
28    pub fn new(registry: &Registry) -> Self {
29        Self {
30            jwk_requests: register_int_counter_vec_with_registry!(
31                "jwk_requests",
32                "Total number of JWK requests",
33                &["provider"],
34                registry,
35            )
36            .unwrap(),
37            jwk_request_errors: register_int_counter_vec_with_registry!(
38                "jwk_request_errors",
39                "Total number of JWK request errors",
40                &["provider"],
41                registry,
42            )
43            .unwrap(),
44            total_jwks: register_int_counter_vec_with_registry!(
45                "total_jwks",
46                "Total number of JWKs",
47                &["provider"],
48                registry,
49            )
50            .unwrap(),
51            invalid_jwks: register_int_counter_vec_with_registry!(
52                "invalid_jwks",
53                "Total number of invalid JWKs",
54                &["provider"],
55                registry,
56            )
57            .unwrap(),
58            unique_jwks: register_int_counter_vec_with_registry!(
59                "unique_jwks",
60                "Total number of unique JWKs",
61                &["provider"],
62                registry,
63            )
64            .unwrap(),
65            current_protocol_version: register_int_gauge_with_registry!(
66                "sui_current_protocol_version",
67                "Current protocol version in this epoch",
68                registry,
69            )
70            .unwrap(),
71            binary_max_protocol_version: register_int_gauge_with_registry!(
72                "sui_binary_max_protocol_version",
73                "Max protocol version supported by this binary",
74                registry,
75            )
76            .unwrap(),
77            configured_max_protocol_version: register_int_gauge_with_registry!(
78                "sui_configured_max_protocol_version",
79                "Max protocol version configured in the node config",
80                registry,
81            )
82            .unwrap(),
83        }
84    }
85}
86
87#[derive(Clone)]
88pub struct GrpcMetrics {
89    inflight_grpc: IntGaugeVec,
90    grpc_requests: IntCounterVec,
91    grpc_request_latency: HistogramVec,
92}
93
/// Bucket boundaries for the `grpc_request_latency` histogram.
///
/// Values are in seconds: observations are recorded via `Duration::as_secs_f64`.
const LATENCY_SEC_BUCKETS: &[f64] = &[
    // Below one second.
    0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5,
    // One second and above.
    1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 90.0,
];
97
98impl GrpcMetrics {
99    pub fn new(registry: &Registry) -> Self {
100        Self {
101            inflight_grpc: register_int_gauge_vec_with_registry!(
102                "inflight_grpc",
103                "Total in-flight GRPC requests per route",
104                &["path"],
105                registry,
106            )
107            .unwrap(),
108            grpc_requests: register_int_counter_vec_with_registry!(
109                "grpc_requests",
110                "Total GRPC requests per route",
111                &["path", "status"],
112                registry,
113            )
114            .unwrap(),
115            grpc_request_latency: register_histogram_vec_with_registry!(
116                "grpc_request_latency",
117                "Latency of GRPC requests per route",
118                &["path"],
119                LATENCY_SEC_BUCKETS.to_vec(),
120                registry,
121            )
122            .unwrap(),
123        }
124    }
125}
126
127impl MetricsCallbackProvider for GrpcMetrics {
128    fn on_request(&self, _path: String) {}
129
130    fn on_response(&self, path: String, latency: Duration, _status: u16, grpc_status_code: Code) {
131        self.grpc_requests
132            .with_label_values(&[path.as_str(), format!("{grpc_status_code:?}").as_str()])
133            .inc();
134        self.grpc_request_latency
135            .with_label_values(&[path.as_str()])
136            .observe(latency.as_secs_f64());
137    }
138
139    fn on_start(&self, path: &str) {
140        self.inflight_grpc.with_label_values(&[path]).inc();
141    }
142
143    fn on_drop(&self, path: &str) {
144        self.inflight_grpc.with_label_values(&[path]).dec();
145    }
146}
147
#[cfg(test)]
mod tests {
    use mysten_metrics::start_prometheus_server;
    use prometheus::{IntCounter, Registry};
    use std::net::{Ipv4Addr, SocketAddr, TcpListener};

    /// Asks the OS for a currently unused loopback port.
    ///
    /// The probing listener is closed before returning so the server under test
    /// can bind the port. This avoids a fixed port, which makes the test fail
    /// whenever another process or a concurrent test run already holds it.
    /// Another process could still grab the port in the short gap before the
    /// server binds, but that is far less likely than a fixed-port collision.
    fn pick_free_port() -> u16 {
        TcpListener::bind((Ipv4Addr::LOCALHOST, 0))
            .expect("failed to bind an ephemeral loopback port")
            .local_addr()
            .expect("bound listener has no local address")
            .port()
    }

    /// Renders the text-exposition lines expected for one counter sample.
    fn counter_exposition(name: &str, help: &str, value: u64) -> String {
        format!("# HELP {name} {help}\n# TYPE {name} counter\n{name} {value}")
    }

    /// Checks that registries added to the service are served on `/metrics`,
    /// that removing a registry drops its metrics from the output, and that
    /// counter updates in a remaining registry are reflected on the next scrape.
    #[tokio::test]
    async fn test_metrics_endpoint_with_multiple_registries_add_remove() {
        let port = pick_free_port();
        let socket = SocketAddr::from((Ipv4Addr::LOCALHOST, port));

        let registry_service = start_prometheus_server(socket);

        // Yield once so the freshly started server gets a chance to run before
        // the first request.
        tokio::task::yield_now().await;

        // Now add a few registries to the service, each with one metric.
        let registry_1 = Registry::new_custom(Some("narwhal".to_string()), None).unwrap();
        let counter_1 = IntCounter::new("counter_1", "a sample counter 1").unwrap();
        registry_1.register(Box::new(counter_1)).unwrap();

        let registry_2 = Registry::new_custom(Some("sui".to_string()), None).unwrap();
        let counter_2 = IntCounter::new("counter_2", "a sample counter 2").unwrap();
        registry_2.register(Box::new(counter_2.clone())).unwrap();

        let registry_1_id = registry_service.add(registry_1);
        let _registry_2_id = registry_service.add(registry_2);

        // Both registries must be visible, with their prefixes applied.
        let result = get_metrics(port).await;
        assert!(result.contains(&counter_exposition(
            "sui_counter_2",
            "a sample counter 2",
            0
        )));
        assert!(result.contains(&counter_exposition(
            "narwhal_counter_1",
            "a sample counter 1",
            0
        )));

        // Now remove registry 1 and increase metric 2.
        assert!(registry_service.remove(registry_1_id));
        counter_2.inc();

        // Pull the metrics again.
        let result = get_metrics(port).await;

        // Registry 1 metrics should not be present anymore.
        assert!(!result.contains(&counter_exposition(
            "narwhal_counter_1",
            "a sample counter 1",
            0
        )));

        // Registry 2 metric should have increased by 1.
        assert!(result.contains(&counter_exposition(
            "sui_counter_2",
            "a sample counter 2",
            1
        )));
    }

    /// Fetches the body of `/metrics` from the local server listening on `port`.
    async fn get_metrics(port: u16) -> String {
        let client = reqwest::Client::new();
        let response = client
            .get(format!("http://127.0.0.1:{port}/metrics"))
            .send()
            .await
            .unwrap();
        response.text().await.unwrap()
    }
}