sui_core/epoch/
epoch_metrics.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use prometheus::{
5    CounterVec, IntCounter, IntCounterVec, IntGauge, Registry, register_counter_vec_with_registry,
6    register_int_counter_vec_with_registry, register_int_counter_with_registry,
7    register_int_gauge_with_registry,
8};
9use std::sync::Arc;
10
11pub struct EpochMetrics {
12    /// The current epoch ID. This is updated only when the AuthorityState finishes reconfiguration.
13    pub current_epoch: IntGauge,
14
15    /// Current voting right of the validator in the protocol. Updated at the start of epochs.
16    pub current_voting_right: IntGauge,
17
18    /// Total duration of the epoch. This is measured from when the current epoch store is opened,
19    /// until the current epoch store is replaced with the next epoch store.
20    pub epoch_total_duration: IntGauge,
21
22    /// Number of checkpoints in the epoch.
23    pub epoch_checkpoint_count: IntGauge,
24
25    /// Number of transactions in the epoch.
26    pub epoch_transaction_count: IntGauge,
27
28    /// Total amount of gas rewards (i.e. computation gas cost) in the epoch.
29    pub epoch_total_gas_reward: IntGauge,
30
31    // An active validator reconfigures through the following steps:
32    // 1. Halt validator (a.k.a. close epoch) and stop accepting user transaction certs.
33    // 2. Finishes processing all pending certificates and then send EndOfPublish message.
34    // 3. Stop accepting messages from consensus after seeing 2f+1 EndOfPublish messages.
35    // 4. Creating the last checkpoint of the epoch by augmenting it with AdvanceEpoch transaction.
36    // 5. CheckpointExecutor finishes executing the last checkpoint, and triggers reconfiguration.
37    // 6. During reconfiguration, we tear down consensus, reconfigure state (at which point we opens
38    //    up user certs), and start consensus again.
39    // 7. After reconfiguration, and eventually consensus starts successfully, at some point the first
40    //    checkpoint of the new epoch will be created.
41    // We introduce various metrics to cover the latency of above steps.
42    /// The duration from when the epoch is closed (i.e. validator halted) to when all pending
43    /// certificates are processed (i.e. ready to send EndOfPublish message).
44    /// This is the duration of (1) through (2) above.
45    pub epoch_pending_certs_processed_time_since_epoch_close_ms: IntGauge,
46
47    /// The interval from when the epoch is closed to when we receive 2f+1 EndOfPublish messages.
48    /// This is the duration of (1) through (3) above.
49    pub epoch_end_of_publish_quorum_time_since_epoch_close_ms: IntGauge,
50
51    /// The interval from when the epoch is closed to when we created the last checkpoint of the
52    /// epoch.
53    /// This is the duration of (1) through (4) above.
54    pub epoch_last_checkpoint_created_time_since_epoch_close_ms: IntGauge,
55
56    /// The interval from when the epoch is closed to when we finished executing the last transaction
57    /// of the checkpoint (and hence triggering reconfiguration process).
58    /// This is the duration of (1) through (5) above.
59    pub epoch_reconfig_start_time_since_epoch_close_ms: IntGauge,
60
61    /// The total duration when this validator is halted, and hence does not accept certs from users.
62    /// This is the duration of (1) through (6) above, and is the most important latency metric
63    /// reflecting reconfiguration delay for each validator.
64    pub epoch_validator_halt_duration_ms: IntGauge,
65
66    /// The interval from when the epoch begins (i.e. right after state reconfigure, when the new
67    /// epoch_store is created), to when the first checkpoint of the epoch is ready for creation locally.
68    /// This is (7) above, and is a good proxy to how long it takes for the validator
69    /// to become useful in the network after reconfiguration.
70    // TODO: This needs to be reported properly.
71    pub epoch_first_checkpoint_created_time_since_epoch_begin_ms: IntGauge,
72
73    /// Whether we are running in safe mode where reward distribution and tokenomics are disabled.
74    pub is_safe_mode: IntGauge,
75
76    /// When building the last checkpoint of the epoch, we execute advance epoch transaction once
77    /// without committing results to the store. It's useful to know whether this execution leads
78    /// to safe_mode, since in theory the result could be different from checkpoint executor.
79    pub checkpoint_builder_advance_epoch_is_safe_mode: IntGauge,
80
81    /// Buffer stake current in effect for this epoch
82    pub effective_buffer_stake: IntGauge,
83
84    /// Set to 1 if the random beacon DKG protocol failed for the most recent epoch.
85    pub epoch_random_beacon_dkg_failed: IntGauge,
86
87    /// The number of shares held by this node after the random beacon DKG protocol completed.
88    pub epoch_random_beacon_dkg_num_shares: IntGauge,
89
90    /// The amount of time taken from epoch start to completion of random beacon DKG protocol,
91    /// for the most recent epoch.
92    pub epoch_random_beacon_dkg_epoch_start_completion_time_ms: IntGauge,
93
94    /// The amount of time taken to complete random beacon DKG protocol from the time it was
95    /// started (which may be a bit after the epcoh began), for the most recent epoch.
96    pub epoch_random_beacon_dkg_completion_time_ms: IntGauge,
97
98    /// The amount of time taken to start first phase of the random beacon DKG protocol,
99    /// at which point the node has submitted a DKG Message, for the most recent epoch.
100    pub epoch_random_beacon_dkg_message_time_ms: IntGauge,
101
102    /// The amount of time taken to complete first phase of the random beacon DKG protocol,
103    /// at which point the node has submitted a DKG Confirmation, for the most recent epoch.
104    pub epoch_random_beacon_dkg_confirmation_time_ms: IntGauge,
105
106    /// The number of execution time observations messages shared by this node.
107    pub epoch_execution_time_observations_shared: IntCounter,
108
109    /// The number of execution time observations messages intended to be shared by this node, annotated with reason.
110    pub epoch_execution_time_observations_sharing_reason: IntCounterVec,
111
112    /// The number of execution time measurements dropped due to backpressure from the observer.
113    pub epoch_execution_time_measurements_dropped: IntCounter,
114
115    /// The number of execution time consensus messages dropped.
116    pub epoch_execution_time_observations_dropped: IntCounterVec,
117
118    /// The number of cached indebted objects in the execution time observer.
119    pub epoch_execution_time_observer_indebted_objects: IntGauge,
120
121    /// The number of objects tracked by the object utilization cache.
122    pub epoch_execution_time_observer_utilization_cache_size: IntGauge,
123
124    /// The number of objects determined by the execution time observer to be overutilized.
125    /// Note: this may overcount if objects are evicted from the cache before being computed
126    /// as not-overutilized.
127    pub epoch_execution_time_observer_overutilized_objects: IntGauge,
128
129    /// Per-object utilization for objects that were overutilized at least once at some
130    /// point in their lifetime.
131    /// Note: This metric is disabled by default as it may have very large cardinality.
132    pub epoch_execution_time_observer_object_utilization: CounterVec,
133
134    /// The number of execution time observations loaded at start of epoch.
135    pub epoch_execution_time_observations_loaded: IntGauge,
136
137    /// The number of consensus output items in the quarantine.
138    pub consensus_quarantine_queue_size: IntGauge,
139
140    /// The number of shared object assignments in the quarantine.
141    pub shared_object_assignments_size: IntGauge,
142}
143
144impl EpochMetrics {
145    pub fn new(registry: &Registry) -> Arc<Self> {
146        let this = Self {
147            current_epoch: register_int_gauge_with_registry!(
148                "current_epoch",
149                "Current epoch ID",
150                registry
151            )
152            .unwrap(),
153            current_voting_right: register_int_gauge_with_registry!(
154                "current_voting_right",
155                "Current voting right of the validator",
156                registry
157            )
158            .unwrap(),
159            epoch_checkpoint_count: register_int_gauge_with_registry!(
160                "epoch_checkpoint_count",
161                "Number of checkpoints in the epoch",
162                registry
163            ).unwrap(),
164            epoch_total_duration: register_int_gauge_with_registry!(
165                "epoch_total_duration",
166                "Total duration of the epoch",
167                registry
168            ).unwrap(),
169            epoch_transaction_count: register_int_gauge_with_registry!(
170                "epoch_transaction_count",
171                "Number of transactions in the epoch",
172                registry
173            ).unwrap(),
174            epoch_total_gas_reward: register_int_gauge_with_registry!(
175                "epoch_total_gas_reward",
176                "Total amount of gas rewards (i.e. computation gas cost) in the epoch",
177                registry
178            ).unwrap(),
179            epoch_pending_certs_processed_time_since_epoch_close_ms: register_int_gauge_with_registry!(
180                "epoch_pending_certs_processed_time_since_epoch_close_ms",
181                "Time interval from when epoch was closed to when all pending certificates are processed",
182                registry
183            ).unwrap(),
184            epoch_end_of_publish_quorum_time_since_epoch_close_ms: register_int_gauge_with_registry!(
185                "epoch_end_of_publish_quorum_time_since_epoch_close_ms",
186                "Time interval from when epoch was closed to when 2f+1 EndOfPublish messages are received",
187                registry
188            ).unwrap(),
189            epoch_last_checkpoint_created_time_since_epoch_close_ms: register_int_gauge_with_registry!(
190                "epoch_last_checkpoint_created_time_since_epoch_close_ms",
191                "Time interval from when epoch was closed to when the last checkpoint of the epoch is created",
192                registry
193            ).unwrap(),
194            epoch_reconfig_start_time_since_epoch_close_ms: register_int_gauge_with_registry!(
195                "epoch_reconfig_start_time_since_epoch_close_ms",
196                "Total time duration from when epoch was closed to when we begin to reconfigure the validator",
197                registry
198            ).unwrap(),
199            epoch_validator_halt_duration_ms: register_int_gauge_with_registry!(
200                "epoch_validator_halt_duration_ms",
201                "Total time duration when the validator was halted (i.e. epoch closed)",
202                registry
203            ).unwrap(),
204            epoch_first_checkpoint_created_time_since_epoch_begin_ms: register_int_gauge_with_registry!(
205                "epoch_first_checkpoint_created_time_since_epoch_begin_ms",
206                "Time interval from when the epoch opens at new epoch to the first checkpoint is created locally",
207                registry
208            ).unwrap(),
209            is_safe_mode: register_int_gauge_with_registry!(
210                "is_safe_mode",
211                "Whether we are running in safe mode",
212                registry,
213            ).unwrap(),
214            checkpoint_builder_advance_epoch_is_safe_mode: register_int_gauge_with_registry!(
215                "checkpoint_builder_advance_epoch_is_safe_mode",
216                "Whether the advance epoch execution leads to safe mode while building the last checkpoint",
217                registry,
218            ).unwrap(),
219            effective_buffer_stake: register_int_gauge_with_registry!(
220                "effective_buffer_stake",
221                "Buffer stake current in effect for this epoch",
222                registry,
223            ).unwrap(),
224            epoch_random_beacon_dkg_failed: register_int_gauge_with_registry!(
225                "epoch_random_beacon_dkg_failed",
226                "Set to 1 if the random beacon DKG protocol failed for the most recent epoch.",
227                registry
228            )
229            .unwrap(),
230            epoch_random_beacon_dkg_num_shares: register_int_gauge_with_registry!(
231                "epoch_random_beacon_dkg_num_shares",
232                "The number of shares held by this node after the random beacon DKG protocol completed",
233                registry
234            )
235            .unwrap(),
236            epoch_random_beacon_dkg_epoch_start_completion_time_ms: register_int_gauge_with_registry!(
237                "epoch_random_beacon_dkg_epoch_start_completion_time_ms",
238                "The amount of time taken from epoch start to completion of random beacon DKG protocol, for the most recent epoch",
239                registry
240            )
241            .unwrap(),
242            epoch_random_beacon_dkg_completion_time_ms: register_int_gauge_with_registry!(
243                "epoch_random_beacon_dkg_completion_time_ms",
244                "The amount of time taken to complete random beacon DKG protocol from the time it was started (which may be a bit after the epoch began), for the most recent epoch",
245                registry
246            )
247            .unwrap(),
248            epoch_random_beacon_dkg_message_time_ms: register_int_gauge_with_registry!(
249                "epoch_random_beacon_dkg_message_time_ms",
250                "The amount of time taken to start first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Message, for the most recent epoch",
251                registry
252            )
253            .unwrap(),
254            epoch_random_beacon_dkg_confirmation_time_ms: register_int_gauge_with_registry!(
255                "epoch_random_beacon_dkg_confirmation_time_ms",
256                "The amount of time taken to complete first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Confirmation, for the most recent epoch",
257                registry
258            )
259            .unwrap(),
260            epoch_execution_time_observations_shared: register_int_counter_with_registry!(
261                "epoch_execution_time_observations_shared",
262                "The number of execution time observations messages shared by this node",
263                registry
264            )
265            .unwrap(),
266            epoch_execution_time_observations_sharing_reason: register_int_counter_vec_with_registry!(
267                "epoch_execution_time_observations_sharing_reason",
268                "The number of execution time observations messages intended to be shared by this node, annotated with reason",
269                &["reason"],
270                registry
271            )
272            .unwrap(),
273            epoch_execution_time_measurements_dropped: register_int_counter_with_registry!(
274                "epoch_execution_time_measurements_dropped",
275                "The number of execution time measurements dropped due to backpressure from the observer",
276                registry
277            )
278            .unwrap(),
279            epoch_execution_time_observations_dropped: register_int_counter_vec_with_registry!(
280                "epoch_execution_time_observations_dropped",
281                "The number of execution time observations dropped",
282                &["reason"],
283                registry
284            )
285            .unwrap(),
286            epoch_execution_time_observer_indebted_objects: register_int_gauge_with_registry!(
287                "epoch_execution_time_observer_indebted_objects",
288                "The number of cached indebted objects in the execution time observer",
289                registry
290            )
291            .unwrap(),
292            epoch_execution_time_observer_utilization_cache_size: register_int_gauge_with_registry!(
293                "epoch_execution_time_observer_utilization_cache_size",
294                "The number of objects tracked by the object utilization cache",
295                registry
296            )
297            .unwrap(),
298            epoch_execution_time_observer_overutilized_objects: register_int_gauge_with_registry!(
299                "epoch_execution_time_observer_overutilized_objects",
300                "The number of objects determined by the execution time observer to be overutilized. Note: this may overcount if objects are evicted from the cache before being computed as not-overutilized.",
301                registry
302            )
303            .unwrap(),
304            epoch_execution_time_observer_object_utilization: register_counter_vec_with_registry!(
305                "epoch_execution_time_observer_object_utilization",
306                "Per-object utilization for objects that were overutilized at least once at some point in their lifetime",
307                &["object_id"],
308                registry
309            )
310            .unwrap(),
311            epoch_execution_time_observations_loaded: register_int_gauge_with_registry!(
312                "epoch_execution_time_observations_loaded",
313                "The number of execution time observations loaded at start of epoch",
314                registry
315            )
316            .unwrap(),
317            consensus_quarantine_queue_size: register_int_gauge_with_registry!(
318                "consensus_quarantine_queue_size",
319                "The number of consensus output items in the quarantine",
320                registry
321            )
322            .unwrap(),
323            shared_object_assignments_size: register_int_gauge_with_registry!(
324                "shared_object_assignments_size",
325                "The number of shared object assignments in the quarantine",
326                registry
327            )
328            .unwrap(),
329        };
330        Arc::new(this)
331    }
332}