// sui_core/epoch/epoch_metrics.rs

// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use prometheus::{
    register_counter_vec_with_registry, register_int_counter_vec_with_registry,
    register_int_counter_with_registry, register_int_gauge_with_registry, CounterVec, IntCounter,
    IntCounterVec, IntGauge, Registry,
};
use std::sync::Arc;

/// Prometheus metrics describing per-epoch state and the progress of reconfiguration.
///
/// Every field is created and registered against a single `Registry` by
/// `EpochMetrics::new`. Fields whose name ends in `_ms` report milliseconds.
pub struct EpochMetrics {
    /// The current epoch ID. This is updated only when the AuthorityState finishes reconfiguration.
    pub current_epoch: IntGauge,

    /// Current voting right of the validator in the protocol. Updated at the start of epochs.
    pub current_voting_right: IntGauge,

    /// Total duration of the epoch. This is measured from when the current epoch store is opened,
    /// until the current epoch store is replaced with the next epoch store.
    pub epoch_total_duration: IntGauge,

    /// Number of checkpoints in the epoch.
    pub epoch_checkpoint_count: IntGauge,

    /// Number of transactions in the epoch.
    pub epoch_transaction_count: IntGauge,

    /// Total amount of gas rewards (i.e. computation gas cost) in the epoch.
    pub epoch_total_gas_reward: IntGauge,

    // An active validator reconfigures through the following steps:
    // 1. Halt the validator (a.k.a. close the epoch) and stop accepting user transaction certs.
    // 2. Finish processing all pending certificates and then send the EndOfPublish message.
    // 3. Stop accepting messages from consensus after seeing 2f+1 EndOfPublish messages.
    // 4. Create the last checkpoint of the epoch by augmenting it with the AdvanceEpoch
    //    transaction.
    // 5. CheckpointExecutor finishes executing the last checkpoint, and triggers reconfiguration.
    // 6. During reconfiguration, we tear down consensus, reconfigure state (at which point we open
    //    up user certs), and start consensus again.
    // 7. After reconfiguration, once consensus eventually starts successfully, at some point the
    //    first checkpoint of the new epoch will be created.
    // We introduce various metrics to cover the latency of the above steps.
    /// The duration from when the epoch is closed (i.e. validator halted) to when all pending
    /// certificates are processed (i.e. ready to send EndOfPublish message).
    /// This is the duration of (1) through (2) above.
    pub epoch_pending_certs_processed_time_since_epoch_close_ms: IntGauge,

    /// The interval from when the epoch is closed to when we receive 2f+1 EndOfPublish messages.
    /// This is the duration of (1) through (3) above.
    pub epoch_end_of_publish_quorum_time_since_epoch_close_ms: IntGauge,

    /// The interval from when the epoch is closed to when we created the last checkpoint of the
    /// epoch.
    /// This is the duration of (1) through (4) above.
    pub epoch_last_checkpoint_created_time_since_epoch_close_ms: IntGauge,

    /// The interval from when the epoch is closed to when we finished executing the last transaction
    /// of the checkpoint (and hence triggering reconfiguration process).
    /// This is the duration of (1) through (5) above.
    pub epoch_reconfig_start_time_since_epoch_close_ms: IntGauge,

    /// The total duration when this validator is halted, and hence does not accept certs from users.
    /// This is the duration of (1) through (6) above, and is the most important latency metric
    /// reflecting reconfiguration delay for each validator.
    pub epoch_validator_halt_duration_ms: IntGauge,

    /// The interval from when the epoch begins (i.e. right after state reconfigure, when the new
    /// epoch_store is created), to when the first checkpoint of the epoch is ready for creation locally.
    /// This is (7) above, and is a good proxy to how long it takes for the validator
    /// to become useful in the network after reconfiguration.
    // TODO: This needs to be reported properly.
    pub epoch_first_checkpoint_created_time_since_epoch_begin_ms: IntGauge,

    /// Whether we are running in safe mode where reward distribution and tokenomics are disabled.
    pub is_safe_mode: IntGauge,

    /// When building the last checkpoint of the epoch, we execute advance epoch transaction once
    /// without committing results to the store. It's useful to know whether this execution leads
    /// to safe_mode, since in theory the result could be different from checkpoint executor.
    pub checkpoint_builder_advance_epoch_is_safe_mode: IntGauge,

    /// Buffer stake currently in effect for this epoch.
    pub effective_buffer_stake: IntGauge,

    /// Set to 1 if the random beacon DKG protocol failed for the most recent epoch.
    pub epoch_random_beacon_dkg_failed: IntGauge,

    /// The number of shares held by this node after the random beacon DKG protocol completed.
    pub epoch_random_beacon_dkg_num_shares: IntGauge,

    /// The amount of time taken from epoch start to completion of random beacon DKG protocol,
    /// for the most recent epoch.
    pub epoch_random_beacon_dkg_epoch_start_completion_time_ms: IntGauge,

    /// The amount of time taken to complete random beacon DKG protocol from the time it was
    /// started (which may be a bit after the epoch began), for the most recent epoch.
    pub epoch_random_beacon_dkg_completion_time_ms: IntGauge,

    /// The amount of time taken to start first phase of the random beacon DKG protocol,
    /// at which point the node has submitted a DKG Message, for the most recent epoch.
    pub epoch_random_beacon_dkg_message_time_ms: IntGauge,

    /// The amount of time taken to complete first phase of the random beacon DKG protocol,
    /// at which point the node has submitted a DKG Confirmation, for the most recent epoch.
    pub epoch_random_beacon_dkg_confirmation_time_ms: IntGauge,

    /// The number of execution time observations messages shared by this node.
    pub epoch_execution_time_observations_shared: IntCounter,

    /// The number of execution time measurements dropped due to backpressure from the observer.
    pub epoch_execution_time_measurements_dropped: IntCounter,

    /// The number of execution time consensus messages dropped.
    /// Registered with a single `reason` label.
    pub epoch_execution_time_observations_dropped: IntCounterVec,

    /// The number of cached indebted objects in the execution time observer.
    pub epoch_execution_time_observer_indebted_objects: IntGauge,

    /// The number of objects tracked by the object utilization cache.
    pub epoch_execution_time_observer_utilization_cache_size: IntGauge,

    /// The number of objects determined by the execution time observer to be overutilized.
    /// Note: this may overcount if objects are evicted from the cache before being computed
    /// as not-overutilized.
    pub epoch_execution_time_observer_overutilized_objects: IntGauge,

    /// Per-object utilization for objects that were overutilized at least once at some
    /// point in their lifetime. Registered with a single `object_id` label.
    /// Note: This metric is disabled by default as it may have very large cardinality.
    pub epoch_execution_time_observer_object_utilization: CounterVec,

    /// The number of consensus output items in the quarantine.
    pub consensus_quarantine_queue_size: IntGauge,

    /// The number of shared object assignments in the quarantine.
    pub shared_object_assignments_size: IntGauge,
}

impl EpochMetrics {
    pub fn new(registry: &Registry) -> Arc<Self> {
        let this = Self {
            current_epoch: register_int_gauge_with_registry!(
                "current_epoch",
                "Current epoch ID",
                registry
            )
            .unwrap(),
            current_voting_right: register_int_gauge_with_registry!(
                "current_voting_right",
                "Current voting right of the validator",
                registry
            )
            .unwrap(),
            epoch_checkpoint_count: register_int_gauge_with_registry!(
                "epoch_checkpoint_count",
                "Number of checkpoints in the epoch",
                registry
            ).unwrap(),
            epoch_total_duration: register_int_gauge_with_registry!(
                "epoch_total_duration",
                "Total duration of the epoch",
                registry
            ).unwrap(),
            epoch_transaction_count: register_int_gauge_with_registry!(
                "epoch_transaction_count",
                "Number of transactions in the epoch",
                registry
            ).unwrap(),
            epoch_total_gas_reward: register_int_gauge_with_registry!(
                "epoch_total_gas_reward",
                "Total amount of gas rewards (i.e. computation gas cost) in the epoch",
                registry
            ).unwrap(),
            epoch_pending_certs_processed_time_since_epoch_close_ms: register_int_gauge_with_registry!(
                "epoch_pending_certs_processed_time_since_epoch_close_ms",
                "Time interval from when epoch was closed to when all pending certificates are processed",
                registry
            ).unwrap(),
            epoch_end_of_publish_quorum_time_since_epoch_close_ms: register_int_gauge_with_registry!(
                "epoch_end_of_publish_quorum_time_since_epoch_close_ms",
                "Time interval from when epoch was closed to when 2f+1 EndOfPublish messages are received",
                registry
            ).unwrap(),
            epoch_last_checkpoint_created_time_since_epoch_close_ms: register_int_gauge_with_registry!(
                "epoch_last_checkpoint_created_time_since_epoch_close_ms",
                "Time interval from when epoch was closed to when the last checkpoint of the epoch is created",
                registry
            ).unwrap(),
            epoch_reconfig_start_time_since_epoch_close_ms: register_int_gauge_with_registry!(
                "epoch_reconfig_start_time_since_epoch_close_ms",
                "Total time duration from when epoch was closed to when we begin to reconfigure the validator",
                registry
            ).unwrap(),
            epoch_validator_halt_duration_ms: register_int_gauge_with_registry!(
                "epoch_validator_halt_duration_ms",
                "Total time duration when the validator was halted (i.e. epoch closed)",
                registry
            ).unwrap(),
            epoch_first_checkpoint_created_time_since_epoch_begin_ms: register_int_gauge_with_registry!(
                "epoch_first_checkpoint_created_time_since_epoch_begin_ms",
                "Time interval from when the epoch opens at new epoch to the first checkpoint is created locally",
                registry
            ).unwrap(),
            is_safe_mode: register_int_gauge_with_registry!(
                "is_safe_mode",
                "Whether we are running in safe mode",
                registry,
            ).unwrap(),
            checkpoint_builder_advance_epoch_is_safe_mode: register_int_gauge_with_registry!(
                "checkpoint_builder_advance_epoch_is_safe_mode",
                "Whether the advance epoch execution leads to safe mode while building the last checkpoint",
                registry,
            ).unwrap(),
            effective_buffer_stake: register_int_gauge_with_registry!(
                "effective_buffer_stake",
                "Buffer stake current in effect for this epoch",
                registry,
            ).unwrap(),
            epoch_random_beacon_dkg_failed: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_failed",
                "Set to 1 if the random beacon DKG protocol failed for the most recent epoch.",
                registry
            )
            .unwrap(),
            epoch_random_beacon_dkg_num_shares: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_num_shares",
                "The number of shares held by this node after the random beacon DKG protocol completed",
                registry
            )
            .unwrap(),
            epoch_random_beacon_dkg_epoch_start_completion_time_ms: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_epoch_start_completion_time_ms",
                "The amount of time taken from epoch start to completion of random beacon DKG protocol, for the most recent epoch",
                registry
            )
            .unwrap(),
            epoch_random_beacon_dkg_completion_time_ms: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_completion_time_ms",
                "The amount of time taken to complete random beacon DKG protocol from the time it was started (which may be a bit after the epoch began), for the most recent epoch",
                registry
            )
            .unwrap(),
            epoch_random_beacon_dkg_message_time_ms: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_message_time_ms",
                "The amount of time taken to start first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Message, for the most recent epoch",
                registry
            )
            .unwrap(),
            epoch_random_beacon_dkg_confirmation_time_ms: register_int_gauge_with_registry!(
                "epoch_random_beacon_dkg_confirmation_time_ms",
                "The amount of time taken to complete first phase of the random beacon DKG protocol, at which point the node has submitted a DKG Confirmation, for the most recent epoch",
                registry
            )
            .unwrap(),
            epoch_execution_time_observations_shared: register_int_counter_with_registry!(
                "epoch_execution_time_observations_shared",
                "The number of execution time observations messages shared by this node",
                registry
            )
            .unwrap(),
            epoch_execution_time_measurements_dropped: register_int_counter_with_registry!(
                "epoch_execution_time_measurements_dropped",
                "The number of execution time measurements dropped due to backpressure from the observer",
                registry
            )
            .unwrap(),
            epoch_execution_time_observations_dropped: register_int_counter_vec_with_registry!(
                "epoch_execution_time_observations_dropped",
                "The number of execution time observations dropped",
                &["reason"],
                registry
            )
            .unwrap(),
            epoch_execution_time_observer_indebted_objects: register_int_gauge_with_registry!(
                "epoch_execution_time_observer_indebted_objects",
                "The number of cached indebted objects in the execution time observer",
                registry
            )
            .unwrap(),
            epoch_execution_time_observer_utilization_cache_size: register_int_gauge_with_registry!(
                "epoch_execution_time_observer_utilization_cache_size",
                "The number of objects tracked by the object utilization cache",
                registry
            )
            .unwrap(),
            epoch_execution_time_observer_overutilized_objects: register_int_gauge_with_registry!(
                "epoch_execution_time_observer_overutilized_objects",
                "The number of objects determined by the execution time observer to be overutilized. Note: this may overcount if objects are evicted from the cache before being computed as not-overutilized.",
                registry
            )
            .unwrap(),
            epoch_execution_time_observer_object_utilization: register_counter_vec_with_registry!(
                "epoch_execution_time_observer_object_utilization",
                "Per-object utilization for objects that were overutilized at least once at some point in their lifetime",
                &["object_id"],
                registry
            )
            .unwrap(),
            consensus_quarantine_queue_size: register_int_gauge_with_registry!(
                "consensus_quarantine_queue_size",
                "The number of consensus output items in the quarantine",
                registry
            )
            .unwrap(),
            shared_object_assignments_size: register_int_gauge_with_registry!(
                "shared_object_assignments_size",
                "The number of shared object assignments in the quarantine",
                registry
            )
            .unwrap(),
        };
        Arc::new(this)
    }
}