// sui_config/validator_client_monitor_config.rs
// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

//! Configuration for the Validator Client Monitor
//!
//! The Validator Client Monitor tracks client-observed performance metrics for validators
//! in the Sui network. It runs from the perspective of a fullnode and monitors:
//! - Transaction submission latency
//! - Effects retrieval latency
//! - Health check response times
//! - Success/failure rates
//!
//! # Tuning Guide
//!
//! ## Monitoring Metrics
//!
//! The following Prometheus metrics can help tune the configuration:
//!
//! - `validator_client_observed_latency` - Histogram of operation latencies per validator
//! - `validator_client_operation_success_total` - Counter of successful operations
//! - `validator_client_operation_failure_total` - Counter of failed operations
//! - `validator_client_observed_score` - Current score for each validator (0-1)
//! - `validator_client_consecutive_failures` - Current consecutive failure count
//! - `validator_client_selections_total` - How often each validator is selected
//!
//! ## Configuration Parameters
//!
//! ### Health Check Settings
//!
//! - `health_check_interval`: How often to probe validator health
//!   - Default: 10s
//!   - Decrease for more responsive failure detection (higher overhead)
//!   - Increase to reduce network traffic
//!   - Monitor `validator_client_operation_success_total{operation="health_check"}` to see probe frequency
//!
//! - `health_check_timeout`: Maximum time to wait for health check response
//!   - Default: 2s
//!   - Should be less than `health_check_interval`
//!   - Set based on p99 of `validator_client_observed_latency{operation="health_check"}`
//!
//! ### Failure Handling
//!
//! - `max_consecutive_failures`: Failures before temporary exclusion
//!   - Default: 100
//!   - Lower values = faster exclusion of problematic validators
//!   - Higher values = more tolerance for transient issues
//!   - Monitor `validator_client_consecutive_failures` to see failure patterns
//!
//! - `failure_cooldown`: How long to exclude failed validators
//!   - Default: 30s
//!   - Should be several times the `health_check_interval`
//!   - Too short = thrashing between exclusion/inclusion
//!   - Too long = reduced validator pool during transient issues
//!
//! ### Score Weights
//!
//! Scores combine reliability and latency metrics. Adjust weights based on priorities:
//!
//! - `reliability`: Weight for success rate (0-1)
//!   - Default: 0.6
//!   - Increase if consistency is critical
//!   - Decrease if latency is more important than occasional failures
//!
//! - `latency`: Weight for latency scores
//!   - Default: 0.4
//!   - Increase for latency-sensitive applications
//!   - Individual operation weights can be tuned separately
//!
//! # Example Configurations
//!
//! ## Low Latency Priority
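//! ```yaml
//! validator-client-monitor-config:
//!   # Durations use serde's built-in `Duration` form: a map with `secs` and `nanos`.
//!   health-check-interval:
//!     secs: 5
//!     nanos: 0
//!   health-check-timeout:
//!     secs: 1
//!     nanos: 0
//!   max-consecutive-failures: 3
//!   failure-cooldown:
//!     secs: 20
//!     nanos: 0
//!   score-weights:
//!     latency: 0.7
//!     reliability: 0.3
//!     effects-latency-weight: 0.6  # Effects queries are critical
//! ```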
//!
//! ## High Reliability Priority
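//! ```yaml
//! validator-client-monitor-config:
//!   # Durations use serde's built-in `Duration` form: a map with `secs` and `nanos`.
//!   health-check-interval:
//!     secs: 15
//!     nanos: 0
//!   max-consecutive-failures: 10  # Very tolerant
//!   failure-cooldown:
//!     secs: 60
//!     nanos: 0
//!   score-weights:
//!     latency: 0.2
//!     reliability: 0.8
//! ```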

use serde::{Deserialize, Serialize};
use std::time::Duration;

/// Configuration for validator client monitoring from the client perspective.
///
/// Field names are (de)serialized in kebab-case (for example
/// `health-check-interval`). Every field carries a serde default, so a field
/// omitted from the input falls back to the value produced by its `default_*`
/// function (or to `ScoreWeights::default()` for `score_weights`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct ValidatorClientMonitorConfig {
    /// How often to perform health checks on validators.
    ///
    /// Lower values provide faster failure detection but increase network overhead.
    /// This should be balanced against the `failure_cooldown` period.
    ///
    /// Defaults to 10 seconds when omitted.
    #[serde(default = "default_health_check_interval")]
    pub health_check_interval: Duration,

    /// Timeout for health check requests.
    ///
    /// Should be less than `health_check_interval` to avoid overlapping checks.
    /// Set based on network latency characteristics - typically 2-3x p99 latency.
    ///
    /// Defaults to 2 seconds when omitted.
    #[serde(default = "default_health_check_timeout")]
    pub health_check_timeout: Duration,

    /// Weight configuration for score calculation.
    ///
    /// Determines how different factors contribute to validator selection.
    /// When the whole section is omitted, `ScoreWeights::default()` is used.
    #[serde(default)]
    pub score_weights: ScoreWeights,

    /// Cooldown period after failures before considering a validator again.
    ///
    /// Should be long enough to allow transient issues to resolve,
    /// but short enough to quickly recover capacity when issues are fixed.
    ///
    /// Defaults to 30 seconds when omitted.
    #[serde(default = "default_failure_cooldown")]
    pub failure_cooldown: Duration,

    /// Maximum number of consecutive failures before temporary exclusion.
    ///
    /// Lower values are more aggressive about excluding problematic validators.
    /// Higher values are more tolerant of intermittent issues.
    ///
    /// Defaults to 100 when omitted.
    #[serde(default = "default_max_consecutive_failures")]
    pub max_consecutive_failures: u32,
}

/// Weights for different factors in score calculation.
///
/// Field names are (de)serialized in kebab-case. Each field has its own serde
/// default, so a partially specified section keeps the defaults for the
/// fields it leaves out.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct ScoreWeights {
    /// Weight for latency (lower is better).
    ///
    /// This is the overall weight for all latency scores combined.
    /// Individual operation latencies are weighted separately below.
    ///
    /// Defaults to 0.4 when omitted.
    #[serde(default = "default_latency_weight")]
    pub latency: f64,

    /// Weight for success rate.
    ///
    /// Higher values prioritize reliability over performance.
    ///
    /// Defaults to 0.6 when omitted.
    #[serde(default = "default_reliability_weight")]
    pub reliability: f64,

    /// Weight for submit transaction latency.
    ///
    /// Controls importance of transaction submission speed.
    ///
    /// Defaults to 0.3 when omitted.
    #[serde(default = "default_submit_latency_weight")]
    pub submit_latency_weight: f64,

    /// Weight for effects retrieval latency.
    ///
    /// Controls importance of effects query speed.
    /// Often the most critical operation for application responsiveness.
    ///
    /// Defaults to 0.5 when omitted.
    #[serde(default = "default_effects_latency_weight")]
    pub effects_latency_weight: f64,

    /// Weight for health check latency.
    ///
    /// Usually less critical than actual operations.
    ///
    /// Defaults to 0.2 when omitted.
    #[serde(default = "default_health_check_latency_weight")]
    pub health_check_latency_weight: f64,
}

impl Default for ValidatorClientMonitorConfig {
    /// Builds a configuration from the same per-field default functions that
    /// serde falls back to when a field is absent from the input.
    fn default() -> Self {
        let health_check_interval = default_health_check_interval();
        let health_check_timeout = default_health_check_timeout();
        let failure_cooldown = default_failure_cooldown();
        let max_consecutive_failures = default_max_consecutive_failures();

        Self {
            health_check_interval,
            health_check_timeout,
            score_weights: Default::default(),
            failure_cooldown,
            max_consecutive_failures,
        }
    }
}

impl Default for ScoreWeights {
    /// Builds the weight set from the same per-field default functions that
    /// serde falls back to when a weight is absent from the input.
    fn default() -> Self {
        let latency = default_latency_weight();
        let reliability = default_reliability_weight();
        let submit_latency_weight = default_submit_latency_weight();
        let effects_latency_weight = default_effects_latency_weight();
        let health_check_latency_weight = default_health_check_latency_weight();

        Self {
            latency,
            reliability,
            submit_latency_weight,
            effects_latency_weight,
            health_check_latency_weight,
        }
    }
}

// Default value functions

/// Serde fallback for `health_check_interval`: ten seconds between probes.
fn default_health_check_interval() -> Duration {
    const INTERVAL_SECS: u64 = 10;
    Duration::from_secs(INTERVAL_SECS)
}

/// Serde fallback for `health_check_timeout`: a two-second timeout.
fn default_health_check_timeout() -> Duration {
    const TIMEOUT_SECS: u64 = 2;
    Duration::from_secs(TIMEOUT_SECS)
}

/// Serde fallback for `failure_cooldown`: a thirty-second cooldown.
fn default_failure_cooldown() -> Duration {
    const COOLDOWN_SECS: u64 = 30;
    Duration::from_secs(COOLDOWN_SECS)
}

/// Serde fallback for `max_consecutive_failures`: 100 consecutive failures.
fn default_max_consecutive_failures() -> u32 {
    const FAILURE_LIMIT: u32 = 100;
    FAILURE_LIMIT
}

/// Serde fallback for `ScoreWeights::latency`: an overall latency weight of 0.4.
fn default_latency_weight() -> f64 {
    const WEIGHT: f64 = 0.4;
    WEIGHT
}

/// Serde fallback for `ScoreWeights::reliability`: a success-rate weight of 0.6.
fn default_reliability_weight() -> f64 {
    const WEIGHT: f64 = 0.6;
    WEIGHT
}

/// Serde fallback for `ScoreWeights::submit_latency_weight`: 0.3.
fn default_submit_latency_weight() -> f64 {
    const WEIGHT: f64 = 0.3;
    WEIGHT
}

/// Serde fallback for `ScoreWeights::effects_latency_weight`: 0.5.
fn default_effects_latency_weight() -> f64 {
    const WEIGHT: f64 = 0.5;
    WEIGHT
}

/// Serde fallback for `ScoreWeights::health_check_latency_weight`: 0.2.
fn default_health_check_latency_weight() -> f64 {
    const WEIGHT: f64 = 0.2;
    WEIGHT
}