sui_config/
validator_client_monitor_config.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Configuration for the Validator Client Monitor
5//!
6//! The Validator Client Monitor tracks client-observed performance metrics for validators
7//! in the Sui network. It runs from the perspective of a fullnode and monitors:
8//! - Transaction submission latency
9//! - Effects retrieval latency
10//! - Health check response times
11//! - Success/failure rates
12//!
13//! # Tuning Guide
14//!
15//! ## Monitoring Metrics
16//!
17//! The following Prometheus metrics can help tune the configuration:
18//!
19//! - `validator_client_observed_latency` - Histogram of operation latencies per validator
20//! - `validator_client_operation_success_total` - Counter of successful operations
21//! - `validator_client_operation_failure_total` - Counter of failed operations
22//! - `validator_client_observed_score` - Current score for each validator (0-1)
23//! - `validator_client_consecutive_failures` - Current consecutive failure count
24//! - `validator_client_selections_total` - How often each validator is selected
25//!
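//! ## Configuration Parameters
//!
//! NOTE(review): `ValidatorClientMonitorConfig` in this file only defines
//! `health_check_interval`, `health_check_timeout`, `reliability_weight`,
//! `latency_moving_window_size` and `reliability_moving_window_size`. The failure-handling
//! and score-weight settings described below have no matching field here — confirm
//! whether those sections and the YAML examples are stale.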
27//!
28//! ### Health Check Settings
29//!
30//! - `health_check_interval`: How often to probe validator health
31//!   - Default: 10s
32//!   - Decrease for more responsive failure detection (higher overhead)
33//!   - Increase to reduce network traffic
34//!   - Monitor `validator_client_operation_success_total{operation="health_check"}` to see probe frequency
35//!
36//! - `health_check_timeout`: Maximum time to wait for health check response
37//!   - Default: 2s
38//!   - Should be less than `health_check_interval`
39//!   - Set based on p99 of `validator_client_observed_latency{operation="health_check"}`
40//!
41//! ### Failure Handling
42//!
43//! - `max_consecutive_failures`: Failures before temporary exclusion
44//!   - Default: 5
45//!   - Lower values = faster exclusion of problematic validators
46//!   - Higher values = more tolerance for transient issues
47//!   - Monitor `validator_client_consecutive_failures` to see failure patterns
48//!
49//! - `failure_cooldown`: How long to exclude failed validators
50//!   - Default: 30s
51//!   - Should be several times the `health_check_interval`
52//!   - Too short = thrashing between exclusion/inclusion
53//!   - Too long = reduced validator pool during transient issues
54//!
55//! ### Score Weights
56//!
57//! Scores combine reliability and latency metrics. Adjust weights based on priorities:
58//!
59//! - `reliability`: Weight for success rate (0-1)
60//!   - Default: 0.6
61//!   - Increase if consistency is critical
62//!   - Decrease if latency is more important than occasional failures
63//!
64//! - `latency`: Weight for latency scores
65//!   - Default: 0.4
66//!   - Increase for latency-sensitive applications
67//!   - Individual operation weights can be tuned separately
68//!
69//! # Example Configurations
70//!
71//! ## Low Latency Priority
72//! ```yaml
73//! validator-client-monitor-config:
74//!   health-check-interval: 5s
75//!   health-check-timeout: 1s
76//!   max-consecutive-failures: 3
77//!   failure-cooldown: 20s
78//!   score-weights:
79//!     latency: 0.7
80//!     reliability: 0.3
81//!     effects-latency-weight: 0.6  # Effects queries are critical
82//! ```
83//!
84//! ## High Reliability Priority
85//! ```yaml
86//! validator-client-monitor-config:
87//!   health-check-interval: 15s
88//!   max-consecutive-failures: 10  # Very tolerant
89//!   failure-cooldown: 60s
90//!   score-weights:
91//!     latency: 0.2
92//!     reliability: 0.8
93//! ```
94
95use serde::{Deserialize, Serialize};
96use std::time::Duration;
97
98/// Configuration for validator client monitoring from the client perspective
99#[derive(Debug, Clone, Serialize, Deserialize)]
100#[serde(rename_all = "kebab-case")]
101pub struct ValidatorClientMonitorConfig {
102    /// How often to perform health checks on validators.
103    ///
104    /// Lower values provide faster failure detection but increase network overhead.
105    /// This should be balanced against the `failure_cooldown` period.
106    #[serde(default = "default_health_check_interval")]
107    pub health_check_interval: Duration,
108
109    /// Timeout for health check requests.
110    ///
111    /// Should be less than `health_check_interval` to avoid overlapping checks.
112    /// Set based on network latency characteristics - typically 2-3x p99 latency.
113    #[serde(default = "default_health_check_timeout")]
114    pub health_check_timeout: Duration,
115
116    /// Weight for reliability.
117    ///
118    /// Controls importance of reliability when adjusting the validator's latency for transaction submission
119    /// selection. The higher the weight, the more penalty is given to unreliable validators.
120    /// Default to 2.0. Value should be positive.
121    #[serde(default = "default_reliability_weight")]
122    pub reliability_weight: f64,
123
124    /// Size of the moving window for latency measurements
125    #[serde(default = "default_latency_moving_window_size")]
126    pub latency_moving_window_size: usize,
127
128    /// Size of the moving window for reliability measurements
129    #[serde(default = "default_reliability_moving_window_size")]
130    pub reliability_moving_window_size: usize,
131}
132
133impl Default for ValidatorClientMonitorConfig {
134    fn default() -> Self {
135        Self {
136            health_check_interval: default_health_check_interval(),
137            health_check_timeout: default_health_check_timeout(),
138            reliability_weight: default_reliability_weight(),
139            latency_moving_window_size: default_latency_moving_window_size(),
140            reliability_moving_window_size: default_reliability_moving_window_size(),
141        }
142    }
143}
144
// Default value functions, referenced by the `#[serde(default = "...")]` attributes
// and by the `Default` impl above.
/// Returns the default `health_check_interval`: 10 seconds.
fn default_health_check_interval() -> Duration {
    Duration::from_secs(10)
}
149
/// Returns the default `health_check_timeout`: 2 seconds.
fn default_health_check_timeout() -> Duration {
    Duration::from_secs(2)
}
153
/// Returns the default `reliability_weight`: 2.0.
fn default_reliability_weight() -> f64 {
    2.0
}
157
/// Returns the default `latency_moving_window_size`: 40.
fn default_latency_moving_window_size() -> usize {
    40
}
161
/// Returns the default `reliability_moving_window_size`: 20.
fn default_reliability_moving_window_size() -> usize {
    20
}