sui_config/validator_client_monitor_config.rs
1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Configuration for the Validator Client Monitor
5//!
6//! The Validator Client Monitor tracks client-observed performance metrics for validators
7//! in the Sui network. It runs from the perspective of a fullnode and monitors:
8//! - Transaction submission latency
9//! - Effects retrieval latency
10//! - Health check response times
11//! - Success/failure rates
12//!
13//! # Tuning Guide
14//!
15//! ## Monitoring Metrics
16//!
17//! The following Prometheus metrics can help tune the configuration:
18//!
19//! - `validator_client_observed_latency` - Histogram of operation latencies per validator
20//! - `validator_client_operation_success_total` - Counter of successful operations
21//! - `validator_client_operation_failure_total` - Counter of failed operations
22//! - `validator_client_observed_score` - Current score for each validator (0-1)
23//! - `validator_client_consecutive_failures` - Current consecutive failure count
24//! - `validator_client_selections_total` - How often each validator is selected
25//!
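//! ## Configuration Parameters
//!
//! NOTE(review): of the parameters described below, only `health_check_interval` and
//! `health_check_timeout` match fields of `ValidatorClientMonitorConfig` as defined in
//! this file; the failure-handling and score-weight settings do not appear on the
//! struct. Confirm whether those sections and the example configurations are current.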
27//!
28//! ### Health Check Settings
29//!
30//! - `health_check_interval`: How often to probe validator health
31//! - Default: 10s
32//! - Decrease for more responsive failure detection (higher overhead)
33//! - Increase to reduce network traffic
34//! - Monitor `validator_client_operation_success_total{operation="health_check"}` to see probe frequency
35//!
36//! - `health_check_timeout`: Maximum time to wait for health check response
37//! - Default: 2s
38//! - Should be less than `health_check_interval`
39//! - Set based on p99 of `validator_client_observed_latency{operation="health_check"}`
40//!
41//! ### Failure Handling
42//!
43//! - `max_consecutive_failures`: Failures before temporary exclusion
44//! - Default: 5
45//! - Lower values = faster exclusion of problematic validators
46//! - Higher values = more tolerance for transient issues
47//! - Monitor `validator_client_consecutive_failures` to see failure patterns
48//!
49//! - `failure_cooldown`: How long to exclude failed validators
50//! - Default: 30s
51//! - Should be several times the `health_check_interval`
52//! - Too short = thrashing between exclusion/inclusion
53//! - Too long = reduced validator pool during transient issues
54//!
55//! ### Score Weights
56//!
57//! Scores combine reliability and latency metrics. Adjust weights based on priorities:
58//!
59//! - `reliability`: Weight for success rate (0-1)
60//! - Default: 0.6
61//! - Increase if consistency is critical
62//! - Decrease if latency is more important than occasional failures
63//!
64//! - `latency`: Weight for latency scores
65//! - Default: 0.4
66//! - Increase for latency-sensitive applications
67//! - Individual operation weights can be tuned separately
68//!
69//! # Example Configurations
70//!
71//! ## Low Latency Priority
//! ```yaml
//! validator-client-monitor-config:
//!   health-check-interval: 5s
//!   health-check-timeout: 1s
//!   max-consecutive-failures: 3
//!   failure-cooldown: 20s
//!   score-weights:
//!     latency: 0.7
//!     reliability: 0.3
//!     effects-latency-weight: 0.6  # Effects queries are critical
//! ```
83//!
84//! ## High Reliability Priority
//! ```yaml
//! validator-client-monitor-config:
//!   health-check-interval: 15s
//!   max-consecutive-failures: 10  # Very tolerant
//!   failure-cooldown: 60s
//!   score-weights:
//!     latency: 0.2
//!     reliability: 0.8
//! ```
94
95use serde::{Deserialize, Serialize};
96use std::time::Duration;
97
98/// Configuration for validator client monitoring from the client perspective
99#[derive(Debug, Clone, Serialize, Deserialize)]
100#[serde(rename_all = "kebab-case")]
101pub struct ValidatorClientMonitorConfig {
102 /// How often to perform health checks on validators.
103 ///
104 /// Lower values provide faster failure detection but increase network overhead.
105 /// This should be balanced against the `failure_cooldown` period.
106 #[serde(default = "default_health_check_interval")]
107 pub health_check_interval: Duration,
108
109 /// Timeout for health check requests.
110 ///
111 /// Should be less than `health_check_interval` to avoid overlapping checks.
112 /// Set based on network latency characteristics - typically 2-3x p99 latency.
113 #[serde(default = "default_health_check_timeout")]
114 pub health_check_timeout: Duration,
115
116 /// Weight for reliability.
117 ///
118 /// Controls importance of reliability when adjusting the validator's latency for transaction submission
119 /// selection. The higher the weight, the more penalty is given to unreliable validators.
120 /// Default to 2.0. Value should be positive.
121 #[serde(default = "default_reliability_weight")]
122 pub reliability_weight: f64,
123
124 /// Size of the moving window for latency measurements
125 #[serde(default = "default_latency_moving_window_size")]
126 pub latency_moving_window_size: usize,
127
128 /// Size of the moving window for reliability measurements
129 #[serde(default = "default_reliability_moving_window_size")]
130 pub reliability_moving_window_size: usize,
131}
132
133impl Default for ValidatorClientMonitorConfig {
134 fn default() -> Self {
135 Self {
136 health_check_interval: default_health_check_interval(),
137 health_check_timeout: default_health_check_timeout(),
138 reliability_weight: default_reliability_weight(),
139 latency_moving_window_size: default_latency_moving_window_size(),
140 reliability_moving_window_size: default_reliability_moving_window_size(),
141 }
142 }
143}
144
145// Default value functions
/// Fallback for `health_check_interval`: ten seconds between health checks.
fn default_health_check_interval() -> Duration {
    const INTERVAL_SECS: u64 = 10;
    Duration::from_secs(INTERVAL_SECS)
}
149
/// Fallback for `health_check_timeout`: two seconds per health check request.
fn default_health_check_timeout() -> Duration {
    const TIMEOUT_SECS: u64 = 2;
    Duration::from_secs(TIMEOUT_SECS)
}
153
/// Fallback for `reliability_weight`.
fn default_reliability_weight() -> f64 {
    const WEIGHT: f64 = 2.0;
    WEIGHT
}
157
/// Fallback for `latency_moving_window_size`.
fn default_latency_moving_window_size() -> usize {
    const WINDOW: usize = 40;
    WINDOW
}
161
/// Fallback for `reliability_moving_window_size`.
fn default_reliability_moving_window_size() -> usize {
    const WINDOW: usize = 20;
    WINDOW
}