sui_config/
validator_client_monitor_config.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

//! Configuration for the Validator Client Monitor
//!
//! The Validator Client Monitor tracks client-observed performance metrics for validators
//! in the Sui network. It runs from the perspective of a fullnode and monitors:
//! - Transaction submission latency
//! - Effects retrieval latency
//! - Health check response times
//! - Success/failure rates
//!
//! # Tuning Guide
//!
//! ## Monitoring Metrics
//!
//! The following Prometheus metrics can help tune the configuration:
//!
//! - `validator_client_observed_latency` - Histogram of operation latencies per validator
//! - `validator_client_operation_success_total` - Counter of successful operations
//! - `validator_client_operation_failure_total` - Counter of failed operations
//! - `validator_client_observed_score` - Current score for each validator (0-1)
//! - `validator_client_consecutive_failures` - Current consecutive failure count
//! - `validator_client_selections_total` - How often each validator is selected
//!
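//! ## Configuration Parameters
//!
//! NOTE(review): only `health_check_interval` and `health_check_timeout` below match fields
//! declared on `ValidatorClientMonitorConfig` in this file. `max_consecutive_failures`,
//! `failure_cooldown`, and the `score-weights` settings (including the examples that use
//! them) have no corresponding field here; confirm whether these sections are stale.
//!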
//! ### Health Check Settings
//!
//! - `health_check_interval`: How often to probe validator health
//!   - Default: 10s
//!   - Decrease for more responsive failure detection (higher overhead)
//!   - Increase to reduce network traffic
//!   - Monitor `validator_client_operation_success_total{operation="health_check"}` to see probe frequency
//!
//! - `health_check_timeout`: Maximum time to wait for health check response
//!   - Default: 2s
//!   - Should be less than `health_check_interval`
//!   - Set based on p99 of `validator_client_observed_latency{operation="health_check"}`
//!
//! ### Failure Handling
//!
//! - `max_consecutive_failures`: Failures before temporary exclusion
//!   - Default: 5
//!   - Lower values = faster exclusion of problematic validators
//!   - Higher values = more tolerance for transient issues
//!   - Monitor `validator_client_consecutive_failures` to see failure patterns
//!
//! - `failure_cooldown`: How long to exclude failed validators
//!   - Default: 30s
//!   - Should be several times the `health_check_interval`
//!   - Too short = thrashing between exclusion/inclusion
//!   - Too long = reduced validator pool during transient issues
//!
//! ### Score Weights
//!
//! Scores combine reliability and latency metrics. Adjust weights based on priorities:
//!
//! - `reliability`: Weight for success rate (0-1)
//!   - Default: 0.6
//!   - Increase if consistency is critical
//!   - Decrease if latency is more important than occasional failures
//!
//! - `latency`: Weight for latency scores
//!   - Default: 0.4
//!   - Increase for latency-sensitive applications
//!   - Individual operation weights can be tuned separately
//!
//! # Example Configurations
//!
//! ## Low Latency Priority
//! ```yaml
//! validator-client-monitor-config:
//!   health-check-interval: 5s
//!   health-check-timeout: 1s
//!   max-consecutive-failures: 3
//!   failure-cooldown: 20s
//!   score-weights:
//!     latency: 0.7
//!     reliability: 0.3
//!     effects-latency-weight: 0.6  # Effects queries are critical
//! ```
//!
//! ## High Reliability Priority
//! ```yaml
//! validator-client-monitor-config:
//!   health-check-interval: 15s
//!   max-consecutive-failures: 10  # Very tolerant
//!   failure-cooldown: 60s
//!   score-weights:
//!     latency: 0.2
//!     reliability: 0.8
//! ```

use serde::{Deserialize, Serialize};
use std::time::Duration;

/// Configuration for validator client monitoring from the client perspective.
///
/// Field names are serialized and deserialized in kebab-case (for example
/// `health-check-interval`). Every field has a serde default, so any field omitted from the
/// input falls back to the value returned by its `default_*` function; the [`Default`] impl
/// of this type is built from those same functions.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct ValidatorClientMonitorConfig {
    /// How often to perform health checks on validators.
    ///
    /// Lower values provide faster failure detection but increase network overhead.
    /// Defaults to 10 seconds.
    #[serde(default = "default_health_check_interval")]
    pub health_check_interval: Duration,

    /// Timeout for health check requests.
    ///
    /// Should be less than `health_check_interval` to avoid overlapping checks; nothing in
    /// this type's definition enforces that relationship.
    /// Set based on network latency characteristics - typically 2-3x p99 latency.
    /// Defaults to 2 seconds.
    #[serde(default = "default_health_check_timeout")]
    pub health_check_timeout: Duration,

    /// Weight for reliability.
    ///
    /// Controls the importance of reliability when adjusting a validator's latency for
    /// transaction submission selection. The higher the weight, the larger the penalty
    /// given to unreliable validators.
    /// Defaults to 2.0. The value should be positive.
    #[serde(default = "default_reliability_weight")]
    pub reliability_weight: f64,

    /// Size of the moving window for latency measurements.
    ///
    /// Defaults to 40.
    #[serde(default = "default_latency_moving_window_size")]
    pub latency_moving_window_size: usize,

    /// Size of the moving window for reliability measurements.
    ///
    /// Defaults to 20.
    #[serde(default = "default_reliability_moving_window_size")]
    pub reliability_moving_window_size: usize,
}

impl Default for ValidatorClientMonitorConfig {
    /// Builds a configuration in which every field holds the value returned by its
    /// `default_*` function, i.e. the same values serde substitutes for omitted fields.
    fn default() -> Self {
        let health_check_interval = default_health_check_interval();
        let health_check_timeout = default_health_check_timeout();
        let reliability_weight = default_reliability_weight();
        let latency_moving_window_size = default_latency_moving_window_size();
        let reliability_moving_window_size = default_reliability_moving_window_size();

        Self {
            health_check_interval,
            health_check_timeout,
            reliability_weight,
            latency_moving_window_size,
            reliability_moving_window_size,
        }
    }
}

// Default value functions
/// Default for `health_check_interval`: 10 seconds between health checks.
fn default_health_check_interval() -> Duration {
    const INTERVAL_SECS: u64 = 10;
    Duration::from_secs(INTERVAL_SECS)
}

/// Default for `health_check_timeout`: a health check request may take up to 2 seconds.
fn default_health_check_timeout() -> Duration {
    const TIMEOUT_SECS: u64 = 2;
    Duration::from_secs(TIMEOUT_SECS)
}

/// Default for `reliability_weight`: 2.0.
fn default_reliability_weight() -> f64 {
    const WEIGHT: f64 = 2.0;
    WEIGHT
}

/// Default for `latency_moving_window_size`: a window of 40.
fn default_latency_moving_window_size() -> usize {
    const WINDOW_SIZE: usize = 40;
    WINDOW_SIZE
}

/// Default for `reliability_moving_window_size`: a window of 20.
fn default_reliability_moving_window_size() -> usize {
    const WINDOW_SIZE: usize = 20;
    WINDOW_SIZE
}