mysten_metrics/
thread_stall_monitor.rs1use std::sync::Arc;
5use std::sync::Mutex;
6use std::sync::Once;
7use std::time::Duration;
8use std::time::Instant;
9
10use tracing::{info, warn};
11
12use crate::{get_metrics, spawn_logged_monitored_task};
13
14static THREAD_STALL_MONITOR: Once = Once::new();
15
16const MONITOR_INTERVAL: Duration = Duration::from_millis(50);
17const ALERT_THRESHOLD: Duration = Duration::from_millis(500);
18
19#[inline(never)]
42extern "C" fn thread_monitor_report_stall(duration_ms: u64) {
43 warn!("Thread stalled for {}ms", duration_ms);
44}
45
46#[inline(never)]
47extern "C" fn thread_monitor_report_stall_cleared(duration_ms: u64) {
48 warn!("Thread stall cleared after {}ms", duration_ms);
49}
50
51pub fn start_thread_stall_monitor() {
55 let mut called = true;
56 THREAD_STALL_MONITOR.call_once(|| {
57 called = false;
58 });
59 if called {
60 return;
61 }
62 if tokio::runtime::Handle::try_current().is_err() {
63 info!("Not running in a tokio runtime, not starting thread stall monitor.");
64 return;
65 }
66
67 let last_update: Arc<Mutex<Instant>> = Arc::new(Mutex::new(Instant::now()));
68
69 {
70 let last_update = last_update.clone();
71 std::thread::spawn(move || {
72 info!("Starting thread stall monitor watchdog thread");
73 let mut stall_duration = None;
74
75 loop {
76 std::thread::sleep(MONITOR_INTERVAL);
77 let now = Instant::now();
78 let last_update = *last_update.lock().unwrap();
79 let time_since_last_update = now - last_update;
80 if time_since_last_update > ALERT_THRESHOLD {
81 if stall_duration.is_none() {
82 thread_monitor_report_stall(time_since_last_update.as_millis() as u64);
83 }
84 stall_duration = Some(time_since_last_update);
85 } else if let Some(dur) = stall_duration {
86 stall_duration = None;
87 thread_monitor_report_stall_cleared(dur.as_millis() as u64);
88 if let Some(metrics) = get_metrics() {
89 metrics.thread_stall_duration_sec.observe(dur.as_secs_f64());
90 }
91 }
92 }
93 });
94 }
95
96 spawn_logged_monitored_task!(
97 async move {
98 info!("Starting thread stall monitor update task");
99 loop {
100 tokio::time::sleep(MONITOR_INTERVAL).await;
101 *last_update.lock().unwrap() = Instant::now();
102 }
103 },
104 "ThreadStallMonitor"
105 );
106}