mysten_metrics/
thread_stall_monitor.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::sync::Arc;
5use std::sync::Mutex;
6use std::sync::Once;
7use std::time::Duration;
8use std::time::Instant;
9
10use tracing::{info, warn};
11
12use crate::{get_metrics, spawn_logged_monitored_task};
13
14static THREAD_STALL_MONITOR: Once = Once::new();
15
16const MONITOR_INTERVAL: Duration = Duration::from_millis(50);
17const ALERT_THRESHOLD: Duration = Duration::from_millis(500);
18
19// These funcs are extern in order to be easily findable by debuggers
20// To catch a thread stall in the act, do the following:
21//
22// Create a file `gdbcmd` with
23//
24//      set logging file gdb.txt
25//      set logging on
26//      set pagination off
27//      set breakpoint pending on
28//
29//      b thread_monitor_report_stall
30//      commands
31//      thread apply all bt
32//      continue
33//      end
34//
35// Then run gdb with:
36//     gdb -x gdbmcmd -p <pid of sui-node>
37//
38// You will need to type `c` to continue the process after it loads.
39//
40// The debugger will now print out all thread stacks every time a thread stall is detected.
41#[inline(never)]
42extern "C" fn thread_monitor_report_stall(duration_ms: u64) {
43    warn!("Thread stalled for {}ms", duration_ms);
44}
45
46#[inline(never)]
47extern "C" fn thread_monitor_report_stall_cleared(duration_ms: u64) {
48    warn!("Thread stall cleared after {}ms", duration_ms);
49}
50
51/// Monitors temporary stalls in tokio scheduling every MONITOR_INTERVAL.
52/// Calls `thread_monitor_report_stall` if more than ALERT_THRESHOLD has elapsed.
53/// When the stall clears, we observer the duration in a histogram.
54pub fn start_thread_stall_monitor() {
55    let mut called = true;
56    THREAD_STALL_MONITOR.call_once(|| {
57        called = false;
58    });
59    if called {
60        return;
61    }
62    if tokio::runtime::Handle::try_current().is_err() {
63        info!("Not running in a tokio runtime, not starting thread stall monitor.");
64        return;
65    }
66
67    let last_update: Arc<Mutex<Instant>> = Arc::new(Mutex::new(Instant::now()));
68
69    {
70        let last_update = last_update.clone();
71        std::thread::spawn(move || {
72            info!("Starting thread stall monitor watchdog thread");
73            let mut stall_duration = None;
74
75            loop {
76                std::thread::sleep(MONITOR_INTERVAL);
77                let now = Instant::now();
78                let last_update = *last_update.lock().unwrap();
79                let time_since_last_update = now - last_update;
80                if time_since_last_update > ALERT_THRESHOLD {
81                    if stall_duration.is_none() {
82                        thread_monitor_report_stall(time_since_last_update.as_millis() as u64);
83                    }
84                    stall_duration = Some(time_since_last_update);
85                } else if let Some(dur) = stall_duration {
86                    stall_duration = None;
87                    thread_monitor_report_stall_cleared(dur.as_millis() as u64);
88                    if let Some(metrics) = get_metrics() {
89                        metrics.thread_stall_duration_sec.observe(dur.as_secs_f64());
90                    }
91                }
92            }
93        });
94    }
95
96    spawn_logged_monitored_task!(
97        async move {
98            info!("Starting thread stall monitor update task");
99            loop {
100                tokio::time::sleep(MONITOR_INTERVAL).await;
101                *last_update.lock().unwrap() = Instant::now();
102            }
103        },
104        "ThreadStallMonitor"
105    );
106}