Skip to main content

sui_rpc/light_client/
retry.rs

1//! Exponential-backoff retry primitives for the light-client ratchet.
2//!
3//! [`step`] is the building block: a call site that wants retry
4//! semantics inlines its own request/await loop and calls [`step`] on
5//! each error to decide whether to sleep and reissue or propagate.
6//! The retry decision is centralized so all ratchet RPCs share the
7//! same retryable-status set and backoff schedule.
8//!
9//! Why not a higher-order `with_backoff(closure)` helper? The natural
10//! shape — `FnMut() -> impl Future<…>` — doesn't compose with
11//! borrowed RPC state. `LedgerServiceClient<BoxedChannel>` isn't
12//! `Clone` (its `BoxService` backend isn't), and `client: &mut Client`
13//! gets re-borrowed inside the closure body, which can't escape the
14//! `FnMut` invocation. Switching to `AsyncFnMut` works at the type
15//! level but the resulting future can't satisfy `Send` for
16//! `tokio::spawn` from the events streaming task. Inlining the loop
17//! and delegating only the retry decision to [`step`] dodges both.
18
19use std::time::Duration;
20
21use super::RatchetConfig;
22use super::error::LightClientError;
23
24/// Decide whether the current error should be retried.
25///
26/// If `err` is a transient RPC failure and `attempt` is still under
27/// `config.max_retries`, sleep for the backoff delay, bump `attempt`,
28/// and return `Ok(())` — the caller should reissue. Otherwise
29/// propagate `err` unchanged.
30pub(crate) async fn step(
31    config: &RatchetConfig,
32    err: LightClientError,
33    attempt: &mut u32,
34) -> Result<(), LightClientError> {
35    if !is_retryable(&err) || *attempt >= config.max_retries {
36        return Err(err);
37    }
38    tokio::time::sleep(backoff_delay(config, *attempt)).await;
39    *attempt = attempt.saturating_add(1);
40    Ok(())
41}
42
43/// True if `err` is a transient gRPC failure that's worth retrying.
44///
45/// The retryable codes match upstream's reference light client:
46/// network blips and load-shedding statuses, not protocol violations.
47fn is_retryable(err: &LightClientError) -> bool {
48    let LightClientError::Rpc(status) = err else {
49        return false;
50    };
51    matches!(
52        status.code(),
53        tonic::Code::Unavailable
54            | tonic::Code::DeadlineExceeded
55            | tonic::Code::ResourceExhausted
56            | tonic::Code::Aborted
57    )
58}
59
60fn backoff_delay(config: &RatchetConfig, attempt: u32) -> Duration {
61    let shift = attempt.min(20);
62    let base = config
63        .base_retry_delay
64        .saturating_mul(1u32 << shift)
65        .min(config.max_retry_delay);
66    base.saturating_add(pseudo_jitter(attempt, config.retry_jitter))
67}
68
/// Per-call jitter in `[0, ceiling)`, at millisecond granularity.
///
/// Keeps concurrent retriers from landing on the same instant without
/// bringing in a `rand` dependency for a single call. The offset is
/// seeded from a freshly constructed, randomly keyed
/// [`std::collections::hash_map::RandomState`], so two callers on the
/// same `attempt` get different offsets. Deriving the offset from
/// `attempt` alone would hand every concurrent retrier the identical
/// value (and exactly zero on attempt 0), which defeats the purpose of
/// jitter.
///
/// Returns [`Duration::ZERO`] when `ceiling` is zero. A non-zero ceiling
/// below one millisecond also yields zero, because the offset is a whole
/// number of milliseconds strictly below the ceiling.
fn pseudo_jitter(attempt: u32, ceiling: Duration) -> Duration {
    if ceiling.is_zero() {
        return Duration::ZERO;
    }

    // Fully qualified trait calls keep this block free of new `use` items.
    let keys = std::collections::hash_map::RandomState::new();
    let mut hasher = std::hash::BuildHasher::build_hasher(&keys);
    // Fold the attempt in as well so consecutive retries from one caller
    // differ even if the key material were ever to repeat.
    std::hash::Hasher::write_u32(&mut hasher, attempt);
    let seed = std::hash::Hasher::finish(&hasher);

    // `max(1)` guards the modulo against a sub-millisecond ceiling, whose
    // `as_millis()` is 0.
    let ceiling_ms = ceiling.as_millis().max(1);
    let offset_ms = u128::from(seed) % ceiling_ms;
    // A remainder never exceeds its dividend, and the dividend came from a
    // `u64`, so this narrowing cast cannot truncate.
    Duration::from_millis(offset_ms as u64)
}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Config whose base delay, delay cap, and jitter are all zero, so
    /// `step` never actually waits.
    fn no_sleep_config(max_retries: u32) -> RatchetConfig {
        let zero = Duration::ZERO;
        RatchetConfig {
            max_retries,
            base_retry_delay: zero,
            max_retry_delay: zero,
            retry_jitter: zero,
            ..RatchetConfig::default()
        }
    }

    /// A fresh retryable (`Unavailable`) RPC error.
    fn transient() -> LightClientError {
        LightClientError::Rpc(tonic::Status::unavailable("down"))
    }

    /// Retries are granted until the configured cap is reached; the next
    /// failure is handed back and the counter stays at the cap.
    #[tokio::test]
    async fn step_retries_until_cap_then_surfaces_last_error() {
        let config = no_sleep_config(3);
        let mut attempt = 0u32;

        for _round in 0..3 {
            let outcome = step(&config, transient(), &mut attempt).await;
            outcome.expect("retry allowed");
        }

        // `attempt` now equals `max_retries`, so this one must be refused.
        let refused = step(&config, transient(), &mut attempt).await;
        assert!(matches!(refused, Err(LightClientError::Rpc(_))));
        assert_eq!(attempt, 3);
    }

    /// An error outside the retryable set comes straight back and leaves
    /// the attempt counter untouched.
    #[tokio::test]
    async fn step_passes_non_retryable_errors_through() {
        let config = no_sleep_config(5);
        let mut attempt = 0u32;
        let permanent = LightClientError::Rpc(tonic::Status::invalid_argument("nope"));

        let outcome = step(&config, permanent, &mut attempt).await;

        assert!(matches!(outcome, Err(LightClientError::Rpc(_))));
        assert_eq!(attempt, 0);
    }

    /// With `max_retries = 0` even the very first failure is refused.
    #[tokio::test]
    async fn step_disabled_retry_refuses_immediately() {
        let config = no_sleep_config(0);
        let mut attempt = 0u32;

        let outcome = step(&config, transient(), &mut attempt).await;

        assert!(matches!(outcome, Err(LightClientError::Rpc(_))));
        assert_eq!(attempt, 0);
    }

    /// The delay doubles per attempt and then plateaus at
    /// `max_retry_delay`.
    #[test]
    fn backoff_delay_caps_at_max() {
        let config = RatchetConfig {
            base_retry_delay: Duration::from_millis(100),
            max_retry_delay: Duration::from_millis(800),
            retry_jitter: Duration::ZERO,
            ..RatchetConfig::default()
        };

        // Attempts 0..=3 double from 100ms up to exactly the 800ms cap;
        // attempt 4 would be 1600ms uncapped and is clamped to 800ms.
        let expected_ms = [100u64, 200, 400, 800, 800];
        for (attempt, ms) in (0u32..).zip(expected_ms) {
            assert_eq!(backoff_delay(&config, attempt), Duration::from_millis(ms));
        }
    }
}