sui_indexer_alt_framework/ingestion/remote_client.rs
// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use reqwest::{Client, StatusCode};
use std::time::Duration;
use tracing::{debug, error};
use url::Url;

use crate::ingestion::client::{FetchData, FetchError, FetchResult, IngestionClientTrait};
use crate::ingestion::Result as IngestionResult;

/// Default timeout for remote checkpoint fetches.
/// This prevents requests from hanging indefinitely due to network issues,
/// unresponsive servers, or other connection problems.
const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(120);

#[derive(thiserror::Error, Debug, Eq, PartialEq)]
pub enum HttpError {
    #[error("HTTP error with status code: {0}")]
    Http(StatusCode),
}

fn status_code_to_error(code: StatusCode) -> anyhow::Error {
    HttpError::Http(code).into()
}
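
/// A client for fetching checkpoint data over HTTP from a remote store that serves
/// `/epochs.json` and `/<checkpoint-sequence-number>.chk` objects under a base URL.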
pub struct RemoteIngestionClient {
    url: Url,
    client: Client,
}

impl RemoteIngestionClient {
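    /// Construct a client for the checkpoint store rooted at `url`, using the default request
    /// timeout.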
    pub fn new(url: Url) -> IngestionResult<Self> {
        Ok(Self {
            url,
            client: Client::builder().timeout(DEFAULT_REQUEST_TIMEOUT).build()?,
        })
    }

    #[cfg(test)]
    pub fn new_with_timeout(url: Url, timeout: Duration) -> IngestionResult<Self> {
        Ok(Self {
            url,
            client: Client::builder().timeout(timeout).build()?,
        })
    }

    /// Fetch metadata mapping epoch IDs to the sequence numbers of their last checkpoints.
    /// The response is a JSON-encoded array of checkpoint sequence numbers.
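    ///
    /// A minimal sketch of decoding the response (the store URL is hypothetical, and decoding
    /// assumes reqwest's `json` feature is enabled):
    ///
    /// ```ignore
    /// let client = RemoteIngestionClient::new(Url::parse("https://checkpoints.example.com")?)?;
    /// let last_checkpoints: Vec<u64> = client.end_of_epoch_checkpoints().await?.json().await?;
    /// ```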
    pub async fn end_of_epoch_checkpoints(&self) -> reqwest::Result<reqwest::Response> {
        // SAFETY: The path being joined is statically known to be valid.
        let url = self
            .url
            .join("/epochs.json")
            .expect("Unexpected invalid URL");

        self.client.get(url).send().await
    }

    /// Fetch the bytes for a checkpoint by its sequence number.
    /// The response is the serialized representation of a checkpoint, as raw bytes.
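    ///
    /// For example, with a store rooted at `https://checkpoints.example.com` (a hypothetical
    /// URL), checkpoint `42` would be fetched from `https://checkpoints.example.com/42.chk`.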
    pub async fn checkpoint(&self, checkpoint: u64) -> reqwest::Result<reqwest::Response> {
        // SAFETY: The path being joined is statically known to be valid.
        let url = self
            .url
            .join(&format!("/{checkpoint}.chk"))
            .expect("Unexpected invalid URL");

        self.client.get(url).send().await
    }
}

#[async_trait::async_trait]
impl IngestionClientTrait for RemoteIngestionClient {
    /// Fetch a checkpoint from the remote store.
    ///
    /// Transient errors include:
    ///
    /// - failures to issue a request (network errors, redirect issues, etc.),
    /// - request timeouts,
    /// - rate limiting,
    /// - server errors (5xx),
    /// - issues getting a full response.
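    ///
    /// A sketch of how a caller might react to each outcome (the `store` binding is
    /// hypothetical):
    ///
    /// ```ignore
    /// match store.fetch(42).await {
    ///     Ok(FetchData::Raw(bytes)) => { /* hand the raw bytes off for deserialization */ }
    ///     Ok(_) => { /* any other payload representation */ }
    ///     Err(FetchError::NotFound) => { /* the checkpoint is not available (yet) */ }
    ///     Err(FetchError::Transient { reason, .. }) => { /* retry; `reason` labels metrics */ }
    ///     Err(_) => { /* anything else is treated as permanent */ }
    /// }
    /// ```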
    async fn fetch(&self, checkpoint: u64) -> FetchResult {
        let response = self
            .checkpoint(checkpoint)
            .await
            .map_err(|e| FetchError::Transient {
                reason: "request",
                error: e.into(),
            })?;

        match response.status() {
            code if code.is_success() => {
                // Failure to extract all the bytes from the payload, or to deserialize the
                // checkpoint from them, is considered a transient error -- the store being
                // fetched from needs to be corrected, and ingestion will keep retrying it
                // until it is.
                response
                    .bytes()
                    .await
                    .map_err(|e| FetchError::Transient {
                        reason: "bytes",
                        error: e.into(),
                    })
                    .map(FetchData::Raw)
            }

            // Treat 404s as a special case so we can match on this error type.
            code @ StatusCode::NOT_FOUND => {
                debug!(checkpoint, %code, "Checkpoint not found");
                Err(FetchError::NotFound)
            }

            // Timeouts are a client error but they are usually transient.
            code @ StatusCode::REQUEST_TIMEOUT => Err(FetchError::Transient {
                reason: "timeout",
                error: status_code_to_error(code),
            }),

            // Rate limiting is also a client error, but the backoff will eventually widen the
            // interval appropriately.
            code @ StatusCode::TOO_MANY_REQUESTS => Err(FetchError::Transient {
                reason: "too_many_requests",
                error: status_code_to_error(code),
            }),

            // Assume that if the server is facing difficulties, it will recover eventually.
            code if code.is_server_error() => Err(FetchError::Transient {
                reason: "server_error",
                error: status_code_to_error(code),
            }),

            // Still retry on other unsuccessful codes, but the reason is unclear.
            code => Err(FetchError::Transient {
                reason: "unknown",
                error: status_code_to_error(code),
            }),
        }
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use crate::ingestion::client::IngestionClient;
    use crate::ingestion::error::Error;
    use crate::ingestion::test_utils::test_checkpoint_data;
    use crate::metrics::tests::test_metrics;
    use axum::http::StatusCode;
    use std::sync::{
        atomic::{AtomicUsize, Ordering},
        Mutex,
    };
    use wiremock::{
        matchers::{method, path_regex},
        Mock, MockServer, Request, Respond, ResponseTemplate,
    };

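    /// Mount `response` on `server` as the handler for GET requests to any
    /// `/<sequence-number>.chk` path.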
    pub(crate) async fn respond_with(server: &MockServer, response: impl Respond + 'static) {
        Mock::given(method("GET"))
            .and(path_regex(r"/\d+.chk"))
            .respond_with(response)
            .mount(server)
            .await;
    }

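    /// Build a response template carrying the given status code.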
    pub(crate) fn status(code: StatusCode) -> ResponseTemplate {
        ResponseTemplate::new(code.as_u16())
    }

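    /// Construct an `IngestionClient` backed by a `RemoteIngestionClient` pointed at the mock
    /// server's URI.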
    fn remote_test_client(uri: String) -> IngestionClient {
        IngestionClient::new_remote(Url::parse(&uri).unwrap(), test_metrics()).unwrap()
    }

    #[tokio::test]
    async fn fail_on_not_found() {
        let server = MockServer::start().await;
        respond_with(&server, status(StatusCode::NOT_FOUND)).await;

        let client = remote_test_client(server.uri());
        let error = client.fetch(42).await.unwrap_err();
        assert!(matches!(error, Error::NotFound(42)));
    }

    /// Assume that failures to send the request to the remote store are due to temporary
    /// connectivity issues, and retry them.
    #[tokio::test]
    async fn retry_on_request_error() {
        let server = MockServer::start().await;

        let times: Mutex<u64> = Mutex::new(0);
        respond_with(&server, move |r: &Request| {
            let mut times = times.lock().unwrap();
            *times += 1;
            match (*times, r.url.path()) {
                // The first request will trigger a redirect to 0.chk no matter what the original
                // request was for -- triggering a request error.
                (1, _) => status(StatusCode::MOVED_PERMANENTLY).append_header("Location", "/0.chk"),

                // Set up checkpoint 0 as an infinite redirect loop.
                (_, "/0.chk") => {
                    status(StatusCode::MOVED_PERMANENTLY).append_header("Location", r.url.as_str())
                }

                // Subsequently, requests will succeed.
                _ => status(StatusCode::OK).set_body_bytes(test_checkpoint_data(42)),
            }
        })
        .await;

        let client = remote_test_client(server.uri());
        let checkpoint = client.fetch(42).await.unwrap();
        assert_eq!(42, checkpoint.checkpoint_summary.sequence_number)
    }

    /// Assume that certain errors will recover by themselves, and keep retrying with an
    /// exponential back-off. These errors include: 5xx (server) errors, 408 (timeout), and 429
    /// (rate limiting).
    #[tokio::test]
    async fn retry_on_transient_server_error() {
        let server = MockServer::start().await;

        let times: Mutex<u64> = Mutex::new(0);
        respond_with(&server, move |_: &Request| {
            let mut times = times.lock().unwrap();
            *times += 1;
            match *times {
                1 => status(StatusCode::INTERNAL_SERVER_ERROR),
                2 => status(StatusCode::REQUEST_TIMEOUT),
                3 => status(StatusCode::TOO_MANY_REQUESTS),
                _ => status(StatusCode::OK).set_body_bytes(test_checkpoint_data(42)),
            }
        })
        .await;

        let client = remote_test_client(server.uri());
        let checkpoint = client.fetch(42).await.unwrap();
        assert_eq!(42, checkpoint.checkpoint_summary.sequence_number)
    }

    /// Treat deserialization failure as another kind of transient error -- all checkpoint data
    /// that is fetched should be valid (deserializable as a `CheckpointData`).
    #[tokio::test]
    async fn retry_on_deserialization_error() {
        let server = MockServer::start().await;

        let times: Mutex<u64> = Mutex::new(0);
        respond_with(&server, move |_: &Request| {
            let mut times = times.lock().unwrap();
            *times += 1;
            if *times < 3 {
                status(StatusCode::OK).set_body_bytes(vec![])
            } else {
                status(StatusCode::OK).set_body_bytes(test_checkpoint_data(42))
            }
        })
        .await;

        let client = remote_test_client(server.uri());
        let checkpoint = client.fetch(42).await.unwrap();
        assert_eq!(42, checkpoint.checkpoint_summary.sequence_number)
    }

    /// Test that timeout errors are retried as transient errors.
    /// The first request will time out, the second will succeed.
    #[tokio::test]
    async fn retry_on_timeout() {
        use std::sync::Arc;

        let server = MockServer::start().await;
        let times: Arc<AtomicUsize> = Arc::new(AtomicUsize::new(0));
        let times_clone = times.clone();

        // The first request delays longer than the timeout; subsequent requests succeed
        // immediately.
        respond_with(&server, move |_: &Request| {
            match times_clone.fetch_add(1, Ordering::Relaxed) {
                0 => {
                    // Delay longer than our test timeout (2 seconds).
                    std::thread::sleep(std::time::Duration::from_secs(4));
                    status(StatusCode::OK).set_body_bytes(test_checkpoint_data(42))
                }
                _ => {
                    // Respond immediately on retry attempts.
                    status(StatusCode::OK).set_body_bytes(test_checkpoint_data(42))
                }
            }
        })
        .await;

        // Create a client with a 2 second timeout for testing.
        let ingestion_client = IngestionClient::new_remote_with_timeout(
            Url::parse(&server.uri()).unwrap(),
            Duration::from_secs(2),
            test_metrics(),
        )
        .unwrap();

        // This should time out once, then succeed on retry.
        let checkpoint = ingestion_client.fetch(42).await.unwrap();
        assert_eq!(42, checkpoint.checkpoint_summary.sequence_number);

        // Verify that the server received exactly 2 requests (1 timeout + 1 successful retry).
        let final_count = times.load(Ordering::Relaxed);
        assert_eq!(final_count, 2);
    }
}