// sui_indexer/handlers/committer.rs

// Copyright (c) Mysten Labs, Inc.
// SPDX-License-Identifier: Apache-2.0

use std::collections::{BTreeMap, HashMap};

use sui_types::messages_checkpoint::CheckpointSequenceNumber;
use tap::tap::TapFallible;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use tracing::{error, info};

use crate::metrics::IndexerMetrics;
use crate::models::raw_checkpoints::StoredRawCheckpoint;
use crate::store::IndexerStore;
use crate::types::IndexerResult;

use super::{CheckpointDataToCommit, CommitterTables, CommitterWatermark, EpochToCommit};

pub(crate) const CHECKPOINT_COMMIT_BATCH_SIZE: usize = 100;

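/// Receives indexed checkpoint data and commits it to the store strictly in checkpoint
/// sequence order, batching up to `CHECKPOINT_COMMIT_BATCH_SIZE` checkpoints (overridable via
/// the `CHECKPOINT_COMMIT_BATCH_SIZE` env var) and cutting a batch early at epoch boundaries.
/// The task exits on cancellation, or once `end_checkpoint_opt` has been committed, if set.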
pub async fn start_tx_checkpoint_commit_task<S>(
    state: S,
    metrics: IndexerMetrics,
    tx_indexing_receiver: mysten_metrics::metered_channel::Receiver<CheckpointDataToCommit>,
    cancel: CancellationToken,
    mut next_checkpoint_sequence_number: CheckpointSequenceNumber,
    end_checkpoint_opt: Option<CheckpointSequenceNumber>,
) -> IndexerResult<()>
where
    S: IndexerStore + Clone + Sync + Send + 'static,
{
    use futures::StreamExt;

    info!("Indexer checkpoint commit task started...");
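    // The default batch size can be overridden via the CHECKPOINT_COMMIT_BATCH_SIZE env var.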
    let checkpoint_commit_batch_size = std::env::var("CHECKPOINT_COMMIT_BATCH_SIZE")
        .unwrap_or_else(|_| CHECKPOINT_COMMIT_BATCH_SIZE.to_string())
        .parse::<usize>()
        .expect("CHECKPOINT_COMMIT_BATCH_SIZE must be a valid usize");
    info!("Using checkpoint commit batch size {checkpoint_commit_batch_size}");

    let mut stream = mysten_metrics::metered_channel::ReceiverStream::new(tx_indexing_receiver)
        .ready_chunks(checkpoint_commit_batch_size);

    let mut unprocessed = HashMap::new();
    let mut batch = vec![];

    while let Some(indexed_checkpoint_batch) = stream.next().await {
        if cancel.is_cancelled() {
            break;
        }

        // Buffer out-of-order checkpoints, and split the batch into smaller batches per epoch to
        // handle partitioning.
        for checkpoint in indexed_checkpoint_batch {
            unprocessed.insert(checkpoint.checkpoint.sequence_number, checkpoint);
        }
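        // Drain buffered checkpoints strictly in sequence order; stop at the first gap and wait
        // for more input.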
        while let Some(checkpoint) = unprocessed.remove(&next_checkpoint_sequence_number) {
            let epoch = checkpoint.epoch.clone();
            batch.push(checkpoint);
            next_checkpoint_sequence_number += 1;
            let epoch_number_option = epoch.as_ref().map(|epoch| epoch.new_epoch_id());
            // The batch will consist of contiguous checkpoints and at most one epoch boundary at
            // the end.
            if batch.len() == checkpoint_commit_batch_size || epoch.is_some() {
                commit_checkpoints(&state, batch, epoch, &metrics).await;
                batch = vec![];
            }
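            // At an epoch boundary, upload the display table before processing the new epoch's
            // checkpoints.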
            if let Some(epoch_number) = epoch_number_option {
                state.upload_display(epoch_number).await.tap_err(|e| {
                    error!(
                        "Failed to upload display table before epoch {} with error: {}",
                        epoch_number,
                        e.to_string()
                    );
                })?;
            }
            // stop adding to the commit batch if we've reached the end checkpoint
            if let Some(end_checkpoint_sequence_number) = end_checkpoint_opt {
                if next_checkpoint_sequence_number > end_checkpoint_sequence_number {
                    break;
                }
            }
        }
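        // Flush any leftover checkpoints that did not fill a complete batch.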
        if !batch.is_empty() {
            commit_checkpoints(&state, batch, None, &metrics).await;
            batch = vec![];
        }

        // stop the commit task if we've reached the end checkpoint
        if let Some(end_checkpoint_sequence_number) = end_checkpoint_opt {
            if next_checkpoint_sequence_number > end_checkpoint_sequence_number {
                break;
            }
        }
    }
    Ok(())
}

/// Writes indexed checkpoint data to the database, and then updates watermark upper bounds and
/// metrics. Expects `indexed_checkpoint_batch` to be non-empty and to contain contiguous checkpoints.
/// There can be at most one epoch boundary at the end. If an epoch boundary is detected,
/// epoch-partitioned tables must be advanced.
// Unwrap: Caller needs to make sure indexed_checkpoint_batch is not empty
#[instrument(skip_all, fields(
    first = indexed_checkpoint_batch.first().as_ref().unwrap().checkpoint.sequence_number,
    last = indexed_checkpoint_batch.last().as_ref().unwrap().checkpoint.sequence_number
))]
async fn commit_checkpoints<S>(
    state: &S,
    indexed_checkpoint_batch: Vec<CheckpointDataToCommit>,
    epoch: Option<EpochToCommit>,
    metrics: &IndexerMetrics,
) where
    S: IndexerStore + Clone + Sync + Send + 'static,
{
    let mut checkpoint_batch = vec![];
    let mut tx_batch = vec![];
    let mut events_batch = vec![];
    let mut tx_indices_batch = vec![];
    let mut event_indices_batch = vec![];
    let mut display_updates_batch = BTreeMap::new();
    let mut object_changes_batch = vec![];
    let mut object_history_changes_batch = vec![];
    let mut object_versions_batch = vec![];
    let mut packages_batch = vec![];

    for indexed_checkpoint in indexed_checkpoint_batch {
        let CheckpointDataToCommit {
            checkpoint,
            transactions,
            events,
            event_indices,
            tx_indices,
            display_updates,
            object_changes,
            object_history_changes,
            object_versions,
            packages,
            epoch: _,
        } = indexed_checkpoint;

        tx_batch.push(transactions);
        events_batch.push(events);
        tx_indices_batch.push(tx_indices);
        event_indices_batch.push(event_indices);
        display_updates_batch.extend(display_updates.into_iter());
        object_changes_batch.push(object_changes);
        object_versions_batch.push(object_versions);
        object_history_changes_batch.push(object_history_changes);
        checkpoint_batch.push(checkpoint);
        packages_batch.push(packages);
    }

    let first_checkpoint_seq = checkpoint_batch.first().unwrap().sequence_number;
    let last_checkpoint = checkpoint_batch.last().unwrap();
    let committer_watermark = CommitterWatermark::from(last_checkpoint);

    let guard = metrics.checkpoint_db_commit_latency.start_timer();
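    // Flatten the per-checkpoint vectors into single batches for bulk insertion.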
    let tx_batch = tx_batch.into_iter().flatten().collect::<Vec<_>>();
    let tx_indices_batch = tx_indices_batch.into_iter().flatten().collect::<Vec<_>>();
    let events_batch = events_batch.into_iter().flatten().collect::<Vec<_>>();
    let event_indices_batch = event_indices_batch
        .into_iter()
        .flatten()
        .collect::<Vec<_>>();
    let object_versions_batch = object_versions_batch
        .into_iter()
        .flatten()
        .collect::<Vec<_>>();
    let packages_batch = packages_batch.into_iter().flatten().collect::<Vec<_>>();
    let checkpoint_num = checkpoint_batch.len();
    let tx_count = tx_batch.len();
    let raw_checkpoints_batch = checkpoint_batch
        .iter()
        .map(|c| c.into())
        .collect::<Vec<StoredRawCheckpoint>>();

    {
        let _step_1_guard: prometheus::HistogramTimer =
            metrics.checkpoint_db_commit_latency_step_1.start_timer();

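        // These writes target independent tables, so they can all run concurrently.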
        let mut persist_tasks = vec![
            state.persist_packages(packages_batch),
            state.persist_object_history(object_history_changes_batch.clone()),
            state.persist_transactions(tx_batch),
            state.persist_tx_indices(tx_indices_batch),
            state.persist_events(events_batch),
            state.persist_event_indices(event_indices_batch),
            state.persist_displays(display_updates_batch),
            state.persist_objects(object_changes_batch.clone()),
            state.persist_full_objects_history(object_history_changes_batch.clone()),
            state.persist_objects_version(object_versions_batch.clone()),
            state.persist_raw_checkpoints(raw_checkpoints_batch),
        ];

        if let Some(epoch_data) = epoch.clone() {
            persist_tasks.push(state.persist_epoch(epoch_data));
        }
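        // Wait for every task, log any individual failure, then panic if any write failed.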
        futures::future::join_all(persist_tasks)
            .await
            .into_iter()
            .map(|res| {
                if res.is_err() {
                    error!("Failed to persist data with error: {:?}", res);
                }
                res
            })
            .collect::<IndexerResult<Vec<_>>>()
            .expect("Persisting data into DB should not fail.");
    }

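    // Record whether this batch ends an epoch before `epoch` is consumed below.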
    let is_epoch_end = epoch.is_some();

    // On epoch boundary, we need to modify the existing partitions' upper bound, and introduce a
    // new partition for incoming data for the upcoming epoch.
    if let Some(epoch_data) = epoch {
        state
            .advance_epoch(epoch_data)
            .await
            .tap_err(|e| {
                error!("Failed to advance epoch with error: {}", e.to_string());
            })
            .expect("Advancing epochs in DB should not fail.");
        metrics.total_epoch_committed.inc();
    }

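    // Persist checkpoints only after all of their associated data has been committed above.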
    state
        .persist_checkpoints(checkpoint_batch)
        .await
        .tap_err(|e| {
            error!(
                "Failed to persist checkpoint data with error: {}",
                e.to_string()
            );
        })
        .expect("Persisting data into DB should not fail.");

    if is_epoch_end {
        // The epoch has advanced, so update the configs for the new protocol version, if it has changed.
        let chain_id = state
            .get_chain_identifier()
            .await
            .expect("Failed to get chain identifier")
            .expect("Chain identifier should have been indexed at this point");
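        // Best-effort write; the result is deliberately discarded.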
        let _ = state
            .persist_protocol_configs_and_feature_flags(chain_id)
            .await;
    }

    state
        .update_watermarks_upper_bound::<CommitterTables>(committer_watermark)
        .await
        .tap_err(|e| {
            error!(
                "Failed to update watermark upper bound with error: {}",
                e.to_string()
            );
        })
        .expect("Updating watermark upper bound in DB should not fail.");

    let elapsed = guard.stop_and_record();

    info!(
        elapsed,
        "Checkpoint {}-{} committed with {} transactions.",
        first_checkpoint_seq,
        committer_watermark.checkpoint_hi_inclusive,
        tx_count,
    );
    metrics
        .latest_tx_checkpoint_sequence_number
        .set(committer_watermark.checkpoint_hi_inclusive as i64);
    metrics
        .total_tx_checkpoint_committed
        .inc_by(checkpoint_num as u64);
    metrics.total_transaction_committed.inc_by(tx_count as u64);
    metrics.transaction_per_checkpoint.observe(
        tx_count as f64
            / (committer_watermark.checkpoint_hi_inclusive - first_checkpoint_seq + 1) as f64,
    );
    // 1000.0 is not necessarily the batch size; it is used to roughly map average tx commit latency to [0.1, 1] seconds,
    // which is well covered by DB_COMMIT_LATENCY_SEC_BUCKETS.
    metrics
        .thousand_transaction_avg_db_commit_latency
        .observe(elapsed * 1000.0 / tx_count as f64);
}