sui_core/authority/
consensus_quarantine.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use crate::authority::authority_per_epoch_store::{
5    AuthorityEpochTables, EncG, ExecutionIndicesWithStatsV2, LockDetails, LockDetailsWrapper, PkG,
6};
7use crate::authority::transaction_deferral::DeferralKey;
8use crate::checkpoints::BuilderCheckpointSummary;
9use crate::epoch::randomness::SINGLETON_KEY;
10use dashmap::DashMap;
11use fastcrypto_tbls::{dkg_v1, nodes::PartyId};
12use fastcrypto_zkp::bn254::zk_login::{JWK, JwkId};
13use moka::policy::EvictionPolicy;
14use moka::sync::SegmentedCache as MokaCache;
15use mysten_common::ZipDebugEqIteratorExt;
16use mysten_common::fatal;
17use mysten_common::random_util::randomize_cache_capacity_in_tests;
18use parking_lot::Mutex;
19use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque, hash_map};
20use sui_types::authenticator_state::ActiveJwk;
21use sui_types::base_types::{AuthorityName, ObjectRef, SequenceNumber};
22use sui_types::crypto::RandomnessRound;
23use sui_types::error::SuiResult;
24use sui_types::executable_transaction::{
25    TrustedExecutableTransactionWithAliases, VerifiedExecutableTransactionWithAliases,
26};
27use sui_types::execution::ExecutionTimeObservationKey;
28use sui_types::messages_checkpoint::{CheckpointContents, CheckpointSequenceNumber};
29use sui_types::messages_consensus::AuthorityIndex;
30use sui_types::{
31    base_types::{ConsensusObjectSequenceKey, ObjectID},
32    digests::TransactionDigest,
33    messages_consensus::{Round, TimestampMs, VersionedDkgConfirmation},
34    signature::GenericSignature,
35};
36use tracing::{debug, info};
37use typed_store::Map;
38use typed_store::rocks::DBBatch;
39
40use crate::{
41    authority::{
42        authority_per_epoch_store::AuthorityPerEpochStore,
43        shared_object_congestion_tracker::CongestionPerObjectDebt,
44    },
45    checkpoints::{CheckpointHeight, PendingCheckpoint, PendingCheckpointV2},
46    consensus_handler::SequencedConsensusTransactionKey,
47    epoch::{
48        randomness::{VersionedProcessedMessage, VersionedUsedProcessedMessages},
49        reconfiguration::ReconfigState,
50    },
51};
52
53use super::*;
54
/// Accumulates all state produced while processing a single consensus commit.
///
/// An instance is filled in by consensus handling, held in memory inside
/// `ConsensusOutputQuarantine`, and only persisted (via `write_to_batch`)
/// once the checkpoints covering the commit have been certified.
#[derive(Default)]
#[allow(clippy::type_complexity)]
pub(crate) struct ConsensusCommitOutput {
    // Consensus and reconfig state
    // Round this output was produced in; also stamped into congestion debts on write-out.
    consensus_round: Round,
    consensus_messages_processed: BTreeSet<SequencedConsensusTransactionKey>,
    // Authorities whose EndOfPublish message was observed in this commit.
    end_of_publish: BTreeSet<AuthorityName>,
    reconfig_state: Option<ReconfigState>,
    // Must be set before write_to_batch is called (it panics otherwise).
    consensus_commit_stats: Option<ExecutionIndicesWithStatsV2>,

    // transaction scheduling state
    next_shared_object_versions: Option<HashMap<ConsensusObjectSequenceKey, SequenceNumber>>,

    // Transactions deferred by this commit, grouped by deferral key.
    deferred_txns: Vec<(DeferralKey, Vec<VerifiedExecutableTransactionWithAliases>)>,
    // Keys of previously-deferred transactions that were loaded (and consumed) by this commit.
    deleted_deferred_txns: BTreeSet<DeferralKey>,

    // checkpoint state
    pending_checkpoints: Vec<PendingCheckpoint>,
    pending_checkpoints_v2: Vec<PendingCheckpointV2>,

    // random beacon state
    // Randomness round reserved by this commit, with the commit's timestamp.
    next_randomness_round: Option<(RandomnessRound, TimestampMs)>,

    dkg_confirmations: BTreeMap<PartyId, VersionedDkgConfirmation>,
    dkg_processed_messages: BTreeMap<PartyId, VersionedProcessedMessage>,
    dkg_used_message: Option<VersionedUsedProcessedMessages>,
    dkg_output: Option<dkg_v1::Output<PkG, EncG>>,

    // jwk state
    pending_jwks: BTreeSet<(AuthorityName, JwkId, JWK)>,
    // (round, jwk) pairs; write_to_batch asserts the round equals this commit's
    // last_committed_round.
    active_jwks: BTreeSet<(u64, (JwkId, JWK))>,

    // congestion control state
    congestion_control_object_debts: Vec<(ObjectID, u64)>,
    congestion_control_randomness_object_debts: Vec<(ObjectID, u64)>,
    execution_time_observations: Vec<(
        AuthorityIndex,
        u64, /* generation */
        Vec<(ExecutionTimeObservationKey, Duration)>,
    )>,

    // Owned object locks acquired post-consensus.
    owned_object_locks: HashMap<ObjectRef, LockDetails>,

    // True when the checkpoint queue had no pending roots after this commit's flush.
    // Used by quarantine to determine safe commit boundaries on restart.
    checkpoint_queue_drained: bool,
}
103
impl ConsensusCommitOutput {
    /// Creates an empty output for the given consensus round; all other fields
    /// start at their defaults and are filled in during commit processing.
    pub fn new(consensus_round: Round) -> Self {
        Self {
            consensus_round,
            ..Default::default()
        }
    }

    /// Keys of previously-deferred transactions consumed by this commit.
    pub fn get_deleted_deferred_txn_keys(&self) -> impl Iterator<Item = DeferralKey> + use<'_> {
        self.deleted_deferred_txns.iter().cloned()
    }

    /// True if this commit deferred any transactions.
    pub fn has_deferred_transactions(&self) -> bool {
        !self.deferred_txns.is_empty()
    }

    /// Timestamp of the randomness round reserved by this commit, if one was reserved.
    fn get_randomness_last_round_timestamp(&self) -> Option<TimestampMs> {
        self.next_randomness_round.as_ref().map(|(_, ts)| *ts)
    }

    /// Height of the last pending checkpoint pushed for this commit, if any.
    /// Relies on checkpoints being inserted in height order.
    fn get_highest_pending_checkpoint_height(&self) -> Option<CheckpointHeight> {
        self.pending_checkpoints.last().map(|cp| cp.height())
    }

    /// Pending checkpoints with height strictly greater than `last`
    /// (all of them when `last` is None).
    fn get_pending_checkpoints(
        &self,
        last: Option<CheckpointHeight>,
    ) -> impl Iterator<Item = &PendingCheckpoint> {
        self.pending_checkpoints.iter().filter(move |cp| {
            if let Some(last) = last {
                cp.height() > last
            } else {
                true
            }
        })
    }

    /// True if a pending checkpoint with exactly this height was recorded.
    fn pending_checkpoint_exists(&self, index: &CheckpointHeight) -> bool {
        self.pending_checkpoints
            .iter()
            .any(|cp| cp.height() == *index)
    }

    /// V2 analogue of `get_pending_checkpoints`.
    fn get_pending_checkpoints_v2(
        &self,
        last: Option<CheckpointHeight>,
    ) -> impl Iterator<Item = &PendingCheckpointV2> {
        self.pending_checkpoints_v2.iter().filter(move |cp| {
            if let Some(last) = last {
                cp.height() > last
            } else {
                true
            }
        })
    }

    /// V2 analogue of `pending_checkpoint_exists`.
    fn pending_checkpoint_exists_v2(&self, index: &CheckpointHeight) -> bool {
        self.pending_checkpoints_v2
            .iter()
            .any(|cp| cp.height() == *index)
    }

    /// Last committed consensus round recorded in the commit stats, if set.
    fn get_round(&self) -> Option<u64> {
        self.consensus_commit_stats
            .as_ref()
            .map(|stats| stats.index.last_committed_round)
    }

    /// Records that `authority` sent EndOfPublish in this commit.
    pub fn insert_end_of_publish(&mut self, authority: AuthorityName) {
        self.end_of_publish.insert(authority);
    }

    /// Queues a batch of execution-time estimates reported by `source` for persistence.
    pub fn insert_execution_time_observation(
        &mut self,
        source: AuthorityIndex,
        generation: u64,
        estimates: Vec<(ExecutionTimeObservationKey, Duration)>,
    ) {
        self.execution_time_observations
            .push((source, generation, estimates));
    }

    /// Sets the commit stats. Required before `write_to_batch` can succeed.
    pub(crate) fn record_consensus_commit_stats(&mut self, stats: ExecutionIndicesWithStatsV2) {
        self.consensus_commit_stats = Some(stats);
    }

    // in testing code we often need to write to the db outside of a consensus commit
    pub(crate) fn set_default_commit_stats_for_testing(&mut self) {
        self.record_consensus_commit_stats(Default::default());
    }

    /// Stores the (possibly updated) reconfiguration state for this commit.
    pub fn store_reconfig_state(&mut self, state: ReconfigState) {
        self.reconfig_state = Some(state);
    }

    /// Marks a sequenced consensus transaction as processed by this commit.
    pub fn record_consensus_message_processed(&mut self, key: SequencedConsensusTransactionKey) {
        self.consensus_messages_processed.insert(key);
    }

    /// All consensus transaction keys processed by this commit.
    pub fn get_consensus_messages_processed(
        &self,
    ) -> impl Iterator<Item = &SequencedConsensusTransactionKey> {
        self.consensus_messages_processed.iter()
    }

    /// Sets the next shared-object versions assigned by this commit.
    /// Panics if called twice for the same output.
    pub fn set_next_shared_object_versions(
        &mut self,
        next_versions: HashMap<ConsensusObjectSequenceKey, SequenceNumber>,
    ) {
        assert!(self.next_shared_object_versions.is_none());
        self.next_shared_object_versions = Some(next_versions);
    }

    /// Defers `transactions` under `key` for processing by a later commit.
    pub fn defer_transactions(
        &mut self,
        key: DeferralKey,
        transactions: Vec<VerifiedExecutableTransactionWithAliases>,
    ) {
        self.deferred_txns.push((key, transactions));
    }

    /// Records that previously-deferred transactions under these keys were
    /// loaded by this commit and should be deleted from the deferral tables.
    pub fn delete_loaded_deferred_transactions(&mut self, deferral_keys: &[DeferralKey]) {
        self.deleted_deferred_txns
            .extend(deferral_keys.iter().cloned());
    }

    /// Appends a pending checkpoint. Callers are expected to push in height order
    /// (see `get_highest_pending_checkpoint_height`).
    pub fn insert_pending_checkpoint(&mut self, checkpoint: PendingCheckpoint) {
        self.pending_checkpoints.push(checkpoint);
    }

    /// V2 analogue of `insert_pending_checkpoint`.
    pub fn insert_pending_checkpoint_v2(&mut self, checkpoint: PendingCheckpointV2) {
        self.pending_checkpoints_v2.push(checkpoint);
    }

    /// Reserves the next randomness round, stamped with the commit timestamp.
    /// Panics if a round was already reserved in this output.
    pub fn reserve_next_randomness_round(
        &mut self,
        next_randomness_round: RandomnessRound,
        commit_timestamp: TimestampMs,
    ) {
        assert!(self.next_randomness_round.is_none());
        self.next_randomness_round = Some((next_randomness_round, commit_timestamp));
    }

    /// Records a DKG confirmation, keyed (and deduplicated) by sender.
    pub fn insert_dkg_confirmation(&mut self, conf: VersionedDkgConfirmation) {
        self.dkg_confirmations.insert(conf.sender(), conf);
    }

    /// Records a processed DKG message, keyed (and deduplicated) by sender.
    pub fn insert_dkg_processed_message(&mut self, message: VersionedProcessedMessage) {
        self.dkg_processed_messages
            .insert(message.sender(), message);
    }

    /// Records the set of DKG messages used to build the aggregate.
    pub fn insert_dkg_used_messages(&mut self, used_messages: VersionedUsedProcessedMessages) {
        self.dkg_used_message = Some(used_messages);
    }

    /// Records the completed DKG output.
    pub fn set_dkg_output(&mut self, output: dkg_v1::Output<PkG, EncG>) {
        self.dkg_output = Some(output);
    }

    /// Records a JWK vote from `authority` for the given (id, jwk) pair.
    pub fn insert_pending_jwk(&mut self, authority: AuthorityName, id: JwkId, jwk: JWK) {
        self.pending_jwks.insert((authority, id, jwk));
    }

    /// Records a JWK that became active at `round`. `write_to_batch` asserts
    /// that `round` matches this commit's last_committed_round.
    pub fn insert_active_jwk(&mut self, round: u64, key: (JwkId, JWK)) {
        self.active_jwks.insert((round, key));
    }

    /// Replaces the per-object congestion debts accrued in this commit.
    pub fn set_congestion_control_object_debts(&mut self, object_debts: Vec<(ObjectID, u64)>) {
        self.congestion_control_object_debts = object_debts;
    }

    /// Randomness-path analogue of `set_congestion_control_object_debts`.
    pub fn set_congestion_control_randomness_object_debts(
        &mut self,
        object_debts: Vec<(ObjectID, u64)>,
    ) {
        self.congestion_control_randomness_object_debts = object_debts;
    }

    /// Marks whether the checkpoint queue had no pending roots after this
    /// commit's flush; consulted by the quarantine's V2 commit path.
    pub fn set_checkpoint_queue_drained(&mut self, drained: bool) {
        self.checkpoint_queue_drained = drained;
    }

    /// Sets the owned-object locks acquired post-consensus.
    /// Panics if locks were already set on this output.
    pub fn set_owned_object_locks(&mut self, locks: HashMap<ObjectRef, LockDetails>) {
        assert!(self.owned_object_locks.is_empty());
        self.owned_object_locks = locks;
    }

    /// Persists everything accumulated in this output into `batch` against the
    /// epoch tables. Consumes `self`; the batch is not written here — the
    /// caller decides when to commit it.
    ///
    /// # Panics
    /// Panics if `consensus_commit_stats` was never set, or if any active JWK's
    /// round differs from the commit's last_committed_round.
    pub fn write_to_batch(
        self,
        epoch_store: &AuthorityPerEpochStore,
        batch: &mut DBBatch,
    ) -> SuiResult {
        let tables = epoch_store.tables()?;
        batch.insert_batch(
            &tables.consensus_message_processed,
            self.consensus_messages_processed
                .iter()
                .map(|key| (key, true)),
        )?;

        batch.insert_batch(
            &tables.end_of_publish,
            self.end_of_publish.iter().map(|authority| (authority, ())),
        )?;

        if let Some(reconfig_state) = &self.reconfig_state {
            batch.insert_batch(
                &tables.reconfig_state,
                [(RECONFIG_STATE_INDEX, reconfig_state)],
            )?;
        }

        let consensus_commit_stats = self
            .consensus_commit_stats
            .expect("consensus_commit_stats must be set");
        // Captured before the stats are moved into the batch; used below to
        // validate active JWK rounds.
        let round = consensus_commit_stats.index.last_committed_round;

        batch.insert_batch(
            &tables.last_consensus_stats_v2,
            [(LAST_CONSENSUS_STATS_ADDR, consensus_commit_stats)],
        )?;

        if let Some(next_versions) = self.next_shared_object_versions {
            batch.insert_batch(&tables.next_shared_object_versions_v2, next_versions)?;
        }

        if !self.owned_object_locks.is_empty() {
            batch.insert_batch(
                &tables.owned_object_locked_transactions,
                self.owned_object_locks
                    .into_iter()
                    .map(|(obj_ref, lock)| (obj_ref, LockDetailsWrapper::from(lock))),
            )?;
        }

        // Consumed deferral keys are removed from all deferral tables
        // (including legacy v2 tables), but new deferrals are only written to
        // the current v3 table below.
        batch.delete_batch(
            &tables.deferred_transactions_v2,
            &self.deleted_deferred_txns,
        )?;
        batch.delete_batch(
            &tables.deferred_transactions_with_aliases_v2,
            &self.deleted_deferred_txns,
        )?;
        batch.delete_batch(
            &tables.deferred_transactions_with_aliases_v3,
            &self.deleted_deferred_txns,
        )?;

        batch.insert_batch(
            &tables.deferred_transactions_with_aliases_v3,
            self.deferred_txns.into_iter().map(|(key, txs)| {
                (
                    key,
                    txs.into_iter()
                        .map(|tx| {
                            let tx: TrustedExecutableTransactionWithAliases = tx.serializable();
                            tx
                        })
                        .collect::<Vec<_>>(),
                )
            }),
        )?;

        if let Some((round, commit_timestamp)) = self.next_randomness_round {
            batch.insert_batch(&tables.randomness_next_round, [(SINGLETON_KEY, round)])?;
            batch.insert_batch(
                &tables.randomness_last_round_timestamp,
                [(SINGLETON_KEY, commit_timestamp)],
            )?;
        }

        batch.insert_batch(&tables.dkg_confirmations_v2, self.dkg_confirmations)?;
        batch.insert_batch(
            &tables.dkg_processed_messages_v2,
            self.dkg_processed_messages,
        )?;
        batch.insert_batch(
            &tables.dkg_used_messages_v2,
            // using Option as iter
            self.dkg_used_message
                .into_iter()
                .map(|used_msgs| (SINGLETON_KEY, used_msgs)),
        )?;
        if let Some(output) = self.dkg_output {
            batch.insert_batch(&tables.dkg_output, [(SINGLETON_KEY, output)])?;
        }

        batch.insert_batch(
            &tables.pending_jwks,
            self.pending_jwks.into_iter().map(|j| (j, ())),
        )?;
        batch.insert_batch(
            &tables.active_jwks,
            self.active_jwks.into_iter().map(|j| {
                // TODO: we don't need to store the round in this map if it is invariant
                assert_eq!(j.0, round);
                (j, ())
            }),
        )?;

        batch.insert_batch(
            &tables.congestion_control_object_debts,
            self.congestion_control_object_debts
                .into_iter()
                .map(|(object_id, debt)| {
                    (
                        object_id,
                        CongestionPerObjectDebt::new(self.consensus_round, debt),
                    )
                }),
        )?;
        batch.insert_batch(
            &tables.congestion_control_randomness_object_debts,
            self.congestion_control_randomness_object_debts
                .into_iter()
                .map(|(object_id, debt)| {
                    (
                        object_id,
                        CongestionPerObjectDebt::new(self.consensus_round, debt),
                    )
                }),
        )?;

        batch.insert_batch(
            &tables.execution_time_observations,
            self.execution_time_observations
                .into_iter()
                .map(|(authority, generation, estimates)| ((generation, authority), estimates)),
        )?;

        Ok(())
    }
}
438
/// ConsensusOutputCache holds outputs of consensus processing that do not need to be committed to disk.
/// Data quarantining guarantees that all of this data will be used (e.g. for building checkpoints)
/// before the consensus commit from which it originated is marked as processed. Therefore we can rely
/// on replay of consensus commits to recover this data.
pub(crate) struct ConsensusOutputCache {
    // deferred transactions is only used by consensus handler so there should never be lock contention
    // - hence no need for a DashMap.
    pub(crate) deferred_transactions:
        Mutex<BTreeMap<DeferralKey, Vec<VerifiedExecutableTransactionWithAliases>>>,

    // user_signatures_for_checkpoints is written to by consensus handler and read from by checkpoint builder
    // The critical sections are small in both cases so a DashMap is probably not helpful.
    #[allow(clippy::type_complexity)]
    pub(crate) user_signatures_for_checkpoints:
        Mutex<HashMap<TransactionDigest, Vec<(GenericSignature, Option<SequenceNumber>)>>>,

    // Digests of transactions executed in the current epoch. Only `.read()` is
    // taken in this file (the DashMap provides interior mutability);
    // NOTE(review): the write half of the RwLock is presumably taken elsewhere
    // (e.g. at epoch boundaries) — confirm against the rest of the file.
    executed_in_epoch: RwLock<DashMap<TransactionDigest, ()>>,
    // Bounded LRU of recently executed digests. Entries are NOT removed by
    // remove_executed_in_epoch, so recent digests remain queryable after the
    // main map is pruned (see executed_in_current_epoch).
    executed_in_epoch_cache: MokaCache<TransactionDigest, ()>,
}
458
459impl ConsensusOutputCache {
460    pub(crate) fn new(tables: &AuthorityEpochTables) -> Self {
461        let deferred_transactions = tables
462            .get_all_deferred_transactions_v2()
463            .expect("load deferred transactions cannot fail");
464
465        let executed_in_epoch_cache_capacity = 50_000;
466
467        Self {
468            deferred_transactions: Mutex::new(deferred_transactions),
469            user_signatures_for_checkpoints: Default::default(),
470            executed_in_epoch: RwLock::new(DashMap::with_shard_amount(2048)),
471            executed_in_epoch_cache: MokaCache::builder(8)
472                // most queries should be for recent transactions
473                .max_capacity(randomize_cache_capacity_in_tests(
474                    executed_in_epoch_cache_capacity,
475                ))
476                .eviction_policy(EvictionPolicy::lru())
477                .build(),
478        }
479    }
480
481    pub fn executed_in_current_epoch(&self, digest: &TransactionDigest) -> bool {
482        self.executed_in_epoch
483            .read()
484            .contains_key(digest) ||
485            // we use get instead of contains key to mark the entry as read
486            self.executed_in_epoch_cache.get(digest).is_some()
487    }
488
489    // Called by execution
490    pub fn insert_executed_in_epoch(&self, tx_digest: TransactionDigest) {
491        assert!(
492            self.executed_in_epoch
493                .read()
494                .insert(tx_digest, ())
495                .is_none(),
496            "transaction already executed"
497        );
498        self.executed_in_epoch_cache.insert(tx_digest, ());
499    }
500
501    // CheckpointExecutor calls this (indirectly) in order to prune the in-memory cache of executed
502    // transactions. By the time this is called, the transaction digests will have been committed to
503    // the `executed_transactions_to_checkpoint` table.
504    pub fn remove_executed_in_epoch(&self, tx_digests: &[TransactionDigest]) {
505        let executed_in_epoch = self.executed_in_epoch.read();
506        for tx_digest in tx_digests {
507            executed_in_epoch.remove(tx_digest);
508        }
509    }
510}
511
/// ConsensusOutputQuarantine holds outputs of consensus processing in memory until the checkpoints
/// for the commit have been certified.
pub(crate) struct ConsensusOutputQuarantine {
    // Output from consensus handler
    // Oldest commit at the front; commits are popped from the front as they
    // fall below the certified-checkpoint watermark.
    output_queue: VecDeque<ConsensusCommitOutput>,

    // Highest known certified checkpoint sequence number
    highest_executed_checkpoint: CheckpointSequenceNumber,

    // Checkpoint Builder output
    // Built-but-not-yet-certified checkpoint summaries, keyed by sequence number.
    builder_checkpoint_summary:
        BTreeMap<CheckpointSequenceNumber, (BuilderCheckpointSummary, CheckpointContents)>,

    // Reverse index: transaction digest -> sequence number of the built
    // checkpoint that contains it. Kept in lockstep with the map above.
    builder_digest_to_checkpoint: HashMap<TransactionDigest, CheckpointSequenceNumber>,

    // Any un-committed next versions are stored here.
    shared_object_next_versions: RefCountedHashMap<ConsensusObjectSequenceKey, SequenceNumber>,

    // The most recent congestion control debts for objects. Uses a ref-count to track
    // which objects still exist in some element of output_queue.
    congestion_control_randomness_object_debts:
        RefCountedHashMap<ObjectID, CongestionPerObjectDebt>,
    congestion_control_object_debts: RefCountedHashMap<ObjectID, CongestionPerObjectDebt>,

    // Consensus transaction keys processed by any queued commit (ref-counted
    // across queue entries).
    processed_consensus_messages: RefCountedHashMap<SequencedConsensusTransactionKey, ()>,

    // Owned object locks acquired post-consensus.
    owned_object_locks: HashMap<ObjectRef, LockDetails>,

    metrics: Arc<EpochMetrics>,
}
543
544impl ConsensusOutputQuarantine {
545    pub(super) fn new(
546        highest_executed_checkpoint: CheckpointSequenceNumber,
547        authority_metrics: Arc<EpochMetrics>,
548    ) -> Self {
549        Self {
550            highest_executed_checkpoint,
551
552            output_queue: VecDeque::new(),
553            builder_checkpoint_summary: BTreeMap::new(),
554            builder_digest_to_checkpoint: HashMap::new(),
555            shared_object_next_versions: RefCountedHashMap::new(),
556            processed_consensus_messages: RefCountedHashMap::new(),
557            congestion_control_randomness_object_debts: RefCountedHashMap::new(),
558            congestion_control_object_debts: RefCountedHashMap::new(),
559            owned_object_locks: HashMap::new(),
560            metrics: authority_metrics,
561        }
562    }
563}
564
565// Write methods - all methods in this block insert new data into the quarantine.
566// There are only two sources! ConsensusHandler and CheckpointBuilder.
567impl ConsensusOutputQuarantine {
568    // Push all data gathered from a consensus commit into the quarantine.
569    pub(crate) fn push_consensus_output(
570        &mut self,
571        output: ConsensusCommitOutput,
572        epoch_store: &AuthorityPerEpochStore,
573    ) -> SuiResult {
574        self.insert_shared_object_next_versions(&output);
575        self.insert_congestion_control_debts(&output);
576        self.insert_processed_consensus_messages(&output);
577        self.insert_owned_object_locks(&output);
578        self.output_queue.push_back(output);
579
580        self.metrics
581            .consensus_quarantine_queue_size
582            .set(self.output_queue.len() as i64);
583
584        // we may already have observed the certified checkpoint for this round, if state sync is running
585        // ahead of consensus, so there may be data to commit right away.
586        self.commit(epoch_store)
587    }
588
589    // Record a newly built checkpoint.
590    pub(super) fn insert_builder_summary(
591        &mut self,
592        sequence_number: CheckpointSequenceNumber,
593        summary: BuilderCheckpointSummary,
594        contents: CheckpointContents,
595    ) {
596        debug!(?sequence_number, "inserting builder summary {:?}", summary);
597        for tx in contents.iter() {
598            self.builder_digest_to_checkpoint
599                .insert(tx.transaction, sequence_number);
600        }
601        self.builder_checkpoint_summary
602            .insert(sequence_number, (summary, contents));
603    }
604}
605
// Commit methods.
impl ConsensusOutputQuarantine {
    /// Update the highest executed checkpoint and commit any data which is now
    /// below the watermark.
    pub(super) fn update_highest_executed_checkpoint(
        &mut self,
        checkpoint: CheckpointSequenceNumber,
        epoch_store: &AuthorityPerEpochStore,
        batch: &mut DBBatch,
    ) -> SuiResult {
        self.highest_executed_checkpoint = checkpoint;
        self.commit_with_batch(epoch_store, batch)
    }

    /// Commit all data below the watermark using a fresh batch, writing it
    /// immediately.
    pub(super) fn commit(&mut self, epoch_store: &AuthorityPerEpochStore) -> SuiResult {
        let mut batch = epoch_store.db_batch()?;
        self.commit_with_batch(epoch_store, &mut batch)?;
        batch.write()?;
        Ok(())
    }

    /// Commit all data below the watermark.
    ///
    /// Appends the committable data to `batch`; the caller is responsible for
    /// writing the batch.
    fn commit_with_batch(
        &mut self,
        epoch_store: &AuthorityPerEpochStore,
        batch: &mut DBBatch,
    ) -> SuiResult {
        // The commit algorithm is simple:
        // 1. First commit all checkpoint builder state which is below the watermark.
        // 2. Determine the consensus commit height that corresponds to the highest committed
        //    checkpoint.
        // 3. Commit all consensus output at that height or below.

        let tables = epoch_store.tables()?;

        let mut highest_committed_height = None;

        // Drain builder summaries whose sequence number is at or below the
        // certified-checkpoint watermark, persisting each and tracking the
        // highest checkpoint height seen.
        while self
            .builder_checkpoint_summary
            .first_key_value()
            .map(|(seq, _)| *seq <= self.highest_executed_checkpoint)
            == Some(true)
        {
            let (seq, (builder_summary, contents)) =
                self.builder_checkpoint_summary.pop_first().unwrap();

            // Remove this checkpoint's transactions from the reverse index,
            // asserting the index was consistent with the summary map.
            for tx in contents.iter() {
                let digest = &tx.transaction;
                assert_eq!(
                    self.builder_digest_to_checkpoint
                        .remove(digest)
                        .unwrap_or_else(|| {
                            panic!(
                                "transaction {:?} not found in builder_digest_to_checkpoint",
                                digest
                            )
                        }),
                    seq
                );
            }

            batch.insert_batch(
                &tables.builder_digest_to_checkpoint,
                contents.iter().map(|tx| (tx.transaction, seq)),
            )?;

            batch.insert_batch(
                &tables.builder_checkpoint_summary_v2,
                [(seq, &builder_summary)],
            )?;

            let checkpoint_height = builder_summary
                .checkpoint_height
                .expect("non-genesis checkpoint must have height");
            // Heights must be non-decreasing as sequence numbers increase.
            if let Some(highest) = highest_committed_height {
                assert!(
                    checkpoint_height >= highest,
                    "current checkpoint height {} must be no less than highest committed height {}",
                    checkpoint_height,
                    highest
                );
            }

            highest_committed_height = Some(checkpoint_height);
        }

        // No builder state was committed, so no consensus output can be
        // released either.
        let Some(highest_committed_height) = highest_committed_height else {
            return Ok(());
        };

        let split_checkpoints_in_consensus_handler = epoch_store
            .protocol_config()
            .split_checkpoints_in_consensus_handler();

        if split_checkpoints_in_consensus_handler {
            // V2: only commit outputs up to the last one where the checkpoint queue
            // was fully drained (no pending roots). If the queue is empty after an
            // output, there are no roots that could be lost on restart. Any outputs
            // after the last drain point stay in the quarantine and get full-replayed
            // on restart with correct root reconstruction.
            let mut last_drain_idx = None;
            for (i, output) in self.output_queue.iter().enumerate() {
                let stats = output
                    .consensus_commit_stats
                    .as_ref()
                    .expect("consensus_commit_stats must be set");
                if stats.height > highest_committed_height {
                    break;
                }
                if output.checkpoint_queue_drained {
                    last_drain_idx = Some(i);
                }
            }
            if let Some(idx) = last_drain_idx {
                // Release everything up to and including the drain point:
                // un-index from the in-memory structures, then persist.
                for _ in 0..=idx {
                    let output = self.output_queue.pop_front().unwrap();
                    self.remove_shared_object_next_versions(&output);
                    self.remove_processed_consensus_messages(&output);
                    self.remove_congestion_control_debts(&output);
                    self.remove_owned_object_locks(&output);
                    output.write_to_batch(epoch_store, batch)?;
                }
            }
        } else {
            // V1: release queued outputs in order while their highest pending
            // checkpoint height is covered by the committed builder state.
            while !self.output_queue.is_empty() {
                let output = self.output_queue.front().unwrap();
                let Some(highest_in_commit) = output.get_highest_pending_checkpoint_height() else {
                    break;
                };

                if highest_in_commit <= highest_committed_height {
                    info!(
                        "committing output with highest pending checkpoint height {:?}",
                        highest_in_commit
                    );
                    let output = self.output_queue.pop_front().unwrap();
                    self.remove_shared_object_next_versions(&output);
                    self.remove_processed_consensus_messages(&output);
                    self.remove_congestion_control_debts(&output);
                    self.remove_owned_object_locks(&output);
                    output.write_to_batch(epoch_store, batch)?;
                } else {
                    break;
                }
            }
        }

        self.metrics
            .consensus_quarantine_queue_size
            .set(self.output_queue.len() as i64);

        Ok(())
    }
}
760
761impl ConsensusOutputQuarantine {
762    fn insert_shared_object_next_versions(&mut self, output: &ConsensusCommitOutput) {
763        if let Some(next_versions) = output.next_shared_object_versions.as_ref() {
764            for (object_id, next_version) in next_versions {
765                self.shared_object_next_versions
766                    .insert(*object_id, *next_version);
767            }
768        }
769    }
770
771    fn insert_congestion_control_debts(&mut self, output: &ConsensusCommitOutput) {
772        let current_round = output.consensus_round;
773
774        for (object_id, debt) in output.congestion_control_object_debts.iter() {
775            self.congestion_control_object_debts.insert(
776                *object_id,
777                CongestionPerObjectDebt::new(current_round, *debt),
778            );
779        }
780
781        for (object_id, debt) in output.congestion_control_randomness_object_debts.iter() {
782            self.congestion_control_randomness_object_debts.insert(
783                *object_id,
784                CongestionPerObjectDebt::new(current_round, *debt),
785            );
786        }
787    }
788
789    fn remove_congestion_control_debts(&mut self, output: &ConsensusCommitOutput) {
790        for (object_id, _) in output.congestion_control_object_debts.iter() {
791            self.congestion_control_object_debts.remove(object_id);
792        }
793        for (object_id, _) in output.congestion_control_randomness_object_debts.iter() {
794            self.congestion_control_randomness_object_debts
795                .remove(object_id);
796        }
797    }
798
799    fn insert_processed_consensus_messages(&mut self, output: &ConsensusCommitOutput) {
800        for tx_key in output.consensus_messages_processed.iter() {
801            self.processed_consensus_messages.insert(tx_key.clone(), ());
802        }
803    }
804
805    fn remove_processed_consensus_messages(&mut self, output: &ConsensusCommitOutput) {
806        for tx_key in output.consensus_messages_processed.iter() {
807            self.processed_consensus_messages.remove(tx_key);
808        }
809    }
810
811    fn remove_shared_object_next_versions(&mut self, output: &ConsensusCommitOutput) {
812        if let Some(next_versions) = output.next_shared_object_versions.as_ref() {
813            for object_id in next_versions.keys() {
814                if !self.shared_object_next_versions.remove(object_id) {
815                    fatal!(
816                        "Shared object next version not found in quarantine: {:?}",
817                        object_id
818                    );
819                }
820            }
821        }
822    }
823
824    fn insert_owned_object_locks(&mut self, output: &ConsensusCommitOutput) {
825        for (obj_ref, lock) in &output.owned_object_locks {
826            self.owned_object_locks.insert(*obj_ref, *lock);
827        }
828    }
829
830    fn remove_owned_object_locks(&mut self, output: &ConsensusCommitOutput) {
831        for obj_ref in output.owned_object_locks.keys() {
832            self.owned_object_locks.remove(obj_ref);
833        }
834    }
835}
836
// Read methods - all methods in this block return data from the quarantine which would otherwise
// be found in the database.
impl ConsensusOutputQuarantine {
    /// Returns the last builder summary currently held in quarantine.
    /// NOTE: relies on `builder_checkpoint_summary` iterating its values in
    /// ascending sequence-number order (i.e. an ordered map).
    pub(super) fn last_built_summary(&self) -> Option<&BuilderCheckpointSummary> {
        self.builder_checkpoint_summary
            .values()
            .last()
            .map(|(summary, _)| summary)
    }

    /// Returns the quarantined builder summary for checkpoint `sequence`, if any.
    pub(super) fn get_built_summary(
        &self,
        sequence: CheckpointSequenceNumber,
    ) -> Option<&BuilderCheckpointSummary> {
        self.builder_checkpoint_summary
            .get(&sequence)
            .map(|(summary, _)| summary)
    }

    /// Returns true if `digest` has already been included in a quarantined
    /// checkpoint built by this node.
    pub(super) fn included_transaction_in_checkpoint(&self, digest: &TransactionDigest) -> bool {
        self.builder_digest_to_checkpoint.contains_key(digest)
    }

    /// Returns true if the consensus message identified by `key` has been
    /// processed by an output still in quarantine.
    pub(super) fn is_consensus_message_processed(
        &self,
        key: &SequencedConsensusTransactionKey,
    ) -> bool {
        self.processed_consensus_messages.contains_key(key)
    }

    /// Returns true if no consensus outputs are currently quarantined.
    pub(super) fn is_empty(&self) -> bool {
        self.output_queue.is_empty()
    }

    /// Looks up the next shared object version for each key, consulting the
    /// quarantine first and falling back to the
    /// `next_shared_object_versions_v2` table for misses. Results are
    /// positionally aligned with `objects_to_init`.
    pub(super) fn get_next_shared_object_versions(
        &self,
        tables: &AuthorityEpochTables,
        objects_to_init: &[ConsensusObjectSequenceKey],
    ) -> SuiResult<Vec<Option<SequenceNumber>>> {
        Ok(do_fallback_lookup(
            objects_to_init,
            |object_key| {
                if let Some(next_version) = self.shared_object_next_versions.get(object_key) {
                    CacheResult::Hit(Some(*next_version))
                } else {
                    CacheResult::Miss
                }
            },
            |object_keys| {
                tables
                    .next_shared_object_versions_v2
                    .multi_get(object_keys)
                    .expect("db error")
            },
        ))
    }

    /// Gets owned object locks, checking quarantine first then falling back to DB.
    /// After crash recovery, quarantine is empty so we naturally fall back to DB.
    /// Results are positionally aligned with `obj_refs`.
    pub(super) fn get_owned_object_locks(
        &self,
        tables: &AuthorityEpochTables,
        obj_refs: &[ObjectRef],
    ) -> SuiResult<Vec<Option<LockDetails>>> {
        Ok(do_fallback_lookup(
            obj_refs,
            |obj_ref| {
                if let Some(lock) = self.owned_object_locks.get(obj_ref) {
                    CacheResult::Hit(Some(*lock))
                } else {
                    CacheResult::Miss
                }
            },
            |obj_refs| {
                tables
                    .multi_get_locked_transactions(obj_refs)
                    .expect("db error")
            },
        ))
    }

    /// Returns the highest pending checkpoint height in quarantine.
    /// Only the most recently pushed output (the back of the queue) is
    /// consulted.
    pub(super) fn get_highest_pending_checkpoint_height(&self) -> Option<CheckpointHeight> {
        self.output_queue
            .back()
            .and_then(|output| output.get_highest_pending_checkpoint_height())
    }

    /// Collects all quarantined pending checkpoints after height `last`
    /// (all of them if `last` is None), in queue order.
    pub(super) fn get_pending_checkpoints(
        &self,
        last: Option<CheckpointHeight>,
    ) -> Vec<(CheckpointHeight, PendingCheckpoint)> {
        let mut checkpoints = Vec::new();
        for output in &self.output_queue {
            checkpoints.extend(
                output
                    .get_pending_checkpoints(last)
                    .map(|cp| (cp.height(), cp.clone())),
            );
        }
        // Debug-build sanity check: heights must be strictly increasing
        // across the queued outputs.
        if cfg!(debug_assertions) {
            let mut prev = None;
            for (height, _) in &checkpoints {
                if let Some(prev) = prev {
                    assert!(prev < *height);
                }
                prev = Some(*height);
            }
        }
        checkpoints
    }

    /// Returns true if any quarantined output contains a pending checkpoint at
    /// height `index`.
    pub(super) fn pending_checkpoint_exists(&self, index: &CheckpointHeight) -> bool {
        self.output_queue
            .iter()
            .any(|output| output.pending_checkpoint_exists(index))
    }

    /// V2 variant of `get_pending_checkpoints`: collects all quarantined
    /// pending checkpoints after height `last`, in queue order.
    pub(super) fn get_pending_checkpoints_v2(
        &self,
        last: Option<CheckpointHeight>,
    ) -> Vec<(CheckpointHeight, PendingCheckpointV2)> {
        let mut checkpoints = Vec::new();
        for output in &self.output_queue {
            checkpoints.extend(
                output
                    .get_pending_checkpoints_v2(last)
                    .map(|cp| (cp.height(), cp.clone())),
            );
        }
        // Debug-build sanity check: heights must be strictly increasing
        // across the queued outputs.
        if cfg!(debug_assertions) {
            let mut prev = None;
            for (height, _) in &checkpoints {
                if let Some(prev) = prev {
                    assert!(prev < *height);
                }
                prev = Some(*height);
            }
        }
        checkpoints
    }

    /// V2 variant of `pending_checkpoint_exists`.
    pub(super) fn pending_checkpoint_exists_v2(&self, index: &CheckpointHeight) -> bool {
        self.output_queue
            .iter()
            .any(|output| output.pending_checkpoint_exists_v2(index))
    }

    /// Returns the active JWKs recorded at consensus round `round`.
    ///
    /// First scans the quarantined outputs (newest first) for an exact round
    /// match; otherwise falls back to a range scan of the `active_jwks` table
    /// covering exactly that round.
    pub(super) fn get_new_jwks(
        &self,
        epoch_store: &AuthorityPerEpochStore,
        round: u64,
    ) -> SuiResult<Vec<ActiveJwk>> {
        let epoch = epoch_store.epoch();

        // Check if the requested round is in memory
        for output in self.output_queue.iter().rev() {
            // unwrap safe because output will always have last consensus stats set before being added
            // to the quarantine
            let output_round = output.get_round().unwrap();
            if round == output_round {
                return Ok(output
                    .active_jwks
                    .iter()
                    .map(|(_, (jwk_id, jwk))| ActiveJwk {
                        jwk_id: jwk_id.clone(),
                        jwk: jwk.clone(),
                        epoch,
                    })
                    .collect());
            }
        }

        // Fall back to reading from database
        // Empty id/jwk values serve as minimal placeholder bounds so the scan
        // of [(round, min), (round + 1, min)) covers all entries for `round`.
        let empty_jwk_id = JwkId::new(String::new(), String::new());
        let empty_jwk = JWK {
            kty: String::new(),
            e: String::new(),
            n: String::new(),
            alg: String::new(),
        };

        let start = (round, (empty_jwk_id.clone(), empty_jwk.clone()));
        let end = (round + 1, (empty_jwk_id, empty_jwk));

        Ok(epoch_store
            .tables()?
            .active_jwks
            .safe_iter_with_bounds(Some(start), Some(end))
            .map_ok(|((r, (jwk_id, jwk)), _)| {
                debug_assert!(round == r);
                ActiveJwk { jwk_id, jwk, epoch }
            })
            .collect::<Result<Vec<_>, _>>()?)
    }

    /// Returns the most recent randomness last-round timestamp recorded by any
    /// quarantined output (searching newest to oldest), if any.
    pub(super) fn get_randomness_last_round_timestamp(&self) -> Option<TimestampMs> {
        self.output_queue
            .iter()
            .rev()
            .filter_map(|output| output.get_randomness_last_round_timestamp())
            .next()
    }

    /// Loads the starting congestion debts for the shared input objects of
    /// `transactions`, as of `current_round`.
    ///
    /// Selects the randomness or regular debt table (and per-commit budget)
    /// based on `for_randomness`, reads debts from the quarantine with a DB
    /// fallback, then decays each stored debt by the budget of the rounds that
    /// have elapsed since it was recorded.
    pub(crate) fn load_initial_object_debts(
        &self,
        epoch_store: &AuthorityPerEpochStore,
        current_round: Round,
        for_randomness: bool,
        transactions: &[VerifiedExecutableTransactionWithAliases],
    ) -> SuiResult<impl IntoIterator<Item = (ObjectID, u64)>> {
        let protocol_config = epoch_store.protocol_config();
        let tables = epoch_store.tables()?;
        let default_per_commit_budget = protocol_config
            .max_accumulated_txn_cost_per_object_in_mysticeti_commit_as_option()
            .unwrap_or(0);
        // The randomness path uses its own debt tables and budget (falling
        // back to the regular budget when the randomness one is unset).
        let (hash_table, db_table, per_commit_budget) = if for_randomness {
            (
                &self.congestion_control_randomness_object_debts,
                &tables.congestion_control_randomness_object_debts,
                protocol_config
                    .max_accumulated_randomness_txn_cost_per_object_in_mysticeti_commit_as_option()
                    .unwrap_or(default_per_commit_budget),
            )
        } else {
            (
                &self.congestion_control_object_debts,
                &tables.congestion_control_object_debts,
                default_per_commit_budget,
            )
        };
        // Collect the distinct shared input object ids across all transactions.
        let mut shared_input_object_ids: Vec<_> = transactions
            .iter()
            .flat_map(|tx| tx.tx().shared_input_objects().map(|obj| obj.id))
            .collect();
        shared_input_object_ids.sort();
        shared_input_object_ids.dedup();

        // Quarantine first, DB fallback for misses; each hit yields the
        // (round, debt) pair the debt was recorded with.
        let results = do_fallback_lookup(
            &shared_input_object_ids,
            |object_id| {
                if let Some(debt) = hash_table.get(object_id) {
                    CacheResult::Hit(Some(debt.into_v1()))
                } else {
                    CacheResult::Miss
                }
            },
            |object_ids| {
                db_table
                    .multi_get(object_ids)
                    .expect("db error")
                    .into_iter()
                    .map(|debt| debt.map(|debt| debt.into_v1()))
                    .collect()
            },
        );

        Ok(results
            .into_iter()
            .zip_debug_eq(shared_input_object_ids)
            .filter_map(|(debt, object_id)| debt.map(|debt| (debt, object_id)))
            .map(move |((round, debt), object_id)| {
                // Stored debts already account for the budget of the round in which
                // they were accumulated. Application of budget from future rounds to
                // the debt is handled here.
                assert!(current_round > round);
                let num_rounds = current_round - round - 1;
                let debt = debt.saturating_sub(per_commit_budget * num_rounds);
                (object_id, debt)
            }))
    }
}
1108
// A wrapper around HashMap that uses refcounts to keep entries alive until
// they are no longer needed.
//
// If there are N inserts for the same key, the key will not be removed until
// there are N removes.
//
// It is intended to track the *latest* value for a given key, so duplicate
// inserts are intended to overwrite any prior value.
#[derive(Debug, Default)]
struct RefCountedHashMap<K, V> {
    // Each value is stored alongside its reference count.
    map: HashMap<K, (usize, V)>,
}

impl<K, V> RefCountedHashMap<K, V>
where
    K: Clone + Eq + std::hash::Hash,
{
    /// Creates an empty map.
    pub fn new() -> Self {
        Self {
            map: HashMap::new(),
        }
    }

    /// Inserts `value` under `key`, overwriting any previous value.
    /// A fresh key starts at refcount 1; an existing key's refcount is
    /// incremented.
    pub fn insert(&mut self, key: K, value: V) {
        let entry = self.map.entry(key);
        match entry {
            hash_map::Entry::Occupied(mut entry) => {
                let (ref_count, v) = entry.get_mut();
                *ref_count += 1;
                *v = value;
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((1, value));
            }
        }
    }

    // Returns true if the key was present, false otherwise.
    // Note that the key may not be removed if present, as it may have a refcount > 1.
    pub fn remove(&mut self, key: &K) -> bool {
        // Use get_mut rather than entry(key.clone()): the entry API would
        // clone the key unconditionally, even when the key is absent.
        match self.map.get_mut(key) {
            Some((ref_count, _)) => {
                *ref_count -= 1;
                if *ref_count == 0 {
                    self.map.remove(key);
                }
                true
            }
            None => false,
        }
    }

    /// Returns the current (latest) value for `key`, if present.
    pub fn get(&self, key: &K) -> Option<&V> {
        self.map.get(key).map(|(_, v)| v)
    }

    /// Returns true if `key` is present (i.e. its refcount is > 0).
    pub fn contains_key(&self, key: &K) -> bool {
        self.map.contains_key(key)
    }
}
1171
#[cfg(test)]
impl ConsensusOutputQuarantine {
    /// Test-only accessor: number of consensus outputs still held in the
    /// quarantine queue.
    fn output_queue_len_for_testing(&self) -> usize {
        self.output_queue.len()
    }
}
1178
#[cfg(test)]
mod tests {
    use super::*;
    use crate::authority::test_authority_builder::TestAuthorityBuilder;
    use sui_types::base_types::ExecutionDigests;
    use sui_types::gas::GasCostSummary;

    /// Builds a minimal consensus output with the given commit `height` and
    /// consensus `round`, and the checkpoint-queue-drained flag set to
    /// `drained`.
    fn make_output(height: u64, round: u64, drained: bool) -> ConsensusCommitOutput {
        let mut output = ConsensusCommitOutput::new(round);
        output.record_consensus_commit_stats(ExecutionIndicesWithStatsV2 {
            height,
            ..Default::default()
        });
        output.set_checkpoint_queue_drained(drained);
        output
    }

    /// Builds a builder summary (plus its contents) for checkpoint `seq` at
    /// checkpoint height `height`, containing one random transaction digest.
    fn make_builder_summary(
        seq: CheckpointSequenceNumber,
        height: CheckpointHeight,
        protocol_config: &ProtocolConfig,
    ) -> (BuilderCheckpointSummary, CheckpointContents) {
        let contents =
            CheckpointContents::new_with_digests_only_for_tests([ExecutionDigests::random()]);
        let summary = CheckpointSummary::new(
            protocol_config,
            0,
            seq,
            0,
            &contents,
            None,
            GasCostSummary::default(),
            None,
            0,
            vec![],
            vec![],
        );
        let builder_summary = BuilderCheckpointSummary {
            summary,
            checkpoint_height: Some(height),
            position_in_commit: 0,
        };
        (builder_summary, contents)
    }

    /// Verifies that outputs are NOT drained when no output at or below the
    /// certified height has its checkpoint-queue-drained flag set.
    #[tokio::test]
    async fn test_drain_boundary_prevents_premature_commit() {
        let mut protocol_config =
            ProtocolConfig::get_for_version(ProtocolVersion::max(), Chain::Unknown);
        protocol_config.set_split_checkpoints_in_consensus_handler_for_testing(true);
        let state = TestAuthorityBuilder::new()
            .with_protocol_config(protocol_config)
            .build()
            .await;
        let epoch_store = state.epoch_store_for_testing();

        let metrics = epoch_store.metrics.clone();
        let mut quarantine = ConsensusOutputQuarantine::new(0, metrics);

        // Output C: height=4, not drained
        let c = make_output(4, 1, false);
        quarantine.push_consensus_output(c, &epoch_store).unwrap();

        // Output C2: height=5, drained
        let c2 = make_output(5, 2, true);
        quarantine.push_consensus_output(c2, &epoch_store).unwrap();

        assert_eq!(quarantine.output_queue_len_for_testing(), 2);

        // Insert builder summaries for checkpoints 1-4 with checkpoint_height = seq
        let pc = epoch_store.protocol_config();
        for seq in 1..=4 {
            let (summary, contents) = make_builder_summary(seq, seq, pc);
            quarantine.insert_builder_summary(seq, summary, contents);
        }

        // Certify up to checkpoint 4
        let mut batch = epoch_store.db_batch_for_test();
        quarantine
            .update_highest_executed_checkpoint(4, &epoch_store, &mut batch)
            .unwrap();
        batch.write().unwrap();

        // C has height=4 which is <= 4 but checkpoint_queue_drained=false.
        // C2 has height=5 which is > 4, so it's skipped.
        // No drain boundary found => nothing drained.
        assert_eq!(quarantine.output_queue_len_for_testing(), 2);
    }

    /// Verifies that certifying up to a drained output's height drains that
    /// output and everything before it.
    #[tokio::test]
    async fn test_drain_boundary_commits_at_safe_point() {
        let mut protocol_config =
            ProtocolConfig::get_for_version(ProtocolVersion::max(), Chain::Unknown);
        protocol_config.set_split_checkpoints_in_consensus_handler_for_testing(true);
        let state = TestAuthorityBuilder::new()
            .with_protocol_config(protocol_config)
            .build()
            .await;
        let epoch_store = state.epoch_store_for_testing();

        let metrics = epoch_store.metrics.clone();
        let mut quarantine = ConsensusOutputQuarantine::new(0, metrics);

        let c = make_output(4, 1, false);
        quarantine.push_consensus_output(c, &epoch_store).unwrap();

        let c2 = make_output(5, 2, true);
        quarantine.push_consensus_output(c2, &epoch_store).unwrap();

        assert_eq!(quarantine.output_queue_len_for_testing(), 2);

        // Insert builder summaries for checkpoints 1-5 with checkpoint_height = seq
        let pc = epoch_store.protocol_config();
        for seq in 1..=5 {
            let (summary, contents) = make_builder_summary(seq, seq, pc);
            quarantine.insert_builder_summary(seq, summary, contents);
        }

        // Certify up to checkpoint 5
        let mut batch = epoch_store.db_batch_for_test();
        quarantine
            .update_highest_executed_checkpoint(5, &epoch_store, &mut batch)
            .unwrap();
        batch.write().unwrap();

        // C has height=4 <= 5, drained=false.
        // C2 has height=5 <= 5, drained=true => drain boundary at index 1.
        // Both outputs drained.
        assert_eq!(quarantine.output_queue_len_for_testing(), 0);
    }
}