1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
use crate::{Node, NodeStorage};
use arc_swap::ArcSwap;
use config::{Committee, Parameters, SharedWorkerCache, WorkerCache, WorkerId};
use crypto::{KeyPair, NetworkKeyPair};
use executor::ExecutionState;
use fastcrypto::traits::KeyPair as _;
use futures::future::join_all;
use network::{P2pNetwork, ReliableNetwork};
use prometheus::Registry;
use std::{path::PathBuf, sync::Arc};
use tokio::sync::mpsc::Receiver;
use types::{PrimaryWorkerMessage, ReconfigureNotification, WorkerPrimaryMessage};
/// Stateless namespace type for the node restart logic.
///
/// It has no fields; its only functionality is the associated async function
/// [`NodeRestarter::watch`], which spawns a primary and its workers and re-spawns them
/// whenever new epoch settings arrive on the reconfiguration channel.
pub struct NodeRestarter;
impl NodeRestarter {
pub async fn watch<State>(
primary_keypair: KeyPair,
primary_network_keypair: NetworkKeyPair,
worker_ids_and_keypairs: Vec<(WorkerId, NetworkKeyPair)>,
committee: &Committee,
worker_cache: SharedWorkerCache,
storage_base_path: PathBuf,
execution_state: Arc<State>,
parameters: Parameters,
mut rx_reconfigure: Receiver<(
KeyPair,
NetworkKeyPair,
Committee,
Vec<(WorkerId, NetworkKeyPair)>,
WorkerCache,
)>,
registry: &Registry,
) where
State: ExecutionState + Send + Sync + 'static,
{
let mut primary_keypair = primary_keypair;
let mut primary_network_keypair = primary_network_keypair;
let mut name = primary_keypair.public().clone();
let mut worker_ids_and_keypairs = worker_ids_and_keypairs;
let mut committee = committee.clone();
let mut handles = Vec::new();
let network = anemo::Network::bind("127.0.0.1:0")
.server_name("narwhal")
.private_key(
NetworkKeyPair::generate(&mut rand::rngs::OsRng)
.private()
.0
.to_bytes(),
)
.start(anemo::Router::new())
.unwrap();
loop {
tracing::info!("Starting epoch E{}", committee.epoch());
let mut store_path = storage_base_path.clone();
store_path.push(format!("epoch{}", committee.epoch()));
let store = NodeStorage::reopen(store_path);
let primary_handles = Node::spawn_primary(
primary_keypair,
primary_network_keypair,
Arc::new(ArcSwap::new(Arc::new(committee.clone()))),
worker_cache.clone(),
&store,
parameters.clone(),
true,
execution_state.clone(),
registry,
)
.await
.unwrap();
let worker_handles = Node::spawn_workers(
name.clone(),
worker_ids_and_keypairs,
Arc::new(ArcSwap::new(Arc::new(committee.clone()))),
worker_cache.clone(),
&store,
parameters.clone(),
registry,
);
handles.extend(primary_handles);
handles.extend(worker_handles);
let (
new_keypair,
new_network_keypair,
new_committee,
new_worker_ids_and_keypairs,
new_worker_cache,
) = match rx_reconfigure.recv().await {
Some(x) => x,
None => break,
};
tracing::info!("Starting reconfiguration with committee {committee}");
let address = network::multiaddr_to_address(
&committee
.primary(&name)
.expect("Our key is not in the committee"),
)
.unwrap();
let network_key = committee
.network_key(&name)
.expect("Our key is not in the committee");
let mut primary_network =
P2pNetwork::new_for_single_address(network_key.to_owned(), address).await;
let message = WorkerPrimaryMessage::Reconfigure(ReconfigureNotification::Shutdown);
let primary_cancel_handle =
primary_network.send(network_key.to_owned(), &message).await;
let message = PrimaryWorkerMessage::Reconfigure(ReconfigureNotification::Shutdown);
let mut worker_names = Vec::new();
for worker in worker_cache
.load()
.our_workers(&name)
.expect("Our key is not in the worker cache")
{
let address = network::multiaddr_to_address(&worker.worker_address).unwrap();
let peer_id = anemo::PeerId(worker.name.0.to_bytes());
network
.connect_with_peer_id(address, peer_id)
.await
.unwrap();
worker_names.push(worker.name);
}
let worker_cancel_handles = P2pNetwork::new(network.clone())
.broadcast(worker_names, &message)
.await;
primary_cancel_handle
.await
.expect("Failed to notify primary");
join_all(worker_cancel_handles).await;
tracing::debug!("Committee reconfiguration message successfully sent");
join_all(handles.drain(..)).await;
tracing::debug!("All tasks successfully exited");
tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
tracing::debug!("Epoch E{} terminated", committee.epoch());
primary_keypair = new_keypair;
primary_network_keypair = new_network_keypair;
name = primary_keypair.public().clone();
worker_ids_and_keypairs = new_worker_ids_and_keypairs;
committee = new_committee;
worker_cache.swap(Arc::new(new_worker_cache));
}
}
}