sui_aws_orchestrator/
monitor.rs

1// Copyright (c) Mysten Labs, Inc.
2// SPDX-License-Identifier: Apache-2.0
3
4use std::{fs, net::SocketAddr, path::PathBuf};
5
6use crate::{
7    client::Instance,
8    error::{MonitorError, MonitorResult},
9    protocol::ProtocolMetrics,
10    ssh::{CommandContext, SshConnectionManager},
11};
12
13pub struct Monitor {
14    instance: Instance,
15    clients: Vec<Instance>,
16    nodes: Vec<Instance>,
17    ssh_manager: SshConnectionManager,
18}
19
20impl Monitor {
21    /// Create a new monitor.
22    pub fn new(
23        instance: Instance,
24        clients: Vec<Instance>,
25        nodes: Vec<Instance>,
26        ssh_manager: SshConnectionManager,
27    ) -> Self {
28        Self {
29            instance,
30            clients,
31            nodes,
32            ssh_manager,
33        }
34    }
35
36    /// Dependencies to install.
37    pub fn dependencies() -> Vec<&'static str> {
38        let mut commands = Vec::new();
39        commands.extend(Prometheus::install_commands());
40        commands.extend(Grafana::install_commands());
41        commands
42    }
43
44    /// Start a prometheus instance on each remote machine.
45    pub async fn start_prometheus<P: ProtocolMetrics>(
46        &self,
47        protocol_commands: &P,
48    ) -> MonitorResult<()> {
49        let instance = std::iter::once(self.instance.clone());
50        let commands =
51            Prometheus::setup_commands(self.clients.clone(), self.nodes.clone(), protocol_commands);
52        self.ssh_manager
53            .execute(instance, commands, CommandContext::default())
54            .await?;
55        Ok(())
56    }
57
58    /// Start grafana on the local host.
59    pub async fn start_grafana(&self) -> MonitorResult<()> {
60        // Configure and reload grafana.
61        let instance = std::iter::once(self.instance.clone());
62        let commands = Grafana::setup_commands();
63        self.ssh_manager
64            .execute(instance, commands, CommandContext::default())
65            .await?;
66
67        Ok(())
68    }
69
70    /// The public address of the grafana instance.
71    pub fn grafana_address(&self) -> String {
72        format!("http://{}:{}", self.instance.main_ip, Grafana::DEFAULT_PORT)
73    }
74}
75
76/// Generate the commands to setup prometheus on the given instances.
77/// TODO: Modify the configuration to also get client metrics.
78pub struct Prometheus;
79
80impl Prometheus {
81    /// The default prometheus configuration path.
82    const DEFAULT_PROMETHEUS_CONFIG_PATH: &'static str = "/etc/prometheus/prometheus.yml";
83    /// The default prometheus port.
84    pub const DEFAULT_PORT: u16 = 9090;
85
86    /// The commands to install prometheus.
87    pub fn install_commands() -> Vec<&'static str> {
88        vec![
89            "sudo apt-get -y install prometheus",
90            "sudo chmod 777 -R /var/lib/prometheus/ /etc/prometheus/",
91        ]
92    }
93
94    /// Generate the commands to update the prometheus configuration and restart prometheus.
95    pub fn setup_commands<I, P>(clients: I, nodes: I, protocol: &P) -> String
96    where
97        I: IntoIterator<Item = Instance>,
98        P: ProtocolMetrics,
99    {
100        // Generate the prometheus' global configuration.
101        let mut config = vec![Self::global_configuration()];
102
103        // Add configurations to scrape the clients.
104        let clients_metrics_path = protocol.clients_metrics_path(clients);
105        for (i, (_, clients_metrics_path)) in clients_metrics_path.into_iter().enumerate() {
106            let id = format!("client-{i}");
107            let scrape_config = Self::scrape_configuration(&id, &clients_metrics_path);
108            config.push(scrape_config);
109        }
110
111        // Add configurations to scrape the nodes.
112        let nodes_metrics_path = protocol.nodes_metrics_path(nodes);
113        for (i, (_, nodes_metrics_path)) in nodes_metrics_path.into_iter().enumerate() {
114            let id = format!("node-{i}");
115            let scrape_config = Self::scrape_configuration(&id, &nodes_metrics_path);
116            config.push(scrape_config);
117        }
118
119        // Make the command to configure and restart prometheus.
120        [
121            &format!(
122                "sudo echo \"{}\" > {}",
123                config.join("\n"),
124                Self::DEFAULT_PROMETHEUS_CONFIG_PATH
125            ),
126            "sudo service prometheus restart",
127        ]
128        .join(" && ")
129    }
130
131    /// Generate the global prometheus configuration.
132    /// NOTE: The configuration file is a yaml file so spaces are important.
133    fn global_configuration() -> String {
134        [
135            "global:",
136            "  scrape_interval: 5s",
137            "  evaluation_interval: 5s",
138            "scrape_configs:",
139        ]
140        .join("\n")
141    }
142
143    /// Generate the prometheus configuration from the given metrics path.
144    /// NOTE: The configuration file is a yaml file so spaces are important.
145    fn scrape_configuration(id: &str, nodes_metrics_path: &str) -> String {
146        let parts: Vec<_> = nodes_metrics_path.split('/').collect();
147        let address = parts[0].parse::<SocketAddr>().unwrap();
148        let ip = address.ip();
149        let port = address.port();
150        let path = parts[1];
151
152        [
153            &format!("  - job_name: {id}"),
154            &format!("    metrics_path: /{path}"),
155            "    static_configs:",
156            "      - targets:",
157            &format!("        - {ip}:{port}"),
158        ]
159        .join("\n")
160    }
161}
162
163pub struct Grafana;
164
165impl Grafana {
166    /// The path to the datasources directory.
167    const DATASOURCES_PATH: &'static str = "/etc/grafana/provisioning/datasources";
168    /// The default grafana port.
169    pub const DEFAULT_PORT: u16 = 3000;
170
171    /// The commands to install prometheus.
172    pub fn install_commands() -> Vec<&'static str> {
173        vec![
174            "sudo apt-get install -y apt-transport-https software-properties-common wget",
175            "sudo wget -q -O /usr/share/keyrings/grafana.key https://apt.grafana.com/gpg.key",
176            "(sudo rm /etc/apt/sources.list.d/grafana.list || true)",
177            "echo \"deb [signed-by=/usr/share/keyrings/grafana.key] https://apt.grafana.com stable main\" | sudo tee -a /etc/apt/sources.list.d/grafana.list",
178            "sudo apt-get update",
179            "sudo apt-get install -y grafana",
180            "sudo chmod 777 -R /etc/grafana/",
181        ]
182    }
183
184    /// Generate the commands to update the grafana datasource and restart grafana.
185    pub fn setup_commands() -> String {
186        [
187            &format!("(rm -r {} || true)", Self::DATASOURCES_PATH),
188            &format!("mkdir -p {}", Self::DATASOURCES_PATH),
189            &format!(
190                "sudo echo \"{}\" > {}/testbed.yml",
191                Self::datasource(),
192                Self::DATASOURCES_PATH
193            ),
194            "sudo service grafana-server restart",
195        ]
196        .join(" && ")
197    }
198
199    /// Generate the content of the datasource file for the given instance.
200    /// NOTE: The datasource file is a yaml file so spaces are important.
201    fn datasource() -> String {
202        [
203            "apiVersion: 1",
204            "deleteDatasources:",
205            "  - name: testbed",
206            "    orgId: 1",
207            "datasources:",
208            "  - name: testbed",
209            "    type: prometheus",
210            "    access: proxy",
211            "    orgId: 1",
212            &format!("    url: http://localhost:{}", Prometheus::DEFAULT_PORT),
213            "    editable: true",
214            "    uid: Fixed-UID-testbed",
215        ]
216        .join("\n")
217    }
218}
219
220#[allow(dead_code)]
221/// Bootstrap the grafana with datasource to connect to the given instances.
222/// NOTE: Only for macOS. Grafana must be installed through homebrew (and not from source). Deeper grafana
223/// configuration can be done through the grafana.ini file (/opt/homebrew/etc/grafana/grafana.ini) or the
224/// plist file (~/Library/LaunchAgents/homebrew.mxcl.grafana.plist).
225pub struct LocalGrafana;
226
227#[allow(dead_code)]
228impl LocalGrafana {
229    /// The default grafana home directory (macOS, homebrew install).
230    const DEFAULT_GRAFANA_HOME: &'static str = "/opt/homebrew/opt/grafana/share/grafana/";
231    /// The path to the datasources directory.
232    const DATASOURCES_PATH: &'static str = "conf/provisioning/datasources/";
233    /// The default grafana port.
234    pub const DEFAULT_PORT: u16 = 3000;
235
236    /// Configure grafana to connect to the given instances. Only for macOS.
237    pub fn run<I>(instances: I) -> MonitorResult<()>
238    where
239        I: IntoIterator<Item = Instance>,
240    {
241        let path: PathBuf = [Self::DEFAULT_GRAFANA_HOME, Self::DATASOURCES_PATH]
242            .iter()
243            .collect();
244
245        // Remove the old datasources.
246        fs::remove_dir_all(&path).unwrap();
247        fs::create_dir(&path).unwrap();
248
249        // Create the new datasources.
250        for (i, instance) in instances.into_iter().enumerate() {
251            let mut file = path.clone();
252            file.push(format!("instance-{}.yml", i));
253            fs::write(&file, Self::datasource(&instance, i)).map_err(|e| {
254                MonitorError::GrafanaError(format!("Failed to write grafana datasource ({e})"))
255            })?;
256        }
257
258        // Restart grafana.
259        std::process::Command::new("brew")
260            .arg("services")
261            .arg("restart")
262            .arg("grafana")
263            .arg("-q")
264            .spawn()
265            .map_err(|e| MonitorError::GrafanaError(e.to_string()))?;
266
267        Ok(())
268    }
269
270    /// Generate the content of the datasource file for the given instance. This grafana instance takes
271    /// one datasource per instance and assumes one prometheus server runs per instance.
272    /// NOTE: The datasource file is a yaml file so spaces are important.
273    fn datasource(instance: &Instance, index: usize) -> String {
274        [
275            "apiVersion: 1",
276            "deleteDatasources:",
277            &format!("  - name: instance-{index}"),
278            "    orgId: 1",
279            "datasources:",
280            &format!("  - name: instance-{index}"),
281            "    type: prometheus",
282            "    access: proxy",
283            "    orgId: 1",
284            &format!(
285                "    url: http://{}:{}",
286                instance.main_ip,
287                Prometheus::DEFAULT_PORT
288            ),
289            "    editable: true",
290            &format!("    uid: UID-{index}"),
291        ]
292        .join("\n")
293    }
294}