Skip to content

Commit

Permalink
feat(metrics): expose the uptime of the node
Browse files Browse the repository at this point in the history
  • Loading branch information
RolandSherwin committed Jun 17, 2024
1 parent 81b0c50 commit 1d993a7
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 6 deletions.
14 changes: 14 additions & 0 deletions sn_node/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use prometheus_client::{
},
registry::Registry,
};
use sn_networking::Instant;

#[derive(Clone)]
pub(crate) struct NodeMetrics {
Expand All @@ -35,6 +36,10 @@ pub(crate) struct NodeMetrics {
// wallet
pub(crate) current_reward_wallet_balance: Gauge,
pub(crate) total_forwarded_rewards: Gauge,

// to track the uptime of the node.
pub(crate) started_instant: Instant,
pub(crate) uptime: Gauge,
}

#[derive(EncodeLabelSet, Hash, Clone, Eq, PartialEq, Debug)]
Expand Down Expand Up @@ -109,6 +114,13 @@ impl NodeMetrics {
total_forwarded_rewards.clone(),
);

let uptime = Gauge::default();
sub_registry.register(
"uptime",
"The uptime of the node in seconds",
uptime.clone(),
);

Self {
put_record_ok,
put_record_err,
Expand All @@ -118,6 +130,8 @@ impl NodeMetrics {
peer_removed_from_routing_table,
current_reward_wallet_balance,
total_forwarded_rewards,
started_instant: Instant::now(),
uptime,
}
}

Expand Down
25 changes: 19 additions & 6 deletions sn_node/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use prometheus_client::metrics::gauge::Gauge;
use prometheus_client::registry::Registry;
use rand::{rngs::StdRng, thread_rng, Rng, SeedableRng};
use sn_networking::{
close_group_majority, Network, NetworkBuilder, NetworkError, NetworkEvent, NodeIssue,
close_group_majority, Instant, Network, NetworkBuilder, NetworkError, NetworkEvent, NodeIssue,
SwarmDriver, CLOSE_GROUP_SIZE,
};
use sn_protocol::{
Expand Down Expand Up @@ -68,6 +68,9 @@ const CHUNK_PROOF_VERIFY_RETRY_INTERVAL: Duration = Duration::from_secs(15);
/// Name of the file used to track the forwarded balance. Storing the balance in a file
/// makes it possible to restore it between restarts.
const FORWARDED_BALANCE_FILE_NAME: &str = "forwarded_balance";

/// Interval at which the node's uptime metric is updated.
///
/// The uptime gauge is only refreshed on this tick, so the reported value can lag the
/// real uptime by up to this duration.
const UPTIME_METRICS_UPDATE_INTERVAL: Duration = Duration::from_secs(10);

/// Helper to build and run a Node
pub struct NodeBuilder {
keypair: Keypair,
Expand Down Expand Up @@ -277,14 +280,18 @@ impl Node {
let mut balance_forward_interval = tokio::time::interval(balance_forward_time);
let _ = balance_forward_interval.tick().await; // first tick completes immediately

let mut uptime_metrics_update_interval =
tokio::time::interval(UPTIME_METRICS_UPDATE_INTERVAL);
let _ = uptime_metrics_update_interval.tick().await; // first tick completes immediately

loop {
let peers_connected = &peers_connected;

tokio::select! {
net_event = network_event_receiver.recv() => {
match net_event {
Some(event) => {
let start = std::time::Instant::now();
let start = Instant::now();
let event_string = format!("{event:?}");

self.handle_network_event(event, peers_connected);
Expand All @@ -300,7 +307,7 @@ impl Node {
}
// runs every replication_interval time
_ = replication_interval.tick() => {
let start = std::time::Instant::now();
let start = Instant::now();
trace!("Periodic replication triggered");
let network = self.network.clone();
self.record_metrics(Marker::IntervalReplicationTriggered);
Expand All @@ -312,7 +319,7 @@ impl Node {
}
// runs every bad_nodes_check_time time
_ = bad_nodes_check_interval.tick() => {
let start = std::time::Instant::now();
let start = Instant::now();
trace!("Periodic bad_nodes check triggered");
let network = self.network.clone();
self.record_metrics(Marker::IntervalBadNodesCheckTriggered);
Expand All @@ -332,7 +339,7 @@ impl Node {
_ = balance_forward_interval.tick() => {
if cfg!(feature = "reward-forward") {
if let Some(ref owner) = self.owner {
let start = std::time::Instant::now();
let start = Instant::now();
trace!("Periodic balance forward triggered");
let network = self.network.clone();
let forwarding_reason = owner.clone();
Expand All @@ -352,6 +359,12 @@ impl Node {

}
}
_ = uptime_metrics_update_interval.tick() => {
#[cfg(feature = "open-metrics")]
if let Some(node_metrics) = &self.node_metrics {
let _ = node_metrics.uptime.set(node_metrics.started_instant.elapsed().as_secs() as i64);
}
}
node_cmd = cmds_receiver.recv() => {
match node_cmd {
Ok(cmd) => {
Expand Down Expand Up @@ -380,7 +393,7 @@ impl Node {
/// Handle a network event.
/// Spawns a thread for any likely long-running tasks
fn handle_network_event(&self, event: NetworkEvent, peers_connected: &Arc<AtomicUsize>) {
let start = std::time::Instant::now();
let start = Instant::now();
let event_string = format!("{event:?}");
let event_header;
trace!("Handling NetworkEvent {event_string:?}");
Expand Down

0 comments on commit 1d993a7

Please sign in to comment.