diff --git a/ant-node/src/bin/antnode/main.rs b/ant-node/src/bin/antnode/main.rs index 3397d81461..777cf00666 100644 --- a/ant-node/src/bin/antnode/main.rs +++ b/ant-node/src/bin/antnode/main.rs @@ -36,7 +36,7 @@ use std::{ process::Command, time::Duration, }; -use sysinfo::{self, System}; +use sysinfo::{self, Pid, ProcessRefreshKind, System}; use tokio::{ runtime::Runtime, sync::{broadcast::error::RecvError, mpsc}, @@ -413,12 +413,14 @@ You can check your reward balance by running: } }); let ctrl_tx_clone_cpu = ctrl_tx.clone(); - // Monitor host CPU usage + let pid = Pid::from_u32(std::process::id()); + // Monitor process CPU usage tokio::spawn(async move { use rand::{thread_rng, Rng}; const CPU_CHECK_INTERVAL: Duration = Duration::from_secs(60); - const CPU_USAGE_THRESHOLD: f32 = 50.0; + // As this is now a per-process CPU usage, it shall never exceeds 10% for too long time. + const CPU_USAGE_THRESHOLD: f64 = 10.0; const HIGH_CPU_CONSECUTIVE_LIMIT: u8 = 5; const NODE_STOP_DELAY: Duration = Duration::from_secs(1); const INITIAL_DELAY_MIN_S: u64 = 10; @@ -427,38 +429,50 @@ You can check your reward balance by running: const JITTER_MIN_S: u64 = 1; const JITTER_MAX_S: u64 = 15; - let mut sys = System::new_all(); - let mut high_cpu_count: u8 = 0; + let process_refresh_kind = ProcessRefreshKind::everything() + .without_disk_usage() + .without_memory(); + let mut system = System::new_all(); + let physical_core_count = system.physical_core_count(); + // Random initial delay between 1 and 5 minutes let initial_delay = Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S)); tokio::time::sleep(initial_delay).await; loop { - sys.refresh_cpu(); - let cpu_usage = sys.global_cpu_info().cpu_usage(); - - if cpu_usage > CPU_USAGE_THRESHOLD { - high_cpu_count += 1; - } else { - high_cpu_count = 0; - } + system.refresh_process_specifics(pid, process_refresh_kind); + if let (Some(process), Some(core_count)) = (system.process(pid), physical_core_count) { + // divide by core_count to get value between 0-100 + let cpu_usage = + ((process.cpu_usage() as f64 / core_count as f64) * 10000.0).round() / 10000.0; + info!( + "Detected process {pid} CPU usage is {cpu_usage:?} (with {core_count} cores)" + ); + + if cpu_usage > CPU_USAGE_THRESHOLD { + high_cpu_count += 1; + } else { + high_cpu_count = 0; + } - if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT { - if let Err(err) = ctrl_tx_clone_cpu - .send(NodeCtrl::Stop { - delay: NODE_STOP_DELAY, - result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")), - }) - .await - { - error!("Failed to send node control msg to antnode bin main thread: {err}"); + if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT { + if let Err(err) = ctrl_tx_clone_cpu + .send(NodeCtrl::Stop { + delay: NODE_STOP_DELAY, + result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")), + }) + .await + { + error!("Failed to send node control msg to antnode bin main thread: {err}"); + } + break; } - break; + } else { + error!("Cann't refresh systeminfo of process {pid} with OS core_count of {physical_core_count:?}"); } - // Add jitter to the interval let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S)); tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await;