From 6ac6c8af86bbb394cbb0c51d09def03842b15a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Avila=20Gast=C3=B3n?= <72628438+avilagaston9@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:50:33 -0300 Subject: [PATCH] feat: add latency metrics to batcher (#1578) Co-authored-by: Marcos Nicolau --- batcher/aligned-batcher/src/lib.rs | 31 ++- batcher/aligned-batcher/src/metrics.rs | 18 ++ .../aligned/aggregator_batcher.json | 262 +++++++++++++++++- 3 files changed, 298 insertions(+), 13 deletions(-) diff --git a/batcher/aligned-batcher/src/lib.rs b/batcher/aligned-batcher/src/lib.rs index 40438c8d2..e5588d4e3 100644 --- a/batcher/aligned-batcher/src/lib.rs +++ b/batcher/aligned-batcher/src/lib.rs @@ -12,7 +12,7 @@ use retry::batcher_retryables::{ user_balance_is_unlocked_retryable, }; use retry::{retry_function, RetryError}; -use tokio::time::timeout; +use tokio::time::{timeout, Instant}; use types::batch_state::BatchState; use types::user_state::UserState; @@ -1524,6 +1524,7 @@ impl Batcher { proof_submitters: Vec
, fee_params: CreateNewTaskFeeParams, ) -> Result { + let start = Instant::now(); let result = retry_function( || { create_new_task_retryable( @@ -1542,6 +1543,11 @@ impl Batcher { ETHEREUM_CALL_MAX_RETRY_DELAY, ) .await; + self.metrics + .create_new_task_duration + .set(start.elapsed().as_millis() as i64); + // Set to zero since it is not always executed + self.metrics.cancel_create_new_task_duration.set(0); match result { Ok(receipt) => { if let Err(e) = self @@ -1600,10 +1606,11 @@ impl Batcher { /// After 2 hours (attempt 13), retries occur hourly for 1 day (33 retries). pub async fn cancel_create_new_task_tx(&self, old_tx_gas_price: U256) { info!("Cancelling createNewTask transaction..."); + let start = Instant::now(); let iteration = Arc::new(Mutex::new(0)); let previous_gas_price = Arc::new(Mutex::new(old_tx_gas_price)); - if let Err(e) = retry_function( + match retry_function( || async { let mut iteration = iteration.lock().await; let mut previous_gas_price = previous_gas_price.lock().await; @@ -1639,11 +1646,12 @@ impl Batcher { ) .await { - error!("Could not cancel createNewTask transaction: {e}"); - return; + Ok(_) => info!("createNewTask transaction successfully canceled"), + Err(e) => error!("Could not cancel createNewTask transaction: {e}"), }; - - info!("createNewTask transaction successfully canceled"); + self.metrics + .cancel_create_new_task_duration + .set(start.elapsed().as_millis() as i64); } /// Only relevant for testing and for users to easily use Aligned @@ -1785,7 +1793,8 @@ impl Batcher { batch_bytes: &[u8], file_name: &str, ) -> Result<(), BatcherError> { - retry_function( + let start = Instant::now(); + let result = retry_function( || { Self::upload_batch_to_s3_retryable( batch_bytes, @@ -1800,7 +1809,13 @@ impl Batcher { ETHEREUM_CALL_MAX_RETRY_DELAY, ) .await - .map_err(|e| BatcherError::BatchUploadError(e.to_string())) + .map_err(|e| BatcherError::BatchUploadError(e.to_string())); + + self.metrics + .s3_duration + .set(start.elapsed().as_micros() as i64); + + result } async fn upload_batch_to_s3_retryable( diff --git a/batcher/aligned-batcher/src/metrics.rs b/batcher/aligned-batcher/src/metrics.rs index a7c6f26e3..dccab58f3 100644 --- a/batcher/aligned-batcher/src/metrics.rs +++ b/batcher/aligned-batcher/src/metrics.rs @@ -19,6 +19,9 @@ pub struct BatcherMetrics { pub batcher_started: IntCounter, pub gas_price_used_on_latest_batch: IntGauge, pub broken_ws_connections: IntCounter, + pub s3_duration: IntGauge, + pub create_new_task_duration: IntGauge, + pub cancel_create_new_task_duration: IntGauge, } impl BatcherMetrics { @@ -46,6 +49,15 @@ impl BatcherMetrics { "broken_ws_connections_count", "Broken websocket connections" ))?; + let s3_duration = register_int_gauge!(opts!("s3_duration", "S3 Duration"))?; + let create_new_task_duration = register_int_gauge!(opts!( + "create_new_task_duration", + "Create New Task Duration" + ))?; + let cancel_create_new_task_duration = register_int_gauge!(opts!( + "cancel_create_new_task_duration", + "Cancel create New Task Duration" + ))?; registry.register(Box::new(open_connections.clone()))?; registry.register(Box::new(received_proofs.clone()))?; @@ -56,6 +68,9 @@ impl BatcherMetrics { registry.register(Box::new(gas_price_used_on_latest_batch.clone()))?; registry.register(Box::new(batcher_started.clone()))?; registry.register(Box::new(broken_ws_connections.clone()))?; + registry.register(Box::new(s3_duration.clone()))?; + registry.register(Box::new(create_new_task_duration.clone()))?; + registry.register(Box::new(cancel_create_new_task_duration.clone()))?; let metrics_route = warp::path!("metrics") .and(warp::any().map(move || registry.clone())) @@ -77,6 +92,9 @@ impl BatcherMetrics { batcher_started, gas_price_used_on_latest_batch, broken_ws_connections, + s3_duration, + create_new_task_duration, + cancel_create_new_task_duration, }) } diff --git a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json index 52de76921..71167098a 100644 --- a/grafana/provisioning/dashboards/aligned/aggregator_batcher.json +++ b/grafana/provisioning/dashboards/aligned/aggregator_batcher.json @@ -2650,6 +2650,126 @@ ], "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 69 + }, + "id": 46, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "

\n Latency\n

", + "mode": "html" + }, + "pluginVersion": "10.1.10", + "transparent": true, + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 48, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 71 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "s3_duration * 10 ^ (-3)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Upload Batch to S3 Duration", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", @@ -2714,7 +2834,7 @@ "h": 8, "w": 12, "x": 12, - "y": 61 + "y": 71 }, "id": 43, "interval": "1s", @@ -2748,6 +2868,138 @@ "title": "Latest respond to task latency", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 48, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 79 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "cancel_create_new_task_duration * 10 ^ (-3)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "cancel_new_task", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "create_new_task_duration * 10 ^(-3)", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "create_new_task", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "CreateNewTask Duration", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "total", + "mode": "reduceRow", + "reduce": { + "include": [ + "cancel_new_task", + "create_new_task" + ], + "reducer": "sum" + } + } + } + ], + "type": "timeseries" + }, { "datasource": { "type": "prometheus", @@ -2810,7 +3062,7 @@ "h": 8, "w": 12, "x": 12, - "y": 69 + "y": 79 }, "id": 44, "interval": "1s", @@ -2853,13 +3105,13 @@ "list": [] }, "time": { - "from": "now-30m", + "from": "now-5m", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "System Data", "uid": "aggregator", - "version": 19, + "version": 7, "weekStart": "" -} +} \ No newline at end of file