diff --git a/bootstrap.sh b/bootstrap.sh
index 0c9f880..cc934c9 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -3,6 +3,9 @@ set -e

 BOOTSTRAP_DIR="bootstrap/overlays/default/"
 ARGO_NS=mlops-demo-gitops
+GRAFANA_PROMETHEUS_OPERATORS_DIR="components/grafana_prometheus/operators/overlays/default/"
+GRAFANA_PROMETHEUS_NS=mlops-demo-dev
+GRAFANA_PROMETHEUS_SERVICES_DIR="components/grafana_prometheus/services/overlays/default/"

 # check login
 check_oc_login(){
@@ -25,6 +28,63 @@ main(){
     echo "https://${route}"
 }

+# Re-run a command until it succeeds, giving up after a bounded number of tries.
+# Arguments: $1 - maximum attempts, $2 - seconds to sleep between attempts,
+#            remaining arguments - the command to run.
+# Returns:   0 as soon as the command succeeds, 1 once every attempt has failed.
+retry() {
+    local attempts="$1"
+    local delay="$2"
+    local try=1
+    shift 2
+
+    # A command used as the `until` condition does not trip `set -e`.
+    until "$@"; do
+        if [ "${try}" -ge "${attempts}" ]; then
+            echo "giving up after ${attempts} attempts: $*" >&2
+            return 1
+        fi
+        echo "not ready yet (attempt ${try}/${attempts}), retrying in ${delay}s" >&2
+        try=$((try + 1))
+        sleep "${delay}"
+    done
+}
+
+# Render the kustomization in directory $1 and apply it to the cluster.
+apply_kustomization() {
+    kustomize build "$1" | oc apply -f -
+}
+
+# Apply the Grafana & Prometheus operators, then the services built on them,
+# and print the URL of the Grafana route.
+# Globals: GRAFANA_PROMETHEUS_OPERATORS_DIR, GRAFANA_PROMETHEUS_SERVICES_DIR,
+#          GRAFANA_PROMETHEUS_NS (all read).
+grafana_prometheus() {
+    local route
+
+    echo "Applying grafana & prometheus operators to ${GRAFANA_PROMETHEUS_NS}"
+    apply_kustomization "${GRAFANA_PROMETHEUS_OPERATORS_DIR}"
+
+    echo ""
+
+    echo "Applying grafana & prometheus services to ${GRAFANA_PROMETHEUS_NS}"
+    # The services are custom resources (e.g. kind: Grafana) that presumably
+    # cannot be applied until the operators have registered their CRDs, so
+    # retry the apply for up to ~5 minutes instead of sleeping a fixed 60s.
+    retry 30 10 apply_kustomization "${GRAFANA_PROMETHEUS_SERVICES_DIR}"
+
+    echo ""
+    echo "Grafana Route:
+    "
+    # NOTE(review): grafana-route is presumably created asynchronously by the
+    # operator, so wait for it to exist before reading its host.
+    retry 30 10 oc get route grafana-route -n "${GRAFANA_PROMETHEUS_NS}" >/dev/null
+    route=$(oc get route grafana-route -n "${GRAFANA_PROMETHEUS_NS}" -o jsonpath='{.spec.host}')
+    echo "https://${route}"
+}
+
 check_oc_login

-main
\ No newline at end of file
+main
+
+grafana_prometheus
\ No newline at end of file
diff --git a/components/grafana_prometheus/operators/base/grafana-operator-subs.yaml b/components/grafana_prometheus/operators/base/grafana-operator-subs.yaml
new file mode 100644
index 0000000..6974311
--- /dev/null
+++ b/components/grafana_prometheus/operators/base/grafana-operator-subs.yaml
@@ -0,0 +1,10 @@
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: grafana-operator
+spec:
+  channel: v4
+  installPlanApproval: Automatic
+  name: grafana-operator
+  source: community-operators
+  sourceNamespace: openshift-marketplace
diff --git a/components/grafana_prometheus/operators/base/kustomization.yaml b/components/grafana_prometheus/operators/base/kustomization.yaml
new file mode 100644
index 0000000..5647e75
--- /dev/null
+++ b/components/grafana_prometheus/operators/base/kustomization.yaml
@@ -0,0 +1,10
@@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mlops-demo-dev + +resources: + - operatorgroup.yaml + - seldon-subs.yaml + - prometheus-operator-subs.yaml + - grafana-operator-subs.yaml diff --git a/components/grafana_prometheus/operators/base/operatorgroup.yaml b/components/grafana_prometheus/operators/base/operatorgroup.yaml new file mode 100644 index 0000000..b092cd1 --- /dev/null +++ b/components/grafana_prometheus/operators/base/operatorgroup.yaml @@ -0,0 +1,8 @@ +apiVersion: operators.coreos.com/v1 +kind: OperatorGroup +metadata: + name: mlops-demo-dev + namespace: mlops-demo-dev +spec: + targetNamespaces: + - mlops-demo-dev diff --git a/components/grafana_prometheus/operators/base/prometheus-operator-subs.yaml b/components/grafana_prometheus/operators/base/prometheus-operator-subs.yaml new file mode 100644 index 0000000..9e23649 --- /dev/null +++ b/components/grafana_prometheus/operators/base/prometheus-operator-subs.yaml @@ -0,0 +1,10 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: prometheus +spec: + channel: beta + installPlanApproval: Automatic + name: prometheus + source: community-operators + sourceNamespace: openshift-marketplace diff --git a/components/grafana_prometheus/operators/base/seldon-subs.yaml b/components/grafana_prometheus/operators/base/seldon-subs.yaml new file mode 100644 index 0000000..a936576 --- /dev/null +++ b/components/grafana_prometheus/operators/base/seldon-subs.yaml @@ -0,0 +1,10 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: seldon-operator-certified +spec: + channel: stable + installPlanApproval: Automatic + name: seldon-operator-certified + source: certified-operators + sourceNamespace: openshift-marketplace diff --git a/components/grafana_prometheus/operators/overlays/default/kustomization.yaml b/components/grafana_prometheus/operators/overlays/default/kustomization.yaml new file mode 100644 index 0000000..2e9e797 
--- /dev/null +++ b/components/grafana_prometheus/operators/overlays/default/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +bases: + - ../../base diff --git a/components/grafana_prometheus/services/base/grafana-instance.yaml b/components/grafana_prometheus/services/base/grafana-instance.yaml new file mode 100644 index 0000000..0c2b3cd --- /dev/null +++ b/components/grafana_prometheus/services/base/grafana-instance.yaml @@ -0,0 +1,24 @@ +apiVersion: integreatly.org/v1alpha1 +kind: Grafana +metadata: + name: grafana +spec: + config: + auth: + disable_signout_menu: true + auth.anonymous: + enabled: true + log: + level: warn + mode: console + security: + admin_password: secret + admin_user: root + ingress: + enabled: true + dashboardLabelSelector: + - matchExpressions: + - key: app + operator: In + values: + - grafana diff --git a/components/grafana_prometheus/services/base/grafana-prometheus-datasource.yaml b/components/grafana_prometheus/services/base/grafana-prometheus-datasource.yaml new file mode 100644 index 0000000..909fdec --- /dev/null +++ b/components/grafana_prometheus/services/base/grafana-prometheus-datasource.yaml @@ -0,0 +1,16 @@ +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDataSource +metadata: + name: prometheus +spec: + datasources: + - access: proxy + editable: true + isDefault: true + jsonData: + timeInterval: 5s + name: prometheus + type: prometheus + url: "http://prometheus-operated:9090" + version: 1 + name: mlops-prometheus.yaml diff --git a/components/grafana_prometheus/services/base/kustomization.yaml b/components/grafana_prometheus/services/base/kustomization.yaml new file mode 100644 index 0000000..0d9a837 --- /dev/null +++ b/components/grafana_prometheus/services/base/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mlops-demo-dev + +resources: + - prometheus-instance.yaml + - prometheus-route.yaml + - 
rest-seldon-model-servicemonitor.yaml + - grafana-instance.yaml + - grafana-prometheus-datasource.yaml + - prediction-analytics-seldon-core-1.2.2.yaml diff --git a/components/grafana_prometheus/services/base/prediction-analytics-seldon-core-1.2.2.yaml b/components/grafana_prometheus/services/base/prediction-analytics-seldon-core-1.2.2.yaml new file mode 100644 index 0000000..383e784 --- /dev/null +++ b/components/grafana_prometheus/services/base/prediction-analytics-seldon-core-1.2.2.yaml @@ -0,0 +1,1360 @@ +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + name: prediction-analytics-seldon-core-1.2.2 + labels: + app: grafana +spec: + json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Prediction Analytics based on original. Changed for Seldon 1.2.2 operator", + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 5, + "iteration": 1604834019741, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 29, + "panels": [], + "repeat": null, + "title": "Heading", + "type": "row" + }, + { + "content": "
\n Seldon Deploy Prediction API Dashboard\n
", + "datasource": null, + "gridPos": { + "h": 2, + "w": 22, + "x": 0, + "y": 1 + }, + "id": 27, + "links": [], + "mode": "html", + "options": {}, + "title": "", + "type": "text" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 30, + "panels": [], + "repeat": null, + "title": "Global Counts", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "ops", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 10, + "x": 0, + "y": 4 + }, + "id": 16, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "round(sum(irate(seldon_api_executor_server_requests_seconds_count[1m])), 0.001)", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "Global Request Rate", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + 
], + "datasource": "prometheus", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 10, + "y": 4 + }, + "id": 17, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(seldon_api_executor_server_requests_seconds_count{status!~\"5.*\"}[1m])) / sum(rate(seldon_api_executor_server_requests_seconds_count[1m]))", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "Success", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "format": "ops", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 16, + "y": 4 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + 
"options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(irate(seldon_api_executor_server_requests_seconds_count{code=~\"4.*\"}[1m])) ", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "4xxs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 32, + "panels": [], + "repeat": null, + "title": "API Latency", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 22, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(seldon_api_executor_server_requests_seconds_count{exported_service=~\"predictions\",namespace=~\"$project\",deployment_name=~\"$deployment\",code=\"200\"}[1m]))by (project_name, deployment_name, deployment_version, exported_service, code),0.0001)", + "format": "time_series", + "hide": false, + 
"intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}} ", + "metric": "", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Prediction API req/sec", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 33, + "panels": [], + "repeat": null, + "title": "API Percentiles", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 22, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(seldon_api_executor_server_requests_seconds_bucket{exported_service=~\"predictions\",namespace=~\"$project\",deployment_name=~\"$deployment\",code=\"200\"}[1m])) by (namespace, deployment_name, exported_service, code,le))", + 
"format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}} :{{code}} (p50)", + "refId": "A", + "step": 2 + }, + { + "expr": "histogram_quantile(0.75, sum(rate(seldon_api_executor_server_requests_seconds_bucket{exported_service=~\"predictions\",namespace=~\"$project\",deployment_name=~\"$deployment\",code=\"200\"}[1m])) by (namespace, deployment_name, exported_service, code,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}} :{{code}} (p75)", + "metric": "", + "refId": "B", + "step": 2 + }, + { + "expr": "histogram_quantile(0.9, sum(rate(seldon_api_executor_server_requests_seconds_bucket{exported_service=~\"predictions\",namespace=~\"$project\",deployment_name=~\"$deployment\",code=\"200\"}[1m])) by (namespace, deployment_name, exported_service, code,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}} :{{code}} (p90)", + "refId": "C", + "step": 2 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(seldon_api_executor_server_requests_seconds_bucket{exported_service=~\"predictions\",namespace=~\"$project\",deployment_name=~\"$deployment\",code=\"200\"}[1m])) by (namespace, deployment_name, exported_service, code,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}} :{{code}} (p99)", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Prediction API Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 34, + "panels": [], + "repeat": null, + "title": "Predictive Units", + "type": "row" + }, + { + "content": "
\n Models\n
", + "datasource": null, + "gridPos": { + "h": 2, + "w": 22, + "x": 0, + "y": 22 + }, + "id": 8, + "links": [], + "mode": "html", + "options": {}, + "title": "", + "type": "text" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 35, + "panels": [], + "repeat": "model_image", + "scopedVars": { + "model_image": { + "selected": false, + "text": "iot-anomaly-detection", + "value": "iot-anomaly-detection" + } + }, + "title": "Model Metrics", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 10, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "scopedVars": { + "model_image": { + "selected": false, + "text": "iot-anomaly-detection", + "value": "iot-anomaly-detection" + } + }, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(seldon_api_executor_client_requests_seconds_count{model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_image,model_version,model_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} ({{model_name}} {{model_image}} : {{model_version}})", + "metric": 
"io_seldon_apife_api_rest_RestClientController_home_snapshot_75thPercentile", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reqs/sec to $model_image", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 11, + "y": 25 + }, + "hiddenSeries": false, + "id": 11, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "scopedVars": { + "model_image": { + "selected": false, + "text": "iot-anomaly-detection", + "value": "iot-anomaly-detection" + } + }, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + 
"hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p50)", + "metric": "", + "refId": "E", + "step": 2 + }, + { + "expr": "histogram_quantile(0.75, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}} : {{model_version}} (p75)", + "metric": "", + "refId": "B", + "step": 2 + }, + { + "expr": "histogram_quantile(0.9, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p90)", + "metric": "", + "refId": "A", + "step": 2 + }, + { + "expr": "histogram_quantile(0.95, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + 
"format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p95)", + "metric": "", + "refId": "C", + "step": 2 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p99)", + "metric": "", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "$model_image Latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 36, + "panels": [], + "repeat": null, + "repeatIteration": 1604834019741, + "repeatPanelId": 35, + "scopedVars": { + "model_image": { + "selected": false, + "text": "seldonio/mock_classifier_rest", + "value": "seldonio/mock_classifier_rest" + } + }, + "title": "Model Metrics", + "type": "row" + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 10, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeatIteration": 1604834019741, + "repeatPanelId": 7, + "repeatedByRow": true, + "scopedVars": { + "model_image": { + "selected": false, + "text": "seldonio/mock_classifier_rest", + "value": "seldonio/mock_classifier_rest" + } + }, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(seldon_api_executor_client_requests_seconds_count{model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_image,model_version,model_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} ({{model_name}} {{model_image}} : {{model_version}})", + "metric": "io_seldon_apife_api_rest_RestClientController_home_snapshot_75thPercentile", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reqs/sec to $model_image", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 11, + "y": 35 + }, + "hiddenSeries": false, + "id": 38, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeatIteration": 1604834019741, + "repeatPanelId": 11, + "repeatedByRow": true, + "scopedVars": { + "model_image": { + "selected": false, + "text": "seldonio/mock_classifier_rest", + "value": "seldonio/mock_classifier_rest" + } + }, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p50)", + "metric": "", + "refId": "E", + "step": 2 + }, + { + "expr": "histogram_quantile(0.75, 
sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}} : {{model_version}} (p75)", + "metric": "", + "refId": "B", + "step": 2 + }, + { + "expr": "histogram_quantile(0.9, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p90)", + "metric": "", + "refId": "A", + "step": 2 + }, + { + "expr": "histogram_quantile(0.95, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p95)", + "metric": "", + "refId": "C", + "step": 2 + }, + { + "expr": 
"histogram_quantile(0.99, sum(rate(seldon_api_executor_client_requests_seconds_bucket{exported_service=~\".*predict\",model_image=~\"$model_image\",namespace=~\"$project\",deployment_name=~\"$deployment\",predictor_name=~\"$predictor\",predictor_version=~\"$version\"}[1m])) by (namespace,deployment_name,predictor_name,predictor_version,model_name,model_image,model_version,le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{project_name}}/{{deployment_name}}:{{deployment_version}}/{{predictor_name}}:{{predictor_version}} {{model_name}}: {{model_version}} (p99)", + "metric": "", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "$model_image Latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_server_requests_seconds_count,namespace)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "project", + "options": [], + "query": "label_values(seldon_api_executor_server_requests_seconds_count,namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + 
"current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_client_requests_seconds_count,deployment_name)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "deployment", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,deployment_name)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_client_requests_seconds_count,predictor_name)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "predictor", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,predictor_name)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_client_requests_seconds_count,predictor_version)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "version", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,predictor_version)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "text": "anomaly-detection", + "value": "anomaly-detection" + }, + "datasource": "prometheus", + "definition": 
"label_values(seldon_api_executor_client_requests_seconds_count,model_name)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "model_name", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,model_name)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_client_requests_seconds_count,model_image)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "model_image", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,model_image)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(seldon_api_executor_client_requests_seconds_count,model_version)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "model_version", + "options": [], + "query": "label_values(seldon_api_executor_client_requests_seconds_count,model_version)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Prediction Analytics", + 
"uid": "th6ThPhGz", + "version": 7 + } + name: prediction-analytics-seldon-core-1.2.2.json diff --git a/components/grafana_prometheus/services/base/prometheus-instance.yaml b/components/grafana_prometheus/services/base/prometheus-instance.yaml new file mode 100644 index 0000000..a6180b7 --- /dev/null +++ b/components/grafana_prometheus/services/base/prometheus-instance.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: prometheus + labels: + prometheus: k8s + seldon-app: iris-inference-service-iris-inference-service + +spec: + serviceAccountName: prometheus-k8s + serviceMonitorSelector: {} + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web + securityContext: {} + replicas: 2 + ruleSelector: {} diff --git a/components/grafana_prometheus/services/base/prometheus-route.yaml b/components/grafana_prometheus/services/base/prometheus-route.yaml new file mode 100644 index 0000000..f4cd34c --- /dev/null +++ b/components/grafana_prometheus/services/base/prometheus-route.yaml @@ -0,0 +1,13 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: prometheus +spec: + port: + targetPort: web + to: + kind: Service + name: prometheus-operated + weight: 100 + wildcardPolicy: None +status: {} diff --git a/components/grafana_prometheus/services/base/rest-seldon-model-servicemonitor.yaml b/components/grafana_prometheus/services/base/rest-seldon-model-servicemonitor.yaml new file mode 100644 index 0000000..6c11f80 --- /dev/null +++ b/components/grafana_prometheus/services/base/rest-seldon-model-servicemonitor.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + seldon-app: iris-inference-service-iris-inference-service + name: iris-inference-service + namespace: mlops-demo-dev +spec: + endpoints: + - interval: 30s + path: /prometheus + port: http + selector: + matchLabels: + seldon-app: iris-inference-service-iris-inference-service diff 
--git a/components/grafana_prometheus/services/overlays/default/kustomization.yaml b/components/grafana_prometheus/services/overlays/default/kustomization.yaml new file mode 100644 index 0000000..2e9e797 --- /dev/null +++ b/components/grafana_prometheus/services/overlays/default/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +bases: + - ../../base diff --git a/components/tekton/pipelines/iris-inference-service/base/mlops-demo-application-gitops-github-ssh-key-sealed-secret.yaml b/components/tekton/pipelines/iris-inference-service/base/mlops-demo-application-gitops-github-ssh-key-sealed-secret.yaml index 97998b6..24a14dd 100644 --- a/components/tekton/pipelines/iris-inference-service/base/mlops-demo-application-gitops-github-ssh-key-sealed-secret.yaml +++ b/components/tekton/pipelines/iris-inference-service/base/mlops-demo-application-gitops-github-ssh-key-sealed-secret.yaml @@ -14,4 +14,3 @@ spec: name: mlops-demo-application-gitops-github-ssh-key namespace: mlops-demo-pipelines type: kubernetes.io/ssh-auth -