From 47c41a0b8d7a0c2ea78588dd9fb35b2bb3ddf4ca Mon Sep 17 00:00:00 2001 From: Gianluca Zuccarelli Date: Mon, 1 Nov 2021 10:45:22 +0000 Subject: [PATCH] templates: add latency metrics to dashboard Update the grafana dashboard with metrics for request latency, including error budget burn for compose latency. --- ...age-builder-composer-general.configmap.yml | 444 +++++++++++++++++- 1 file changed, 422 insertions(+), 22 deletions(-) diff --git a/templates/dashboards/grafana-dashboard-image-builder-composer-general.configmap.yml b/templates/dashboards/grafana-dashboard-image-builder-composer-general.configmap.yml index 4ee29e5f0..ebe9cf19e 100644 --- a/templates/dashboards/grafana-dashboard-image-builder-composer-general.configmap.yml +++ b/templates/dashboards/grafana-dashboard-image-builder-composer-general.configmap.yml @@ -29,10 +29,13 @@ data: ] }, "editable": true, + "fiscalYearStartMonth": 0, "gnetId": null, "graphTooltip": 0, - "iteration": 1635445778494, + "id": 207, + "iteration": 1635760556540, "links": [], + "liveNow": false, "panels": [ { "collapsed": false, @@ -50,7 +53,7 @@ data: }, { "datasource": "${datasource}", - "description": "The percentage of successful compose requests for the selected time range and interval", + "description": "The percentage of successful compose requests for the selected time range", "fieldConfig": { "defaults": { "color": { @@ -123,22 +126,22 @@ data: "text": {}, "textMode": "auto" }, - "pluginVersion": "8.1.5", + "pluginVersion": "8.2.1", "targets": [ { "exemplar": true, - "expr": "sum(increase(total_successful_compose_requests[$__range]))/sum(increase(total_compose_requests[$__range]))", + "expr": "1 - sum(increase(total_failed_compose_requests[$__range]))/sum(increase(total_compose_requests[$__range]))", "interval": "", "legendFormat": "", "refId": "A" } ], - "title": "Compose Success Rate", + "title": "Compose Request Success Rate", "type": "stat" }, { "datasource": "${datasource}", - "description": "The number of
total compose requests for the selected interval", + "description": "The number of total compose requests for the selected date range", "fieldConfig": { "defaults": { "color": { @@ -180,7 +183,7 @@ data: "text": {}, "textMode": "auto" }, - "pluginVersion": "8.1.5", + "pluginVersion": "8.2.1", "targets": [ { "exemplar": true, @@ -195,7 +198,7 @@ data: }, { "datasource": "${datasource}", - "description": "The number of compose errors (as a percentage) over time for the selected time range and interval", + "description": "The number of compose errors (as a percentage) over time for the selected time range", "fieldConfig": { "defaults": { "color": { @@ -264,7 +267,7 @@ data: "targets": [ { "exemplar": true, - "expr": "1 - sum(increase(total_successful_compose_requests[$__range]))/sum(increase(total_compose_requests[$__range]))", + "expr": "sum(increase(total_failed_compose_requests[$__range]))/sum(increase(total_compose_requests[$__range]))", "interval": "", "legendFormat": "", "refId": "A" @@ -276,7 +279,7 @@ data: { "cacheTimeout": 1, "datasource": "${datasource}", - "description": "How long will it take to consume all our budget if our error consumption remains at the current rate for the selected interval.", + "description": "How long will it take to consume all our budget if our error consumption remains at the current rate for the selected date range.", "fieldConfig": { "defaults": { "color": { @@ -293,6 +296,17 @@ data: } }, "type": "special" + }, + { + "options": { + "from": 672, + "result": { + "index": 1, + "text": "∞" + }, + "to": 3360100 + }, + "type": "range" } ], "thresholds": { @@ -343,11 +357,11 @@ data: }, "textMode": "auto" }, - "pluginVersion": "8.1.5", + "pluginVersion": "8.2.1", "targets": [ { "exemplar": true, - "expr": "28 * 24 * $stability_error_budget / ((1 - sum(rate(total_successful_compose_requests[$__range])) by (job) / sum(rate(total_compose_requests[$__range])) by (job)))", + "expr": "28 * 24 * (1 - $stability_slo) / 
((sum(rate(total_failed_compose_requests[$__range]))/ sum(rate(total_compose_requests[$__range]))) + 0.001)", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -362,8 +376,8 @@ data: }, { "cacheTimeout": null, - "datasource": "$datasource", - "description": "The percentage of error budget consumed for the selected time range and interval. ", + "datasource": "${datasource}", + "description": "The percentage of error budget consumed for the selected time range. ", "fieldConfig": { "defaults": { "color": { @@ -446,7 +460,393 @@ data: "targets": [ { "exemplar": true, - "expr": "1 - ((sum(increase(total_successful_compose_requests[28d]))/sum(increase(total_compose_requests[28d]))) - $stability_slo)/ (1 - $stability_slo)", + "expr": "1 - ((1 - sum(increase(total_failed_compose_requests[$__range]))/sum(increase(total_compose_requests[$__range]))) - $stability_slo)/ (1 - $stability_slo)", + "instant": false, + "interval": "", + "intervalFactor": 10, + "legendFormat": "errorbudget", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Error Budget Consumed", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 129, + "panels": [], + "title": "Compose Latency", + "type": "row" + }, + { + "datasource": "${datasource}", + "description": "The 90th percentile compose request latency for the selected time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "index": 0, + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": "175" + }, + { + "color": "red", + "value": "200" + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 18 + }, + "id": 200, +
"mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "8.2.1", + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(composer_http_duration_seconds_bucket[$__range])) by (le)) * 1000", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Compose Latency", + "type": "stat" + }, + { + "datasource": "${datasource}", + "description": "The request latency for composer requests over the selected date range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 35, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": "175" + }, + { + "color": "red", + "value": "200" + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 19, + "x": 5, + "y": 18 + }, + "id": 201, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.9, sum(rate(composer_http_duration_seconds_bucket[$__range])) by (le)) * 
1000", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Compose Request Latency", + "type": "timeseries" + }, + { + "cacheTimeout": 1, + "datasource": "${datasource}", + "description": "How long will it take to consume all our budget if our error consumption remains at the current rate for the selected date range.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "index": 0, + "text": "1.40 days" + } + }, + "type": "special" + }, + { + "options": { + "from": 672, + "result": { + "index": 1, + "text": "∞" + }, + "to": 3360100 + }, + "type": "range" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "#EAB839", + "value": 40 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "h" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 26 + }, + "id": 198, + "interval": null, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": { + "valueSize": 80 + }, + "textMode": "auto" + }, + "pluginVersion": "8.2.1", + "targets": [ + { + "exemplar": true, + "expr": "28 * 24 * (1 - $latency_slo) / (1 - sum(rate(composer_http_duration_seconds_bucket{le=\"0.2\"}[$__range]))/sum(rate(composer_http_duration_seconds_count[$__range])))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Error Budget Remaining", + "type": "stat" + }, + { + "cacheTimeout": null, + "datasource": "${datasource}", + "description": "The percentage of error budget consumed for the selected time range. 
", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.95 + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 20, + "x": 4, + "y": 26 + }, + "id": 199, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.1.5", + "targets": [ + { + "exemplar": true, + "expr": "1 - ((sum(increase(composer_http_duration_seconds_bucket{le=\"0.2\"}[$__range]))/sum(increase(composer_http_duration_seconds_count[$__range]))) - $latency_slo)/ (1 - $latency_slo)", "instant": false, "interval": "", "intervalFactor": 10, @@ -461,14 +861,14 @@ data: } ], "refresh": false, - "schemaVersion": 30, + "schemaVersion": 31, "style": "dark", "tags": [], "templating": { "list": [ { "current": { - "selected": false, + "selected": true, "text": "app-sre-prod-04-prometheus", "value": "app-sre-prod-04-prometheus" }, @@ -492,7 +892,7 @@ data: "auto_count": 30, "auto_min": "10s", "current": { - "selected": true, + "selected": false, "text": "28d", "value": "28d" }, @@ -570,12 +970,12 @@ data: "type": "constant" }, { - "description": "Compose stability error budget", + "description": 
"Compose latency SLO target", "error": null, "hide": 2, "label": null, - "name": "stability_error_budget", - "query": "0.05", + "name": "latency_slo", + "query": "0.9", "skipUrlSync": false, "type": "constant" } @@ -613,5 +1013,5 @@ data: "timezone": "", "title": "Image Builder Composer", "uid": "cNGfs4Knz", - "version": 1 + "version": 2 }