1
0
Fork 0
selfhostblocks/modules/blocks/monitoring/rules.json
ibizaman adbeef3289 reduce alerting time for 5XX error
It's alerting for too long, even after it recovered.
We'll probably need to do something more appropriate than an error budget, but it'll do for now.
2024-09-07 11:06:36 -07:00

131 lines
3.5 KiB
JSON

[
{
"uid": "f5246fa3-163f-4eae-9e1d-5b0fe2af0509",
"title": "5XX Requests Error Budgets Under 99%",
"condition": "threshold",
"data": [
{
"refId": "A",
"queryType": "range",
"relativeTimeRange": {
"from": 21600,
"to": 0
},
"datasourceUid": "cd6cc53e-840c-484d-85f7-96fede324006",
"model": {
"datasource": {
"type": "loki",
"uid": "cd6cc53e-840c-484d-85f7-96fede324006"
},
"editorMode": "code",
"expr": "(sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | status =~ \"[1234]..\" | server_name =~ \".*\" [1h])) / sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | server_name =~ \".*\" [1h])))",
"intervalMs": 1000,
"legendFormat": "{{server_name}}",
"maxDataPoints": 43200,
"queryType": "range",
"refId": "A"
}
},
{
"refId": "last",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": {
"params": [],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "A",
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "last",
"refId": "last",
"type": "reduce"
}
},
{
"refId": "threshold",
"relativeTimeRange": {
"from": 0,
"to": 0
},
"datasourceUid": "__expr__",
"model": {
"conditions": [
{
"evaluator": {
"params": [
0.99
],
"type": "lt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"C"
]
},
"reducer": {
"params": [],
"type": "last"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "last",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "threshold",
"type": "threshold"
}
}
],
"dasboardUid": "d66242cf-71e8-417c-8ef7-51b0741545df",
"panelId": 9,
"noDataState": "OK",
"execErrState": "Error",
"for": "20m",
"annotations": {
"__dashboardUid__": "d66242cf-71e8-417c-8ef7-51b0741545df",
"__panelId__": "9",
"description": "",
"runbook_url": "",
"summary": "The error budget for a service for the last 1 hour is under 99%"
},
"labels": {
"": "",
"role": "sysadmin"
},
"isPaused": false
}
]