diff --git a/README.md b/README.md index 0178285..2657c11 100644 --- a/README.md +++ b/README.md @@ -326,43 +326,7 @@ See the [`ldap.nix`](./modules/ldap.nix) and [`authelia.nix`](./modules/authelia ### Deploy the full Grafana, Prometheus and Loki suite -This is not a prerequisite for anything and could be enabled just for debugging. - -```nix -shb.monitoring = { - enable = true; - subdomain = "grafana"; - inherit domain; -}; -``` - -With that, Grafana, Prometheus, Loki and Promtail are setup! You can access `Grafana` at -`grafana.example.com`. - -A few Prometheus metrics scrapers are setup automatically: -- node - cpu, memory, disk I/O, network I/O and a few others of the computer -- smartctl - hard drive health -- prometheus_internal - scraping jobs health -- nginx -- dnsmasq (if the service is enabled) - -The following Loki logs scraper is setup automatically: -- systemd journal - -I intend to provide more options so that you could for example tweak data retention. - -Also, since all logs are now stored in Loki, you can probably reduce the systemd journal retention -time with: - -```nix -# See https://www.freedesktop.org/software/systemd/man/journald.conf.html#SystemMaxUse= -services.journald.extraConfig = '' -SystemMaxUse=2G -SystemKeepFree=4G -SystemMaxFileSize=100M -MaxFileSec=day -''; -``` +See [docs/blocks/monitoring.md](docs/blocks/monitoring.md). 
### Set up network tunnel with VPN and Proxy diff --git a/docs/assets/monitoring_grafana_alert_rules_5xx_1.png b/docs/assets/monitoring_grafana_alert_rules_5xx_1.png new file mode 100644 index 0000000..cadaa3b Binary files /dev/null and b/docs/assets/monitoring_grafana_alert_rules_5xx_1.png differ diff --git a/docs/assets/monitoring_grafana_alert_rules_5xx_2.png b/docs/assets/monitoring_grafana_alert_rules_5xx_2.png new file mode 100644 index 0000000..50f7fc1 Binary files /dev/null and b/docs/assets/monitoring_grafana_alert_rules_5xx_2.png differ diff --git a/docs/assets/monitoring_grafana_dashboards_Errors_1.png b/docs/assets/monitoring_grafana_dashboards_Errors_1.png new file mode 100644 index 0000000..6b436ae Binary files /dev/null and b/docs/assets/monitoring_grafana_dashboards_Errors_1.png differ diff --git a/docs/assets/monitoring_grafana_dashboards_Errors_2.png b/docs/assets/monitoring_grafana_dashboards_Errors_2.png new file mode 100644 index 0000000..e736ae4 Binary files /dev/null and b/docs/assets/monitoring_grafana_dashboards_Errors_2.png differ diff --git a/docs/assets/monitoring_grafana_dashboards_Performance_1.png b/docs/assets/monitoring_grafana_dashboards_Performance_1.png new file mode 100644 index 0000000..0cfcaeb Binary files /dev/null and b/docs/assets/monitoring_grafana_dashboards_Performance_1.png differ diff --git a/docs/assets/monitoring_grafana_dashboards_Performance_2.png b/docs/assets/monitoring_grafana_dashboards_Performance_2.png new file mode 100644 index 0000000..0004123 Binary files /dev/null and b/docs/assets/monitoring_grafana_dashboards_Performance_2.png differ diff --git a/docs/assets/monitoring_grafana_folder.png b/docs/assets/monitoring_grafana_folder.png new file mode 100644 index 0000000..1291399 Binary files /dev/null and b/docs/assets/monitoring_grafana_folder.png differ diff --git a/docs/blocks/monitoring.md b/docs/blocks/monitoring.md new file mode 100644 index 0000000..7a26ebb --- /dev/null +++ 
b/docs/blocks/monitoring.md @@ -0,0 +1,118 @@ +# Monitoring Block + +This block sets up the monitoring stack for Self Host Blocks. It is composed of: + +- Grafana as the dashboard frontend. +- Prometheus as the database for metrics. +- Loki as the database for logs. + +## Configuration + +```nix +shb.monitoring = { + enable = true; + subdomain = "grafana"; + inherit domain; + contactPoints = [ "me@example.com" ]; + adminPasswordFile = config.sops.secrets."monitoring/admin_password".path; + secretKeyFile = config.sops.secrets."monitoring/secret_key".path; +}; + +sops.secrets."monitoring/admin_password" = { + sopsFile = ./secrets.yaml; + mode = "0400"; + owner = "grafana"; + group = "grafana"; + restartUnits = [ "grafana.service" ]; +}; +sops.secrets."monitoring/secret_key" = { + sopsFile = ./secrets.yaml; + mode = "0400"; + owner = "grafana"; + group = "grafana"; + restartUnits = [ "grafana.service" ]; +}; +``` + +With that, Grafana, Prometheus, Loki and Promtail are set up! You can access `Grafana` at +`grafana.example.com` with user `admin` and the password stored in the file given by `adminPasswordFile`. 
+ +I recommend adding an SMTP server configuration so you receive alerts by email: + +```nix +shb.monitoring.smtp = { + from_address = "grafana@example.com"; + from_name = "Grafana"; + host = "smtp.mailgun.org"; + port = 587; + username = "postmaster@mg.example.com"; + passwordFile = config.sops.secrets."monitoring/smtp".path; +}; + +sops.secrets."monitoring/smtp" = { + sopsFile = ./secrets.yaml; + mode = "0400"; + owner = "grafana"; + group = "grafana"; + restartUnits = [ "grafana.service" ]; +}; +``` + +Since all logs are now stored in Loki, you can probably reduce the systemd journal retention +time with: + +```nix +# See https://www.freedesktop.org/software/systemd/man/journald.conf.html#SystemMaxUse= +services.journald.extraConfig = '' +SystemMaxUse=2G +SystemKeepFree=4G +SystemMaxFileSize=100M +MaxFileSec=day +''; +``` + +## Provisioning + +Self Host Blocks will automatically create the following resources: + +- For Grafana: + - datasources + - dashboards + - contact points + - notification policies + - alerts +- For Prometheus, the following exporters and related scrapers: + - node + - smartctl + - nginx +- For Loki, the following exporters and related scrapers: + - systemd + +Those resources are namespaced as appropriate under the Self Host Blocks namespace: + +![](../assets/monitoring_grafana_folder.png) + +## Errors Dashboard + +This dashboard is meant to be the first stop to understand why a service is misbehaving. + +![](../assets/monitoring_grafana_dashboards_Errors_1.png) +![](../assets/monitoring_grafana_dashboards_Errors_2.png) + +The yellow and red dashed vertical bars correspond to the [Requests Error Budget +Alert](#requests-error-budget-alert) firing. + +## Performance Dashboard + +This dashboard is meant to be the first stop to understand why a service is performing poorly. 
+ +![](../assets/monitoring_grafana_dashboards_Performance_1.png) +![](../assets/monitoring_grafana_dashboards_Performance_2.png) + +## Requests Error Budget Alert + +This alert will fire when the ratio between number of requests getting a 5XX response from a service +and the total requests to that service exceeds 1%. + +![](../assets/monitoring_grafana_alert_rules_5xx_1.png) +![](../assets/monitoring_grafana_alert_rules_5xx_2.png) diff --git a/flake.nix b/flake.nix index 8bb849a..895028d 100644 --- a/flake.nix +++ b/flake.nix @@ -49,6 +49,13 @@ flattenAttrs = root: attrset: pkgs.lib.attrsets.foldlAttrs (acc: name: value: acc // { "${root}_${name}" = value; }) {} attrset; + + vm_test = name: path: flattenAttrs "vm_${name}" ( + import path { + inherit pkgs; + inherit (pkgs) lib; + } + ); in (rec { all = mergeTests [ modules @@ -64,7 +71,8 @@ ]); }; } - // (flattenAttrs "vm_postgresql" (import ./test/vm/postgresql.nix {inherit pkgs; inherit (pkgs) lib;})) + // (vm_test "postgresql" ./test/vm/postgresql.nix) + // (vm_test "monitoring" ./test/vm/monitoring.nix) ); } ); diff --git a/modules/blocks/monitoring.nix b/modules/blocks/monitoring.nix index d268a78..abba7e4 100644 --- a/modules/blocks/monitoring.nix +++ b/modules/blocks/monitoring.nix @@ -1,4 +1,4 @@ -{ config, pkgs, lib, ... }: +{ config, options, pkgs, lib, ... 
}: let cfg = config.shb.monitoring; @@ -9,12 +9,6 @@ in options.shb.monitoring = { enable = lib.mkEnableOption "selfhostblocks.monitoring"; - # sopsFile = lib.mkOption { - # type = lib.types.path; - # description = "Sops file location"; - # example = "secrets/monitoring.yaml"; - # }; - subdomain = lib.mkOption { type = lib.types.str; description = "Subdomain under which home-assistant will be served."; @@ -27,15 +21,103 @@ in example = "mydomain.com"; }; + grafanaPort = lib.mkOption { + type = lib.types.port; + description = "Port where Grafana listens to HTTP requests."; + default = 3000; + }; + + prometheusPort = lib.mkOption { + type = lib.types.port; + description = "Port where Prometheus listens to HTTP requests."; + default = 3001; + }; + + lokiPort = lib.mkOption { + type = lib.types.port; + description = "Port where Loki listens to HTTP requests."; + default = 3002; + }; + debugLog = lib.mkOption { type = lib.types.bool; description = "Set to true to enable debug logging of the infrastructure serving Grafana."; default = false; example = true; }; + + orgId = lib.mkOption { + type = lib.types.int; + description = "Org ID where all self host blocks related config will be stored."; + default = 1; + }; + + provisionDashboards = lib.mkOption { + type = lib.types.bool; + description = "Provision Self Host Blocks dashboards under 'Self Host Blocks' folder."; + default = true; + }; + + contactPoints = lib.mkOption { + type = lib.types.listOf lib.types.str; + description = "List of email addresses to send alerts to"; + default = []; + }; + + adminPasswordFile = lib.mkOption { + type = lib.types.path; + description = "File containing the initial admin password."; + }; + + secretKeyFile = lib.mkOption { + type = lib.types.path; + description = "File containing the secret key used for signing."; + }; + + smtp = lib.mkOption { + default = null; + type = lib.types.nullOr (lib.types.submodule { + options = { + from_address = lib.mkOption { + type = lib.types.str; + 
description = "SMTP address from which the emails originate."; + example = "vaultwarden@mydomain.com"; + }; + from_name = lib.mkOption { + type = lib.types.str; + description = "SMTP name from which the emails originate."; + default = "Vaultwarden"; + }; + host = lib.mkOption { + type = lib.types.str; + description = "SMTP host to send the emails to."; + }; + port = lib.mkOption { + type = lib.types.port; + description = "SMTP port to send the emails to."; + default = 25; + }; + username = lib.mkOption { + type = lib.types.str; + description = "Username to connect to the SMTP host."; + }; + passwordFile = lib.mkOption { + type = lib.types.str; + description = "File containing the password to connect to the SMTP host."; + }; + }; + }); + }; }; config = lib.mkIf cfg.enable { + assertions = [ + { + assertion = (!(isNull cfg.smtp)) -> builtins.length cfg.contactPoints > 0; + message = "Must have at least one contact point for alerting"; + } + ]; + shb.postgresql.ensures = [ { username = "grafana"; @@ -57,19 +139,119 @@ in # password = "$__file{/run/secrets/homeassistant/dbpass}"; }; + security = { + secret_key = "$__file{${cfg.secretKeyFile}}"; + disable_initial_admin_creation = false; # Enable when LDAP support is configured. + admin_password = "$__file{${cfg.adminPasswordFile}}"; # Remove when LDAP support is configured. 
+ }; + server = { http_addr = "127.0.0.1"; - http_port = 3000; + http_port = cfg.grafanaPort; domain = fqdn; root_url = "https://${fqdn}"; router_logging = cfg.debugLog; }; + + smtp = lib.mkIf (!(isNull cfg.smtp)) { + enabled = true; + inherit (cfg.smtp) from_address from_name; + host = "${cfg.smtp.host}:${toString cfg.smtp.port}"; + user = cfg.smtp.username; + password = "$__file{${cfg.smtp.passwordFile}}"; + }; }; }; + services.grafana.provision = { + dashboards.settings = lib.mkIf cfg.provisionDashboards { + apiVersion = 1; + providers = [{ + folder = "Self Host Blocks"; + options.path = ./monitoring/dashboards; + allowUiUpdates = true; + disableDeletion = true; + }]; + }; + datasources.settings = { + apiVersion = 1; + datasources = [ + { + inherit (cfg) orgId; + name = "Prometheus"; + type = "prometheus"; + url = "http://127.0.0.1:${toString config.services.prometheus.port}"; + uid = "df80f9f5-97d7-4112-91d8-72f523a02b09"; + isDefault = true; + version = 1; + } + { + inherit (cfg) orgId; + name = "Loki"; + type = "loki"; + url = "http://127.0.0.1:${toString config.services.loki.configuration.server.http_listen_port}"; + uid = "cd6cc53e-840c-484d-85f7-96fede324006"; + version = 1; + } + ]; + deleteDatasources = [ + { + inherit (cfg) orgId; + name = "Prometheus"; + } + { + inherit (cfg) orgId; + name = "Loki"; + } + ]; + }; + alerting.contactPoints.settings = { + apiVersion = 1; + contactPoints = [{ + inherit (cfg) orgId; + name = "grafana-default-email"; + receivers = lib.optionals ((builtins.length cfg.contactPoints) > 0) [{ + uid = "sysadmin"; + type = "email"; + settings.addresses = lib.concatStringsSep ";" cfg.contactPoints; + }]; + }]; + }; + alerting.policies.settings = { + apiVersion = 1; + policies = [{ + inherit (cfg) orgId; + receiver = "grafana-default-email"; + group_by = [ "grafana_folder" "alertname" ]; + group_wait = "30s"; + group_interval = "5m"; + repeat_interval = "4h"; + }]; + # resetPolicies seems to happen after setting the above policies, 
effectively rolling back + # any updates. + }; + alerting.rules.settings = + let + rules = builtins.fromJSON (builtins.readFile ./monitoring/rules.json); + ruleIds = map (r: r.uid) rules; + in + { + apiVersion = 1; + groups = [{ + inherit (cfg) orgId; + name = "SysAdmin"; + folder = "Self Host Blocks"; + interval = "10m"; + inherit rules; + }]; + # deleteRules seems to happen after creating the above rules, effectively rolling back + # any updates. + }; + }; + services.prometheus = { enable = true; - port = 3001; + port = cfg.prometheusPort; }; services.loki = { @@ -78,7 +260,7 @@ in configuration = { auth_enabled = false; - server.http_listen_port = 3002; + server.http_listen_port = cfg.lokiPort; ingester = { lifecycler = { @@ -179,9 +361,9 @@ in enable = true; virtualHosts.${fqdn} = { - forceSSL = true; - sslCertificate = "/var/lib/acme/${cfg.domain}/cert.pem"; - sslCertificateKey = "/var/lib/acme/${cfg.domain}/key.pem"; + forceSSL = lib.mkIf config.shb.ssl.enable true; + sslCertificate = lib.mkIf config.shb.ssl.enable "/var/lib/acme/${cfg.domain}/cert.pem"; + sslCertificateKey = lib.mkIf config.shb.ssl.enable "/var/lib/acme/${cfg.domain}/key.pem"; locations."/" = { proxyPass = "http://${toString config.services.grafana.settings.server.http_addr}:${toString config.services.grafana.settings.server.http_port}"; proxyWebsockets = true; @@ -278,14 +460,5 @@ in listenAddress = "127.0.0.1"; }; services.nginx.statusPage = lib.mkDefault config.services.nginx.enable; - - # sops.secrets."grafana" = { - # inherit (cfg) sopsFile; - # mode = "0440"; - # owner = "grafana"; - # group = "grafana"; - # # path = "${config.services.home-assistant.configDir}/secrets.yaml"; - # restartUnits = [ "grafana.service" ]; - # }; }; } diff --git a/modules/blocks/monitoring/dashboards/Errors.json b/modules/blocks/monitoring/dashboards/Errors.json new file mode 100644 index 0000000..c134a34 --- /dev/null +++ b/modules/blocks/monitoring/dashboards/Errors.json @@ -0,0 +1,782 @@ +{ + 
"annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 8, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-red", + "value": null + }, + { + "color": "transparent", + "value": 0.99 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 12, + "links": [ + { + "title": "explore", + "url": 
"https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "sum by(server_name) (rate({unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | server_name =~ \"[[server_name]].*\" [$__auto]))", + "legendFormat": "{{server_name}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Rate of Requests", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "body_bytes_sent": 9, + "bytes_sent": 8, + "gzip_ration": 11, + "post": 12, + "referrer": 10, + "remote_addr": 3, + "remote_user": 6, + "request": 4, + "request_length": 7, + "request_time": 15, + "server_name": 2, + "status": 1, + "time_local": 0, + "upstream_addr": 13, + "upstream_connect_time": 17, + "upstream_header_time": 18, + "upstream_response_time": 16, + "upstream_status": 14, + "user_agent": 5 + }, + 
"renameByName": {} + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-red", + "value": null + }, + { + "color": "transparent", + "value": 0.99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 9, + "links": [ + { + "title": "explore", + "url": "https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "min" + ], + "displayMode": "table", + 
"placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "(sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | status =~ \"[1234]..\" | server_name =~ \"[[server_name]].*\" [7d])) / sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | server_name =~ \"[[server_name]].*\" [7d])))", + "legendFormat": "{{server_name}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "5XX Requests Error Budgets", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "body_bytes_sent": 9, + "bytes_sent": 8, + "gzip_ration": 11, + "post": 12, + "referrer": 10, + "remote_addr": 3, + "remote_user": 6, + "request": 4, + "request_length": 7, + "request_time": 15, + "server_name": 2, + "status": 1, + "time_local": 0, + "upstream_addr": 13, + "upstream_connect_time": 17, + "upstream_header_time": 18, + "upstream_response_time": 16, + "upstream_status": 14, + "user_agent": 5 + }, + "renameByName": {} + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "light-red", + "value": null + }, + { + "color": "transparent", + "value": 0.99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 10, + "links": [ + { + "title": "explore", + "url": "https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "(sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | status =~ \"[1235]..\" | server_name =~ \"[[server_name]].*\" [7d])) / sum by(server_name) 
(count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | server_name =~ \"[[server_name]].*\" [7d])))", + "legendFormat": "{{server_name}}", + "queryType": "range", + "refId": "A" + } + ], + "title": "4XX Requests Error Budgets", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "body_bytes_sent": 9, + "bytes_sent": 8, + "gzip_ration": 11, + "post": 12, + "referrer": 10, + "remote_addr": 3, + "remote_user": 6, + "request": 4, + "request_length": 7, + "request_time": 15, + "server_name": 2, + "status": 1, + "time_local": 0, + "upstream_addr": 13, + "upstream_connect_time": 17, + "upstream_header_time": 18, + "upstream_response_time": 16, + "upstream_status": 14, + "user_agent": 5 + }, + "renameByName": {} + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 8, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=~\"[[service]].*\"}", + "queryType": "range", + "refId": "A" + } + ], + "title": "Log Errors", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + 
"steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 7, + "links": [ + { + "title": "explore", + "url": "https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | status =~ \"5..\" | server_name =~ \"[[server_name]].*\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "5XX Requests Errors", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "body_bytes_sent": 9, + "bytes_sent": 8, + "gzip_ration": 11, + "post": 12, + "referrer": 10, + "remote_addr": 3, + "remote_user": 6, + "request": 4, + "request_length": 7, + "request_time": 15, + "server_name": 2, + "status": 1, + 
"time_local": 0, + "upstream_addr": 13, + "upstream_connect_time": 17, + "upstream_header_time": 18, + "upstream_response_time": 16, + "upstream_status": 14, + "user_agent": 5 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 11, + "links": [ + { + "title": "explore", + "url": "https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=\"nginx.service\"} | pattern \"<_> <_> \" | line_format \"{{.line}}\" | json | __error__ != 
\"JSONParserErr\" | status =~ \"4..\" | server_name =~ \"[[server_name]].*\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "4XX Requests Errors", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "body_bytes_sent": 9, + "bytes_sent": 8, + "gzip_ration": 11, + "post": 12, + "referrer": 10, + "remote_addr": 3, + "remote_user": 6, + "request": 4, + "request_length": 7, + "request_time": 15, + "server_name": 2, + "status": 1, + "time_local": 0, + "upstream_addr": 13, + "upstream_connect_time": 17, + "upstream_header_time": 18, + "upstream_response_time": 16, + "upstream_status": 14, + "user_agent": 5 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": [ + "jellyfin" + ], + "value": [ + "jellyfin" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "definition": "query_result(max by (name) (node_systemd_unit_state))", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "service", + "options": [], + "query": { + "qryType": 3, + "query": "query_result(max by (name) (node_systemd_unit_state))", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "/name=\"(?.*)\\.service\"/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "hide": 0, + "name": "server_name", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": ".+", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Errors", + "uid": "d66242cf-71e8-417c-8ef7-51b0741545df", + "version": 16, + 
"weekStart": "" +} diff --git a/modules/blocks/monitoring/dashboards/Performance.json b/modules/blocks/monitoring/dashboards/Performance.json new file mode 100644 index 0000000..945afe9 --- /dev/null +++ b/modules/blocks/monitoring/dashboards/Performance.json @@ -0,0 +1,672 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 6, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 12, + "title": "Node", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 14, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + 
"mode": "multi", + "sort": "none" + } + }, + "repeat": "cpu", + "repeatDirection": "h", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "editorMode": "code", + "expr": "rate(node_cpu_seconds_total{mode!=\"idle\",cpu=\"$cpu\"}[2m])", + "instant": false, + "legendFormat": "{{mode}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU $cpu", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "editorMode": "code", + "expr": "rate(process_cpu_seconds_total[2m])", + "instant": false, + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU per process", + "type": "timeseries" + 
}, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 4, + "panels": [], + "title": "Network Requests", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 3, + "links": [ + { + "title": "explore", + "url": "https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | status =~ \"[12]..\" | __error__ != \"JSONParserErr\" | request_time > 500", + 
"queryType": "range", + "refId": "A" + } + ], + "title": "Network Requests Above 500ms", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "Line" + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "points", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "max", + "mean", + "variance" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | request_time > 500", + "legendFormat": "", + "queryType": "range", + "refId": "A" + } + ], + "title": "Slow Queries", + "transformations": [ + { + "id": "extractFields", + "options": { + "keepTime": true, + "replace": true, + "source": "labels" + 
} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "body_bytes_sent": true, + "bytes_sent": true, + "gzip_ration": true, + "job": true, + "line": true, + "post": true, + "referrer": true, + "remote_addr": true, + "remote_user": true, + "request": true, + "request_length": true, + "status": true, + "time_local": true, + "unit": true, + "upstream_addr": true, + "upstream_connect_time": true, + "upstream_header_time": true, + "upstream_response_time": true, + "upstream_status": true, + "user_agent": true + }, + "indexByName": {}, + "renameByName": {} + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "dateFormat": "", + "destinationType": "number", + "targetField": "request_time" + } + ], + "fields": {} + } + }, + { + "id": "partitionByValues", + "options": { + "fields": [ + "server_name" + ] + } + }, + { + "id": "renameByRegex", + "options": { + "regex": "request_time (.*)", + "renamePattern": "$1" + } + } + ], + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 7, + "panels": [], + "title": "Databases", + "type": "row" + }, + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "duration_ms" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "unit" + }, + "properties": [ + { + "id": "custom.width", + "value": 150 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 6, + "links": [ + { + "title": "explore", + "url": 
"https://grafana.tiserbox.com/explore?panes=%7B%22HWt%22:%7B%22datasource%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22expr%22:%22%7Bunit%3D%5C%22nginx.service%5C%22%7D%20%7C%20pattern%20%5C%22%3C_%3E%20%3C_%3E%20%3Cline%3E%5C%22%20%7C%20line_format%20%5C%22%7B%7B.line%7D%7D%5C%22%20%7C%20json%20%7C%20status%20%21~%20%5C%222..%5C%22%20%7C%20__error__%20%21%3D%20%5C%22JSONParserErr%5C%22%22,%22queryType%22:%22range%22,%22datasource%22:%7B%22type%22:%22loki%22,%22uid%22:%22cd6cc53e-840c-484d-85f7-96fede324006%22%7D,%22editorMode%22:%22code%22%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1" + } + ], + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "{unit=\"postgresql.service\"} | regexp \".*duration: (?P<duration_ms>[0-9.]+) ms (?P<statement>.*)\" | duration_ms > 500 | __error__ != \"LabelFilterErr\"", + "queryType": "range", + "refId": "A" + } + ], + "title": "Slow DB Queries", + "transformations": [ + { + "id": "extractFields", + "options": { + "replace": true, + "source": "labels" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "job": true + }, + "indexByName": { + "duration_ms": 0, + "job": 1, + "statement": 3, + "unit": 2 + }, + "renameByName": {} + } + } + ], + "type": "table" + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "0", + "value": "0" + }, + "datasource": { + "type": "prometheus", + "uid": "df80f9f5-97d7-4112-91d8-72f523a02b09" + }, + "definition": "label_values(node_cpu_seconds_total,cpu)", + "hide": 2, + "includeAll": true, + "multi": false, + "name": "cpu", + "options": [], + "query": 
{ + "qryType": 1, + "query": "label_values(node_cpu_seconds_total,cpu)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 3, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Performance", + "uid": "e01156bf-cdba-42eb-9845-a401dd634d41", + "version": 25, + "weekStart": "" +} diff --git a/modules/blocks/monitoring/rules.json b/modules/blocks/monitoring/rules.json new file mode 100644 index 0000000..6519d5f --- /dev/null +++ b/modules/blocks/monitoring/rules.json @@ -0,0 +1,131 @@ +[ + { + "uid": "f5246fa3-163f-4eae-9e1d-5b0fe2af0509", + "title": "5XX Requests Error Budgets Under 99%", + "condition": "threshold", + "data": [ + { + "refId": "A", + "queryType": "range", + "relativeTimeRange": { + "from": 21600, + "to": 0 + }, + "datasourceUid": "cd6cc53e-840c-484d-85f7-96fede324006", + "model": { + "datasource": { + "type": "loki", + "uid": "cd6cc53e-840c-484d-85f7-96fede324006" + }, + "editorMode": "code", + "expr": "(sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | status =~ \"[1234]..\" | server_name =~ \".*\" [7d])) / sum by(server_name) (count_over_time({unit=\"nginx.service\"} | pattern \"<_> <_> <line>\" | line_format \"{{.line}}\" | json | __error__ != \"JSONParserErr\" | server_name =~ \".*\" [7d])))", + "intervalMs": 1000, + "legendFormat": "{{server_name}}", + "maxDataPoints": 43200, + "queryType": "range", + "refId": "A" + } + }, + { + "refId": "last", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "B" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + 
"type": "__expr__", + "uid": "__expr__" + }, + "expression": "A", + "intervalMs": 1000, + "maxDataPoints": 43200, + "reducer": "last", + "refId": "last", + "type": "reduce" + } + }, + { + "refId": "threshold", + "relativeTimeRange": { + "from": 0, + "to": 0 + }, + "datasourceUid": "__expr__", + "model": { + "conditions": [ + { + "evaluator": { + "params": [ + 0.99 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C" + ] + }, + "reducer": { + "params": [], + "type": "last" + }, + "type": "query" + } + ], + "datasource": { + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "last", + "intervalMs": 1000, + "maxDataPoints": 43200, + "refId": "threshold", + "type": "threshold" + } + } + ], + "dashboardUid": "d66242cf-71e8-417c-8ef7-51b0741545df", + "panelId": 9, + "noDataState": "OK", + "execErrState": "Error", + "for": "20m", + "annotations": { + "__dashboardUid__": "d66242cf-71e8-417c-8ef7-51b0741545df", + "__panelId__": "9", + "description": "", + "runbook_url": "", + "summary": "The error budget for a service for the last 7 days is under 99%" + }, + "labels": { + "": "", + "role": "sysadmin" + }, + "isPaused": false + } +] diff --git a/test/vm/monitoring.nix b/test/vm/monitoring.nix new file mode 100644 index 0000000..12723ce --- /dev/null +++ b/test/vm/monitoring.nix @@ -0,0 +1,42 @@ +{ pkgs, lib, ... }: +{ + # This test, although simple, makes sure all provisioning went fine. + auth = pkgs.nixosTest { + name = "monitoring-basic"; + + nodes.machine = { config, pkgs, ... }: { + imports = [ + { + options = { + shb.ssl.enable = lib.mkEnableOption "ssl"; + }; + } + ../../modules/blocks/postgresql.nix + ../../modules/blocks/monitoring.nix + ]; + + shb.monitoring = { + enable = true; + subdomain = "grafana"; + domain = "example.com"; + grafanaPort = 3000; + adminPasswordFile = pkgs.writeText "admin_password" "securepw"; + secretKeyFile = pkgs.writeText "secret_key" "secret_key"; + }; + }; + + testScript = { nodes, ... 
}: ''
+    # Boot the VM and block until systemd reports the Grafana unit as active.
+    start_all()
+    machine.wait_for_unit("grafana.service")
+    # An active unit does not guarantee the HTTP listener is already bound
+    # (Grafana may still be initialising). Wait for the port explicitly so the
+    # short retry budget of the first request below is not spent on startup.
+    machine.wait_for_open_port(3000)
+
+    def curl_req(password, wantStatus, endpoint):
+        """Call the Grafana HTTP API as user admin and check the status code.
+
+        password:   basic-auth password sent for the admin user.
+        wantStatus: HTTP status code the response is expected to carry.
+        endpoint:   path appended to http://localhost:3000.
+
+        Returns the raw curl output (curl -i puts the status line and headers
+        before the body). Raises Exception when the status line does not start
+        with "HTTP/1.1 <wantStatus>".
+        """
+        # curl is run without --fail, so an HTTP error status such as 401 still
+        # exits 0; the status is checked from the output instead.
+        response = machine.wait_until_succeeds("curl -i http://admin:{password}@localhost:3000{endpoint}".format(password=password, endpoint=endpoint), timeout=10)
+        if not response.startswith("HTTP/1.1 {wantStatus}".format(wantStatus=wantStatus)):
+            # Characters 9..11 of "HTTP/1.1 NNN ..." are the status code.
+            raise Exception("Wrong status, expected {}, got {}".format(wantStatus, response[9:12]))
+        return response
+
+    # The admin password configured above must be accepted, any other rejected.
+    curl_req("securepw", 200, "/api/org")
+    curl_req("wrong", 401, "/api/org")
+  '';
+  };
+}