{ config, lib, pkgs, namespace, ... }:
with lib;
let
  name = "grafana";
  cfg = config.${namespace}.services.${name};

  # ---------------------------------------------------------------------------
  # Community dashboards — fetched at build time, pinned by hash.
  #
  # Community dashboards use __inputs with a template variable (e.g.
  # ${DS_PROM} or ${DS_PROMETHEUS}) for the datasource UID. When provisioned
  # via file Grafana never substitutes those, so every panel is datasource-
  # broken. We patch each file at eval time with pure Nix:
  #   1. Parse the JSON with builtins.fromJSON
  #   2. Strip __inputs/__requires so Grafana doesn't treat the file as an
  #      import
  #   3. Replace the datasource UID template variable with our fixed UID
  #      using builtins.replaceStrings on the re-serialised JSON string
  #   4. Write the result to the store with pkgs.writeText
  #
  # NOTE(review): the fetchurl URLs use "revisions/latest/download". The
  # fixed output hash is what actually pins the content — consider pinning
  # an explicit revision number in the URL so a future hash refresh doesn't
  # silently pull a different dashboard. TODO confirm.
  # ---------------------------------------------------------------------------

  # Remove the import-only metadata attributes. Their presence makes Grafana
  # treat the JSON as an "import" template rather than a provisionable
  # dashboard.
  stripImportMeta = dashboard: builtins.removeAttrs dashboard [ "__inputs" "__requires" ];

  # Replace every occurrence of the datasource template variable
  # (e.g. "${DS_PROMETHEUS}") in a serialised dashboard with our fixed
  # datasource UID "prometheus". The needle is built by concatenation so the
  # Nix source never contains a literal dollar-brace sequence that Nix would
  # try to interpolate.
  substituteDatasource = dsVar: jsonStr:
    builtins.replaceStrings [ ("\${" + dsVar + "}") ] [ "prometheus" ] jsonStr;

  # Patch a community Grafana dashboard JSON at eval time.
  #   fileName — store file name for the patched dashboard
  #   src      — fetched dashboard JSON (reading it forces the fetch at eval
  #              time)
  #   dsVar    — name of the __inputs datasource variable to substitute
  # Parameter renamed from `name` to `fileName` so it no longer shadows the
  # module-level `name = "grafana"` binding above.
  patchDashboard = fileName: src: dsVar:
    let
      parsed = builtins.fromJSON (builtins.readFile src);
      # Strip import metadata, then re-serialise for the string substitution.
      stripped = builtins.toJSON (stripImportMeta parsed);
    in
    pkgs.writeText fileName (substituteDatasource dsVar stripped);

  communityDashboards = pkgs.linkFarm "grafana-community-dashboards" [
    {
      # Node Exporter Full — https://grafana.com/grafana/dashboards/1860
      # Uses ${ds_prometheus} (lowercase)
      name = "node-exporter-full.json";
      path = patchDashboard "node-exporter-full.json" (pkgs.fetchurl {
        url = "https://grafana.com/api/dashboards/1860/revisions/latest/download";
        sha256 = "sha256-IeQ72CZhtckDEihcVLhAFuSs77uWsZSENsdomSrWTHo=";
      }) "ds_prometheus";
    }
    {
      # PostgreSQL Database — https://grafana.com/grafana/dashboards/9628
      name = "postgresql.json";
      path = patchDashboard "postgresql.json" (pkgs.fetchurl {
        url = "https://grafana.com/api/dashboards/9628/revisions/latest/download";
        sha256 = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc=";
      }) "DS_PROMETHEUS";
    }
    {
      # Redis Dashboard for prometheus-redis-exporter 1.x —
      # https://grafana.com/grafana/dashboards/763
      # Uses DS_PROM; additionally drops the 'namespace' template variable
      # (our metrics have no namespace label) and rewrites the 'instance'
      # variable to query redis_up directly — all in pure Nix, sharing the
      # stripImportMeta/substituteDatasource helpers above.
      name = "redis.json";
      path =
        let
          src = pkgs.fetchurl {
            url = "https://grafana.com/api/dashboards/763/revisions/latest/download";
            sha256 = "sha256-pThz+zHjcTT9vf8fpUuZK/ejNnH9GwEZVXOY27c9Aw8=";
          };
          dashboard = stripImportMeta (builtins.fromJSON (builtins.readFile src));
          # Point the 'instance' variable at a direct label_values query.
          fixInstanceVar = v:
            if v.name == "instance" then
              v // {
                query = "label_values(redis_up, instance)";
                definition = "label_values(redis_up, instance)";
              }
            else
              v;
          fixedTemplating = dashboard // {
            templating = dashboard.templating // {
              list = map fixInstanceVar
                (builtins.filter (v: v.name != "namespace") dashboard.templating.list);
            };
          };
        in
        pkgs.writeText "redis.json"
          (substituteDatasource "DS_PROM" (builtins.toJSON fixedTemplating));
    }
    {
      # MySQL Overview — https://grafana.com/grafana/dashboards/7362
      name = "mysql.json";
      path = patchDashboard "mysql.json" (pkgs.fetchurl {
        url = "https://grafana.com/api/dashboards/7362/revisions/latest/download";
        sha256 = "sha256-WW7g60KY20XAdyUpumA0hBrjFC9MQGuGjiJKUhSVBXI=";
      }) "DS_PROMETHEUS";
    }
    {
      # Nextcloud — https://grafana.com/grafana/dashboards/9632
      name = "nextcloud.json";
      path = patchDashboard "nextcloud.json" (pkgs.fetchurl {
        url = "https://grafana.com/api/dashboards/9632/revisions/latest/download";
        sha256 = "sha256-Z28Q/sMg3jxglkszAs83IpL8f4p9loNnTQzjc3S/SAQ=";
      }) "DS_PROMETHEUS";
    }
  ];

  # ---------------------------------------------------------------------------
  # Custom dashboards — maintained in this repo under dashboards/
  # ---------------------------------------------------------------------------
  customDashboards = pkgs.linkFarm "grafana-custom-dashboards" [
    { name = "nut.json"; path = ./dashboards/nut.json; }
    { name = "caddy.json"; path = ./dashboards/caddy.json; }
    { name = "gitea.json"; path = ./dashboards/gitea.json; }
    { name = "nas-overview.json"; path = ./dashboards/nas-overview.json; }
  ];

  # Minimal .my.cnf for the mysqld exporter. No credentials are needed
  # because runAsLocalSuperUser = true runs as the mysql OS user, which
  # MariaDB authenticates via the unix_socket plugin automatically.
mysqldExporterCnf = pkgs.writeText "prometheus-mysqld-exporter.cnf" ''
    [client]
    user=root
    socket=/run/mysqld/mysqld.sock
  '';

  # Ports of sibling services in this namespace, used as scrape targets below.
  giteaPort = config.${namespace}.services.gitea.port;
  resticPort = config.${namespace}.services.restic-server.port;
  nextcloudPort = config.${namespace}.services.nextcloud.port;

  # The whole monitoring stack (Prometheus + exporters + Grafana), wrapped by
  # the project-local mkModule helper.
  grafanaConfig = lib.${namespace}.mkModule {
    inherit config name;
    description = "grafana";
    options = { };
    moduleConfig = {
      services = {
        prometheus = {
          enable = true;
          # bearer_token_file paths (e.g. Gitea metrics key) are SOPS secrets
          # that only exist at runtime, not in the Nix build sandbox.
          # "syntax-only" still catches config errors without stat-ing the files.
          checkConfig = "syntax-only";
          exporters = {
            node = {
              enable = true;
              enabledCollectors = [ "filesystem" "diskstats" "meminfo" "cpu" "systemd" "processes" ];
              # Skip pseudo-filesystems so disk metrics only cover real mounts.
              extraFlags = [ "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run)($|/)" ];
            };
            # NOTE(review): openFirewall has no effect while enable = false —
            # confirm whether libvirt monitoring is meant to come back.
            libvirt = { enable = false; openFirewall = true; };
            nut = {
              enable = true;
              openFirewall = true;
              passwordPath = config.sops.secrets."jallen-nas/ups_password".path;
              nutUser = upsUser;
            };
            # PostgreSQL — runs as the local postgres superuser via peer auth
            # (Unix socket, no password required).
            postgres = {
              enable = true;
              runAsLocalSuperUser = true;
            };
            # Redis — single exporter instance covering all four Redis servers
            # via the multi-target scrape pattern (/scrape?target=).
            # The exporter needs AF_INET to reach TCP Redis instances (granted
            # via RestrictAddressFamilies below).
            redis = {
              enable = true;
              # No fixed --redis.addr: multi-target mode uses ?target= param.
            };
            # MariaDB — runs as the mysql OS user so it can connect via the
            # Unix socket without a password (unix_socket auth).
            mysqld = {
              enable = true;
              runAsLocalSuperUser = true;
              configFile = mysqldExporterCnf;
            };
            # Nextcloud — authenticates with the admin account.
            # passwordFile must be readable by the prometheus-nextcloud-exporter
            # user; sops mode 0440 + group keys covers that.
            nextcloud = {
              enable = true;
              url = "http://localhost:${toString nextcloudPort}";
              username = "mjallen";
              passwordFile = config.sops.secrets."jallen-nas/nextcloud/adminpassword".path;
            };
          };
          scrapeConfigs = [
            # ── System ──────────────────────────────────────────────────────
            {
              job_name = "node";
              static_configs = [
                { targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ]; }
              ];
            }
            # ── UPS (NUT) ───────────────────────────────────────────────────
            {
              job_name = "nut";
              # DRuggeri's nut_exporter serves UPS metrics at /ups_metrics,
              # not /metrics.
              metrics_path = "/ups_metrics";
              static_configs = [
                { targets = [ "localhost:${toString config.services.prometheus.exporters.nut.port}" ]; }
              ];
            }
            # ── Databases ───────────────────────────────────────────────────
            {
              job_name = "postgres";
              static_configs = [
                { targets = [ "localhost:${toString config.services.prometheus.exporters.postgres.port}" ]; }
              ];
            }
            {
              # Redis multi-target: one exporter, four Redis instances.
              # The redis_exporter's /scrape?target= endpoint proxies each
              # target so a single exporter process covers all servers.
              job_name = "redis";
              metrics_path = "/scrape";
              static_configs = [
                {
                  targets = [
                    "redis://localhost:6379" # authentik
                    "redis://localhost:6363" # ccache
                    "redis://localhost:6380" # manyfold
                    "redis://localhost:6381" # onlyoffice
                  ];
                }
              ];
              # Standard multi-target relabelling: the listed redis:// URL
              # becomes the ?target= parameter and the instance label, and the
              # actual scrape address is rewritten to the exporter itself.
              relabel_configs = [
                { source_labels = [ "__address__" ]; target_label = "__param_target"; }
                { source_labels = [ "__param_target" ]; target_label = "instance"; }
                {
                  target_label = "__address__";
                  replacement = "localhost:${toString config.services.prometheus.exporters.redis.port}";
                }
              ];
            }
            {
              job_name = "mysqld";
              static_configs = [
                { targets = [ "localhost:${toString config.services.prometheus.exporters.mysqld.port}" ]; }
              ];
            }
            # ── Application services ────────────────────────────────────────
            {
              # Caddy exposes its built-in Prometheus endpoint on port 2019.
              job_name = "caddy";
              static_configs = [ { targets = [ "localhost:2019" ]; } ];
            }
            {
              # Gitea's /metrics endpoint is protected by a Bearer token.
              job_name = "gitea";
              metrics_path = "/metrics";
              bearer_token_file = config.sops.secrets."jallen-nas/gitea/metrics-key".path;
              static_configs = [ { targets = [ "localhost:${toString giteaPort}" ]; } ];
            }
            {
              # restic REST server exposes Prometheus metrics at /metrics.
              job_name = "restic";
              metrics_path = "/metrics";
              static_configs = [ { targets = [ "localhost:${toString resticPort}" ]; } ];
            }
            {
              job_name = "nextcloud";
              static_configs = [
                { targets = [ "localhost:${toString config.services.prometheus.exporters.nextcloud.port}" ]; }
              ];
            }
          ];
        };
        grafana = {
          enable = true;
          settings = {
            server = {
              http_port = cfg.port;
              http_addr = "0.0.0.0";
            };
            security = {
              # Read the secret key from a SOPS-managed file at runtime so it
              # never appears in the Nix store. The "$__file{}" syntax is
              # Grafana's built-in file provider.
              secret_key = "$__file{${config.sops.secrets."jallen-nas/grafana/secret-key".path}}";
            };
            # Grafana 12 enables kubernetesDashboards by default, which uses a
            # new storage backend that validates datasource refs in dashboard
            # files concurrently with datasource provisioning, causing a race
            # that always fails on a clean install. Disable it to use the
            # classic file provisioner that tolerates missing datasource refs.
            "feature_toggles" = {
              kubernetesDashboards = false;
            };
            # Grafana 12 introduced permitted_provisioning_paths as a security
            # allowlist. The NixOS module stores all provisioning files in the
            # Nix store, which is not in the default allowlist, causing the
            # provisioner to silently refuse to load any files and then error
            # with "data source not found".
            paths.permitted_provisioning_paths = "/nix/store";
          };
          dataDir = "${cfg.configDir}/grafana";
          provision = {
            enable = true;
            # Use path instead of settings to avoid the NixOS serializer
            # writing `secureJsonData: null` which Grafana 12 chokes on.
            datasources.path = pkgs.writeTextDir "datasource.yaml" ''
              apiVersion: 1
              datasources:
                - name: Prometheus
                  uid: prometheus
                  type: prometheus
                  access: proxy
                  orgId: 1
                  url: http://localhost:${toString config.services.prometheus.port}
                  editable: false
                  jsonData:
                    httpMethod: POST
                    timeInterval: 15s
            '';
            # -----------------------------------------------------------------
            # Alerting provisioning
            # -----------------------------------------------------------------
            # TEMPORARILY DISABLED - template format incompatible with Grafana 12
            /*
            alerting = {
              # ── Contact points ──────────────────────────────────────────────
              # ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
              # body; ntfy accepts any body as the message text. We use the
              # message template below to format it nicely.
              #
              # Credentials are injected via Grafana's $__env{} provider, which
              # reads from the process environment. The GRAFANA_NTFY_USER and
              # GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
              # grafana.env EnvironmentFile on the grafana.service unit.
              #
              # Note: $__file{} only works in grafana.ini settings, not in
              # provisioning YAML files — using it here causes a parse error.
              contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
                apiVersion: 1
                contactPoints:
                  - name: ntfy
                    receivers:
                      - uid: ntfy-webhook
                        type: webhook
                        disableResolveMessage: false
                        settings:
                          url: https://ntfy.mjallen.dev/grafana-alerts
                          httpMethod: POST
                          username: $__env{GRAFANA_NTFY_USER}
                          password: $__env{GRAFANA_NTFY_PASSWORD}
                          httpHeaders:
                            Tags: "chart,bell"
              '';
              # ── Notification message template ───────────────────────────────
              # Grafana sends the rendered template body as the POST body.
              # ntfy treats the body as the message text.
              templates.settings = {
                apiVersion = 1;
                templates = [
                  {
                    name = "ntfy_message";
                    template = ''
                      {{ define "ntfy_message" -}}
                      {{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
                      {{ range .Alerts -}}
                      Status: {{ .Status | title }}
                      Alert: {{ .Labels.alertname }}
                      Severity: {{ .Labels.severity | default "unknown" }}
                      Instance: {{ .Labels.instance | default "unknown" }}
                      {{ if .Annotations.description -}}
                      Details: {{ .Annotations.description }}
                      {{ end -}}
                      {{ end -}}
                      {{ end }}
                    '';
                  }
                ];
              };
              # ── Notification routing policy ─────────────────────────────────
              policies.settings = {
                apiVersion = 1;
                policies = [
                  {
                    receiver = "ntfy";
                    group_by = [ "alertname" "severity" ];
                    group_wait = "30s";
                    group_interval = "5m";
                    repeat_interval = "4h";
                    routes = [
                      # Critical alerts: repeat every 1h, no grouping wait
                      {
                        receiver = "ntfy";
                        matchers = [ "severity = critical" ];
                        group_wait = "0s";
                        repeat_interval = "1h";
                      }
                    ];
                  }
                ];
              };
              # ── Alert rules ─────────────────────────────────────────────────
              rules.settings = {
                apiVersion = 1;
                groups = [
                  {
                    name = "nas-system";
                    folder = "NAS Alerts";
                    interval = "1m";
                    rules = [
                      # Disk usage > 85% warning, > 95% critical
                      {
                        uid = "nas-disk-warning";
                        title = "Disk usage high";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = ''
                                ( node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
                                  - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"} )
                                / node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"} * 100
                              '';
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              conditions = [ { evaluator = { type = "gt"; params = [ 85 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "NoData";
                        execErrState = "Error";
                        for = "5m";
                        annotations = {
                          summary = "Disk usage above 85%";
                          description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
                        };
                        labels = { severity = "warning"; };
                        isPaused = false;
                      }
                      # Memory usage > 90%
                      {
                        uid = "nas-memory-high";
                        title = "Memory usage high";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = ''
                                (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
                              '';
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              conditions = [ { evaluator = { type = "gt"; params = [ 90 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "NoData";
                        execErrState = "Error";
                        for = "5m";
                        annotations = {
                          summary = "Memory usage above 90%";
                          description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
                        };
                        labels = { severity = "warning"; };
                        isPaused = false;
                      }
                      # CPU > 90% sustained for 10m
                      {
                        uid = "nas-cpu-high";
                        title = "CPU usage sustained high";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = ''
                                100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
                              '';
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              conditions = [ { evaluator = { type = "gt"; params = [ 90 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "NoData";
                        execErrState = "Error";
                        for = "10m";
                        annotations = {
                          summary = "CPU sustained above 90%";
                          description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
                        };
                        labels = { severity = "warning"; };
                        isPaused = false;
                      }
                      # UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
                      {
                        uid = "nas-ups-onbatt";
                        title = "UPS on battery";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = "network_ups_tools_ups_status";
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              # status 0 = OB (on battery), 1 = OL (online)
                              conditions = [ { evaluator = { type = "lt"; params = [ 1 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "NoData";
                        execErrState = "Error";
                        for = "1m";
                        annotations = {
                          summary = "UPS is running on battery";
                          description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
                        };
                        labels = { severity = "critical"; };
                        isPaused = false;
                      }
                      # UPS battery charge < 30%
                      {
                        uid = "nas-ups-lowbatt";
                        title = "UPS battery low";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = "network_ups_tools_battery_charge";
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              conditions = [ { evaluator = { type = "lt"; params = [ 30 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "NoData";
                        execErrState = "Error";
                        for = "2m";
                        annotations = {
                          summary = "UPS battery charge below 30%";
                          description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
                        };
                        labels = { severity = "critical"; };
                        isPaused = false;
                      }
                      # PostgreSQL not responding
                      {
                        uid = "nas-postgres-down";
                        title = "PostgreSQL down";
                        condition = "C";
                        data = [
                          {
                            refId = "A";
                            datasourceUid = "prometheus";
                            model = {
                              expr = "pg_up";
                              intervalMs = 60000;
                              maxDataPoints = 43200;
                              refId = "A";
                            };
                          }
                          {
                            refId = "B";
                            datasourceUid = "__expr__";
                            model = { type = "reduce"; refId = "B"; expression = "A"; reducer = "last"; };
                          }
                          {
                            refId = "C";
                            datasourceUid = "__expr__";
                            model = {
                              type = "threshold";
                              refId = "C";
                              expression = "B";
                              conditions = [ { evaluator = { type = "lt"; params = [ 1 ]; }; } ];
                            };
                          }
                        ];
                        noDataState = "Alerting";
                        execErrState = "Error";
                        for = "2m";
                        annotations = {
                          summary = "PostgreSQL is down";
                          description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
                        };
                        labels = { severity = "critical"; };
                        isPaused = false;
                      }
                    ];
                  }
                ];
              };
              muteTimings.settings = {
                apiVersion = 1;
                muteTimes = [ ];
              };
            };
            */
            # File-provisioned dashboard folders: community (patched upstream
            # JSON) and custom (repo-local JSON), both read-only in the UI.
            dashboards.settings.providers = [
              {
                name = "community";
                orgId = 1;
                type = "file";
                disableDeletion = true;
                updateIntervalSeconds = 60;
                allowUiUpdates = false;
                options.path = communityDashboards;
              }
              {
                name = "custom";
                orgId = 1;
                type = "file";
                disableDeletion = true;
                updateIntervalSeconds = 60;
                allowUiUpdates = false;
                options.path = customDashboards;
              }
            ];
          };
        };
      };
      # Inject ntfy credentials into Grafana's environment so the $__env{}
      # provider in contactPoints.yaml can resolve them at runtime.
      # The grafana.env template is managed by SOPS and owned by grafana:grafana.
      # (Still provisioned while alerting above is disabled; the
      # EnvironmentFile below consumes it.)
      sops.templates."grafana.env" = {
        content = ''
          GRAFANA_NTFY_USER=${config.sops.placeholder."jallen-nas/ntfy/user"}
          GRAFANA_NTFY_PASSWORD=${config.sops.placeholder."jallen-nas/ntfy/password"}
        '';
        mode = "0400";
        owner = "grafana";
        restartUnits = [ "grafana.service" ];
      };
      systemd.services.grafana.serviceConfig = {
        EnvironmentFile = config.sops.templates."grafana.env".path;
        # Grafana downloads plugins at runtime and occasionally creates
        # subdirectories with overly restrictive permissions (e.g. 0700 for
        # locales/*), which causes the next startup to fail with "permission
        # denied" during plugin discovery. Fix any such directories before
        # Grafana starts. The "+" prefix runs the script with full privileges
        # (outside the unit's sandbox) so chmod can touch root-owned files.
        ExecStartPre = [
          ("+" + pkgs.writeShellScript "grafana-fix-plugin-perms" ''
            pluginDir="${cfg.configDir}/grafana/plugins"
            if [ -d "$pluginDir" ]; then
              ${pkgs.coreutils}/bin/chmod -R a+rX "$pluginDir"
            fi
          '')
        ];
      };
      # The redis exporter needs AF_INET to reach TCP Redis instances.
      # The default systemd hardening only allows AF_UNIX.
      systemd.services.prometheus-redis-exporter.serviceConfig.RestrictAddressFamilies = [ "AF_UNIX" "AF_INET" "AF_INET6" ];
    };
  };

  # NUT account used by the nut exporter (referenced above; let-bindings in
  # Nix are mutually recursive, so definition order does not matter).
  upsUser = "nas-admin";
in
{
  imports = [ grafanaConfig ];
}