Files
nix-config/modules/nixos/services/grafana/default.nix
mjallen18 3234029ae5 hmm
2026-04-07 22:02:54 -05:00

927 lines
38 KiB
Nix
Executable File

{
config,
lib,
pkgs,
namespace,
...
}:
with lib;
let
name = "grafana";
# Options for this module live at config.<namespace>.services.grafana;
# cfg.port and cfg.configDir are consumed further down.
cfg = config.${namespace}.services.${name};
# ---------------------------------------------------------------------------
# Community dashboards — fetched at build time, pinned by hash.
#
# Community dashboards use __inputs with a template variable (e.g.
# ${DS_PROM} or ${DS_PROMETHEUS}) for the datasource UID. When provisioned
# via file Grafana never substitutes those, so every panel is datasource-
# broken. We patch each file at build time: replace all occurrences of the
# template variable with our fixed datasource UID "prometheus", and strip
# __inputs/__requires so Grafana doesn't treat the file as an import.
# ---------------------------------------------------------------------------
# Patch a community Grafana dashboard JSON at eval time using pure Nix.
# Community dashboards reference their datasource through an import-time
# template variable (e.g. "${DS_PROMETHEUS}"); file provisioning never
# substitutes it, so we rewrite the JSON ourselves:
#   * drop __inputs/__requires (import-only metadata) so Grafana does not
#     treat the file as an import,
#   * splice our fixed datasource UID "prometheus" in place of the
#     template variable — the placeholder is built by concatenation so no
#     Nix ${} interpolation ever appears in this source file,
#   * write the result into the store with pkgs.writeText.
patchDashboard =
  fileName: src: dsVar:
  let
    # Parse, strip the import metadata, and re-serialise in one pass.
    cleanedJson = builtins.toJSON (
      builtins.removeAttrs (builtins.fromJSON (builtins.readFile src)) [
        "__inputs"
        "__requires"
      ]
    );
    # e.g. "${DS_PROMETHEUS}" — assembled without literal ${} in source.
    placeholder = "\${" + dsVar + "}";
  in
  pkgs.writeText fileName (builtins.replaceStrings [ placeholder ] [ "prometheus" ] cleanedJson);
communityDashboards = pkgs.linkFarm "grafana-community-dashboards" [
  # NOTE: each "revisions/latest" URL is effectively pinned by its sha256 —
  # a new upstream revision changes the hash and fails the fetch, so
  # dashboard bumps are always explicit.
  {
    # Node Exporter Full — https://grafana.com/grafana/dashboards/1860
    # Uses ${ds_prometheus} (lowercase)
    name = "node-exporter-full.json";
    path = patchDashboard "node-exporter-full.json" (pkgs.fetchurl {
      url = "https://grafana.com/api/dashboards/1860/revisions/latest/download";
      sha256 = "sha256-mEWSdsTn1EKpW6xoJv/s0XST46EOoUPbDugQwyngIss=";
    }) "ds_prometheus";
  }
  {
    # PostgreSQL Database — https://grafana.com/grafana/dashboards/9628
    name = "postgresql.json";
    path = patchDashboard "postgresql.json" (pkgs.fetchurl {
      url = "https://grafana.com/api/dashboards/9628/revisions/latest/download";
      sha256 = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc=";
    }) "DS_PROMETHEUS";
  }
  {
    # Redis Dashboard for prometheus-redis-exporter 1.x — https://grafana.com/grafana/dashboards/763
    # Uses DS_PROM; also patches out the 'namespace' template variable
    # since our metrics have no namespace label — all done in pure Nix.
    # (Inlined rather than using patchDashboard because it needs the extra
    # templating rewrite between strip and substitute.)
    name = "redis.json";
    path =
      let
        src = pkgs.fetchurl {
          url = "https://grafana.com/api/dashboards/763/revisions/latest/download";
          sha256 = "sha256-pThz+zHjcTT9vf8fpUuZK/ejNnH9GwEZVXOY27c9Aw8=";
        };
        raw = builtins.readFile src;
        # Strip import-only metadata, same as patchDashboard does.
        d = builtins.removeAttrs (builtins.fromJSON raw) [
          "__inputs"
          "__requires"
        ];
        # Drop the 'namespace' variable and fix 'instance' to query directly.
        # NOTE(review): assumes the dashboard JSON always has templating.list;
        # evaluation fails if an upstream revision drops it — acceptable,
        # since the sha256 pin makes any upstream change explicit.
        fixedTemplating = d // {
          templating = d.templating // {
            list = map (
              v:
              if v.name == "instance" then
                v
                // {
                  query = "label_values(redis_up, instance)";
                  definition = "label_values(redis_up, instance)";
                }
              else
                v
            ) (builtins.filter (v: v.name != "namespace") d.templating.list);
          };
        };
        patched = builtins.replaceStrings [ ("\${" + "DS_PROM" + "}") ] [ "prometheus" ] (
          builtins.toJSON fixedTemplating
        );
      in
      pkgs.writeText "redis.json" patched;
  }
  {
    # MySQL Overview — https://grafana.com/grafana/dashboards/7362
    name = "mysql.json";
    path = patchDashboard "mysql.json" (pkgs.fetchurl {
      url = "https://grafana.com/api/dashboards/7362/revisions/latest/download";
      sha256 = "sha256-WW7g60KY20XAdyUpumA0hBrjFC9MQGuGjiJKUhSVBXI=";
    }) "DS_PROMETHEUS";
  }
  {
    # Nextcloud — https://grafana.com/grafana/dashboards/9632
    name = "nextcloud.json";
    path = patchDashboard "nextcloud.json" (pkgs.fetchurl {
      url = "https://grafana.com/api/dashboards/9632/revisions/latest/download";
      sha256 = "sha256-Z28Q/sMg3jxglkszAs83IpL8f4p9loNnTQzjc3S/SAQ=";
    }) "DS_PROMETHEUS";
  }
];
# ---------------------------------------------------------------------------
# Custom dashboards — maintained in this repo under dashboards/
# ---------------------------------------------------------------------------
customDashboards = pkgs.linkFarm "grafana-custom-dashboards" (
  # One farm entry per dashboard: links ./dashboards/<name>.json as
  # <name>.json inside the provisioned directory.
  map
    (dash: {
      name = "${dash}.json";
      path = ./dashboards + "/${dash}.json";
    })
    [
      "nut"
      "caddy"
      "gitea"
      "nas-overview"
    ]
);
# Minimal .my.cnf for the mysqld exporter. No credentials are needed
# because runAsLocalSuperUser = true runs the exporter as the mysql OS
# user, which MariaDB authenticates via the unix_socket plugin
# automatically.
mysqldExporterCnf = pkgs.writeText "prometheus-mysqld-exporter.cnf" (
  lib.concatLines [
    "[client]"
    "user=root"
    "socket=/run/mysqld/mysqld.sock"
  ]
);
# Ports of sibling services defined elsewhere in this flake's namespace,
# referenced by the Prometheus scrape configs below.
giteaPort = config.${namespace}.services.gitea.port;
resticPort = config.${namespace}.services.restic-server.port;
nextcloudPort = config.${namespace}.services.nextcloud.port;
grafanaConfig = lib.${namespace}.mkModule {
inherit config name;
description = "grafana";
options = { };
moduleConfig = {
services = {
prometheus = {
  enable = true;
  # bearer_token_file paths (e.g. Gitea metrics key) are SOPS secrets
  # that only exist at runtime, not in the Nix build sandbox.
  # "syntax-only" still catches config errors without stat-ing the files.
  checkConfig = "syntax-only";
  exporters = {
    # Host metrics: CPU, memory, disks, systemd units, processes.
    node = {
      enable = true;
      enabledCollectors = [
        "filesystem"
        "diskstats"
        "meminfo"
        "cpu"
        "systemd"
        "processes"
      ];
      extraFlags = [
        # Skip pseudo-filesystems so disk panels only show real mounts.
        "--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run)($|/)"
      ];
    };
    # libvirt — currently disabled. NOTE(review): openFirewall presumably
    # has no effect while enable = false — confirm against the exporter
    # module before relying on it.
    libvirt = {
      enable = false;
      openFirewall = true;
    };
    # UPS metrics via NUT; credentials come from SOPS at runtime.
    nut = {
      enable = true;
      openFirewall = true;
      passwordPath = config.sops.secrets."jallen-nas/ups_password".path;
      nutUser = upsUser;
    };
    # PostgreSQL — runs as the local postgres superuser via peer auth
    # (Unix socket, no password required).
    postgres = {
      enable = true;
      runAsLocalSuperUser = true;
    };
    # Redis — single exporter instance covering all four Redis servers
    # via the multi-target scrape pattern (/scrape?target=<addr>).
    # The exporter needs AF_INET to reach TCP Redis instances.
    redis = {
      enable = true;
      # No fixed --redis.addr: multi-target mode uses ?target= param.
    };
    # MariaDB — runs as the mysql OS user so it can connect via the
    # Unix socket without a password (unix_socket auth).
    mysqld = {
      enable = true;
      runAsLocalSuperUser = true;
      configFile = mysqldExporterCnf;
    };
    # Nextcloud — authenticates with the admin account.
    # passwordFile must be readable by the prometheus-nextcloud-exporter
    # user; sops mode 0440 + group keys covers that.
    nextcloud = {
      enable = true;
      url = "http://localhost:${toString nextcloudPort}";
      username = "mjallen";
      passwordFile = config.sops.secrets."jallen-nas/nextcloud/adminpassword".path;
    };
  };
  scrapeConfigs = [
    # ── System ──────────────────────────────────────────────────────────
    {
      job_name = "node";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ];
        }
      ];
    }
    # ── UPS (NUT) ────────────────────────────────────────────────────────
    {
      job_name = "nut";
      # DRuggeri's nut_exporter serves UPS metrics at /ups_metrics, not /metrics.
      metrics_path = "/ups_metrics";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.exporters.nut.port}" ];
        }
      ];
    }
    # ── Databases ────────────────────────────────────────────────────────
    {
      job_name = "postgres";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.exporters.postgres.port}" ];
        }
      ];
    }
    {
      # Redis multi-target: one exporter, four Redis instances.
      # The redis_exporter's /scrape?target= endpoint proxies each target
      # so a single exporter process covers all servers.
      job_name = "redis";
      metrics_path = "/scrape";
      static_configs = [
        {
          targets = [
            "redis://localhost:6379" # authentik
            "redis://localhost:6363" # ccache
            "redis://localhost:6380" # manyfold
            "redis://localhost:6381" # onlyoffice
          ];
        }
      ];
      # Standard multi-target relabelling: the listed address becomes the
      # ?target= query parameter and the instance label, while the actual
      # scrape request is redirected to the single local exporter.
      relabel_configs = [
        {
          source_labels = [ "__address__" ];
          target_label = "__param_target";
        }
        {
          source_labels = [ "__param_target" ];
          target_label = "instance";
        }
        {
          target_label = "__address__";
          replacement = "localhost:${toString config.services.prometheus.exporters.redis.port}";
        }
      ];
    }
    {
      job_name = "mysqld";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.exporters.mysqld.port}" ];
        }
      ];
    }
    # ── Application services ─────────────────────────────────────────────
    {
      # Caddy exposes its built-in Prometheus endpoint on port 2019.
      job_name = "caddy";
      static_configs = [
        {
          targets = [ "localhost:2019" ];
        }
      ];
    }
    {
      # Gitea's /metrics endpoint is protected by a Bearer token.
      # NOTE(review): bearer_token_file is deprecated upstream in favor of
      # authorization.credentials_file; it still works, but consider
      # migrating when next touching this config.
      job_name = "gitea";
      metrics_path = "/metrics";
      bearer_token_file = config.sops.secrets."jallen-nas/gitea/metrics-key".path;
      static_configs = [
        {
          targets = [ "localhost:${toString giteaPort}" ];
        }
      ];
    }
    {
      # restic REST server exposes Prometheus metrics at /metrics.
      job_name = "restic";
      metrics_path = "/metrics";
      static_configs = [
        {
          targets = [ "localhost:${toString resticPort}" ];
        }
      ];
    }
    {
      job_name = "nextcloud";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.exporters.nextcloud.port}" ];
        }
      ];
    }
  ];
};
grafana = {
enable = true;
settings = {
  server = {
    http_port = cfg.port;
    # Listen on all interfaces so the dashboard is reachable beyond localhost.
    http_addr = "0.0.0.0";
  };
  security = {
    # Read the secret key from a SOPS-managed file at runtime so it
    # never appears in the Nix store. The "$__file{}" syntax is
    # Grafana's built-in file provider.
    secret_key = "$__file{${config.sops.secrets."jallen-nas/grafana/secret-key".path}}";
  };
  # Grafana 12 enables kubernetesDashboards by default, which uses a
  # new storage backend that validates datasource refs in dashboard
  # files concurrently with datasource provisioning, causing a race
  # that always fails on a clean install. Disable it to use the
  # classic file provisioner that tolerates missing datasource refs.
  # (Key unquoted for consistency with the sibling attribute names —
  # feature_toggles is a valid bare Nix identifier.)
  feature_toggles = {
    kubernetesDashboards = false;
  };
  # Grafana 12 introduced permitted_provisioning_paths as a security
  # allowlist. The NixOS module stores all provisioning files in the
  # Nix store, which is not in the default allowlist, causing the
  # provisioner to silently refuse to load any files and then error
  # with "data source not found".
  paths.permitted_provisioning_paths = "/nix/store";
};
dataDir = "${cfg.configDir}/grafana";
provision = {
enable = true;
# Use path instead of settings to avoid the NixOS serializer
# writing `secureJsonData: null` which Grafana 12 chokes on.
# NOTE: the fixed uid "prometheus" is the exact string patchDashboard
# substitutes into the community dashboards — keep the two in sync.
datasources.path = pkgs.writeTextDir "datasource.yaml" ''
  apiVersion: 1
  datasources:
    - name: Prometheus
      uid: prometheus
      type: prometheus
      access: proxy
      orgId: 1
      url: http://localhost:${toString config.services.prometheus.port}
      editable: false
      jsonData:
        httpMethod: POST
        timeInterval: 15s
'';
# ---------------------------------------------------------------------------
# Alerting provisioning
# ---------------------------------------------------------------------------
# TEMPORARILY DISABLED - template format incompatible with Grafana 12
/*
alerting = {
# Contact points
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
# body; ntfy accepts any body as the message text. We use the
# message template below to format it nicely.
#
# Credentials are injected via Grafana's $__env{} provider, which
# reads from the process environment. The GRAFANA_NTFY_USER and
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
# grafana.env EnvironmentFile on the grafana.service unit.
#
# Note: $__file{} only works in grafana.ini settings, not in
# provisioning YAML files using it here causes a parse error.
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
apiVersion: 1
contactPoints:
- name: ntfy
receivers:
- uid: ntfy-webhook
type: webhook
disableResolveMessage: false
settings:
url: https://ntfy.mjallen.dev/grafana-alerts
httpMethod: POST
username: $__env{GRAFANA_NTFY_USER}
password: $__env{GRAFANA_NTFY_PASSWORD}
httpHeaders:
Tags: "chart,bell"
'';
# Notification message template
# Grafana sends the rendered template body as the POST body.
# ntfy treats the body as the message text.
templates.settings = {
apiVersion = 1;
templates = [
{
name = "ntfy_message";
template = ''
{{ define "ntfy_message" -}}
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
{{ range .Alerts -}}
Status: {{ .Status | title }}
Alert: {{ .Labels.alertname }}
Severity: {{ .Labels.severity | default "unknown" }}
Instance: {{ .Labels.instance | default "unknown" }}
{{ if .Annotations.description -}}
Details: {{ .Annotations.description }}
{{ end -}}
{{ end -}}
{{ end }}
'';
}
];
};
# Notification routing policy
policies.settings = {
apiVersion = 1;
policies = [
{
receiver = "ntfy";
group_by = [
"alertname"
"severity"
];
group_wait = "30s";
group_interval = "5m";
repeat_interval = "4h";
routes = [
# Critical alerts: repeat every 1h, no grouping wait
{
receiver = "ntfy";
matchers = [ "severity = critical" ];
group_wait = "0s";
repeat_interval = "1h";
}
];
}
];
};
# Alert rules
rules.settings = {
apiVersion = 1;
groups = [
{
name = "nas-system";
folder = "NAS Alerts";
interval = "1m";
rules = [
# Disk usage > 85% warning, > 95% critical
{
uid = "nas-disk-warning";
title = "Disk usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
)
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
* 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 85 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Disk usage above 85%";
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# Memory usage > 90%
{
uid = "nas-memory-high";
title = "Memory usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Memory usage above 90%";
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# CPU > 90% sustained for 10m
{
uid = "nas-cpu-high";
title = "CPU usage sustained high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "10m";
annotations = {
summary = "CPU sustained above 90%";
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
{
uid = "nas-ups-onbatt";
title = "UPS on battery";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_ups_status";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
# status 0 = OB (on battery), 1 = OL (online)
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "1m";
annotations = {
summary = "UPS is running on battery";
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# UPS battery charge < 30%
{
uid = "nas-ups-lowbatt";
title = "UPS battery low";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_battery_charge";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 30 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "2m";
annotations = {
summary = "UPS battery charge below 30%";
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# PostgreSQL not responding
{
uid = "nas-postgres-down";
title = "PostgreSQL down";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "pg_up";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "Alerting";
execErrState = "Error";
for = "2m";
annotations = {
summary = "PostgreSQL is down";
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
];
}
];
};
muteTimings.settings = {
apiVersion = 1;
muteTimes = [ ];
};
};
*/
dashboards.settings.providers =
  let
    # Both providers share every setting except the name and the
    # directory of dashboard JSON files they load from.
    mkProvider = providerName: dashboardDir: {
      name = providerName;
      orgId = 1;
      type = "file";
      disableDeletion = true;
      updateIntervalSeconds = 60;
      allowUiUpdates = false;
      options.path = dashboardDir;
    };
  in
  [
    (mkProvider "community" communityDashboards)
    (mkProvider "custom" customDashboards)
  ];
};
};
};
# Inject ntfy credentials into Grafana's environment so the $__env{}
# provider in contactPoints.yaml can resolve them at runtime.
# The grafana.env template is managed by SOPS and owned by grafana:grafana.
# NOTE(review): the alerting provisioning that consumes these variables
# is currently commented out above; the env file is still installed so
# re-enabling alerting needs no change here.
sops.templates."grafana.env" = {
  content = ''
    GRAFANA_NTFY_USER=${config.sops.placeholder."jallen-nas/ntfy/user"}
    GRAFANA_NTFY_PASSWORD=${config.sops.placeholder."jallen-nas/ntfy/password"}
  '';
  mode = "0400";
  owner = "grafana";
  # Re-render of the secret restarts Grafana so it picks up new values.
  restartUnits = [ "grafana.service" ];
};
systemd.services.grafana.serviceConfig = {
  # SOPS-rendered env file carrying the ntfy credentials (see above).
  EnvironmentFile = config.sops.templates."grafana.env".path;
  # Grafana downloads plugins at runtime and occasionally creates subdirectories
  # with overly restrictive permissions (e.g. 0700 for locales/*), which causes
  # the next startup to fail with "permission denied" during plugin discovery.
  # Fix any such directories before Grafana starts.
  # The "+" prefix makes systemd run this ExecStartPre with full privileges
  # (root), regardless of the unit's User=, so the chmod always succeeds.
  ExecStartPre = [
    (
      "+"
      + pkgs.writeShellScript "grafana-fix-plugin-perms" ''
        pluginDir="${cfg.configDir}/grafana/plugins"
        if [ -d "$pluginDir" ]; then
          ${pkgs.coreutils}/bin/chmod -R a+rX "$pluginDir"
        fi
      ''
    )
  ];
};
# The redis exporter needs AF_INET to reach TCP Redis instances.
# The default systemd hardening only allows AF_UNIX.
# NOTE(review): this presumably merges with the upstream exporter
# module's serviceConfig via the module system's list concatenation —
# AF_UNIX is restated here so the final allowlist is explicit either way.
systemd.services.prometheus-redis-exporter.serviceConfig.RestrictAddressFamilies = [
  "AF_UNIX"
  "AF_INET"
  "AF_INET6"
];
};
};
upsUser = "nas-admin";
in
{
imports = [ grafanaConfig ];
}