# NixOS module: Grafana + Prometheus monitoring stack — exporters, scrape
# configs, provisioned datasources and dashboards for the NAS host.
{
  config,
  lib,
  pkgs,
  namespace,
  ...
}:
with lib;
let
  # Module name and its per-host configuration under the project namespace.
  name = "grafana";
  cfg = config.${namespace}.services.${name};
# ---------------------------------------------------------------------------
|
|
# Community dashboards — fetched at build time, pinned by hash.
|
|
#
|
|
# Community dashboards use __inputs with a template variable (e.g.
|
|
# ${DS_PROM} or ${DS_PROMETHEUS}) for the datasource UID. When provisioned
|
|
# via file Grafana never substitutes those, so every panel is datasource-
|
|
# broken. We patch each file at build time: replace all occurrences of the
|
|
# template variable with our fixed datasource UID "prometheus", and strip
|
|
# __inputs/__requires so Grafana doesn't treat the file as an import.
|
|
# ---------------------------------------------------------------------------
|
|
# Patch a community Grafana dashboard JSON at eval time using pure Nix:
|
|
# 1. Parse the JSON with builtins.fromJSON
|
|
# 2. Strip __inputs and __requires (import-only metadata)
|
|
# 3. Replace the datasource UID template variable with our fixed UID
|
|
# using builtins.replaceStrings on the re-serialised JSON string —
|
|
# this avoids any ${} interpolation issues in Nix strings entirely.
|
|
# 4. Write the result to the store with pkgs.writeText
|
|
patchDashboard =
|
|
name: src: dsVar:
|
|
let
|
|
raw = builtins.readFile src;
|
|
d = builtins.fromJSON raw;
|
|
# Strip import metadata then re-serialise
|
|
stripped = builtins.toJSON (
|
|
builtins.removeAttrs d [
|
|
"__inputs"
|
|
"__requires"
|
|
]
|
|
);
|
|
# Replace the template variable (e.g. "${DS_PROMETHEUS}") with our UID.
|
|
# builtins.replaceStrings takes lists so we never write ${} in Nix source.
|
|
patched = builtins.replaceStrings [ ("\${" + dsVar + "}") ] [ "prometheus" ] stripped;
|
|
in
|
|
pkgs.writeText name patched;
|
|
|
|
communityDashboards = pkgs.linkFarm "grafana-community-dashboards" [
|
|
{
|
|
# Node Exporter Full — https://grafana.com/grafana/dashboards/1860
|
|
# Uses ${ds_prometheus} (lowercase)
|
|
name = "node-exporter-full.json";
|
|
path = patchDashboard "node-exporter-full.json" (pkgs.fetchurl {
|
|
url = "https://grafana.com/api/dashboards/1860/revisions/latest/download";
|
|
sha256 = "sha256-IeQ72CZhtckDEihcVLhAFuSs77uWsZSENsdomSrWTHo=";
|
|
}) "ds_prometheus";
|
|
}
|
|
{
|
|
# PostgreSQL Database — https://grafana.com/grafana/dashboards/9628
|
|
name = "postgresql.json";
|
|
path = patchDashboard "postgresql.json" (pkgs.fetchurl {
|
|
url = "https://grafana.com/api/dashboards/9628/revisions/latest/download";
|
|
sha256 = "sha256-UhusNAZbyt7fJV/DhFUK4FKOmnTpG0R15YO2r+nDnMc=";
|
|
}) "DS_PROMETHEUS";
|
|
}
|
|
{
|
|
# Redis Dashboard for prometheus-redis-exporter 1.x — https://grafana.com/grafana/dashboards/763
|
|
# Uses DS_PROM; also patches out the 'namespace' template variable
|
|
# since our metrics have no namespace label — all done in pure Nix.
|
|
name = "redis.json";
|
|
path =
|
|
let
|
|
src = pkgs.fetchurl {
|
|
url = "https://grafana.com/api/dashboards/763/revisions/latest/download";
|
|
sha256 = "sha256-pThz+zHjcTT9vf8fpUuZK/ejNnH9GwEZVXOY27c9Aw8=";
|
|
};
|
|
raw = builtins.readFile src;
|
|
d = builtins.removeAttrs (builtins.fromJSON raw) [
|
|
"__inputs"
|
|
"__requires"
|
|
];
|
|
# Drop the 'namespace' variable and fix 'instance' to query directly.
|
|
fixedTemplating = d // {
|
|
templating = d.templating // {
|
|
list = map (
|
|
v:
|
|
if v.name == "instance" then
|
|
v
|
|
// {
|
|
query = "label_values(redis_up, instance)";
|
|
definition = "label_values(redis_up, instance)";
|
|
}
|
|
else
|
|
v
|
|
) (builtins.filter (v: v.name != "namespace") d.templating.list);
|
|
};
|
|
};
|
|
patched = builtins.replaceStrings [ ("\${" + "DS_PROM" + "}") ] [ "prometheus" ] (
|
|
builtins.toJSON fixedTemplating
|
|
);
|
|
in
|
|
pkgs.writeText "redis.json" patched;
|
|
}
|
|
{
|
|
# MySQL Overview — https://grafana.com/grafana/dashboards/7362
|
|
name = "mysql.json";
|
|
path = patchDashboard "mysql.json" (pkgs.fetchurl {
|
|
url = "https://grafana.com/api/dashboards/7362/revisions/latest/download";
|
|
sha256 = "sha256-WW7g60KY20XAdyUpumA0hBrjFC9MQGuGjiJKUhSVBXI=";
|
|
}) "DS_PROMETHEUS";
|
|
}
|
|
{
|
|
# Nextcloud — https://grafana.com/grafana/dashboards/9632
|
|
name = "nextcloud.json";
|
|
path = patchDashboard "nextcloud.json" (pkgs.fetchurl {
|
|
url = "https://grafana.com/api/dashboards/9632/revisions/latest/download";
|
|
sha256 = "sha256-Z28Q/sMg3jxglkszAs83IpL8f4p9loNnTQzjc3S/SAQ=";
|
|
}) "DS_PROMETHEUS";
|
|
}
|
|
];
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Custom dashboards — maintained in this repo under dashboards/
|
|
# ---------------------------------------------------------------------------
|
|
customDashboards = pkgs.linkFarm "grafana-custom-dashboards" [
|
|
{
|
|
name = "nut.json";
|
|
path = ./dashboards/nut.json;
|
|
}
|
|
{
|
|
name = "caddy.json";
|
|
path = ./dashboards/caddy.json;
|
|
}
|
|
{
|
|
name = "gitea.json";
|
|
path = ./dashboards/gitea.json;
|
|
}
|
|
{
|
|
name = "nas-overview.json";
|
|
path = ./dashboards/nas-overview.json;
|
|
}
|
|
];
|
|
|
|
# Minimal .my.cnf for the mysqld exporter. No credentials are needed
|
|
# because runAsLocalSuperUser = true runs as the mysql OS user, which
|
|
# MariaDB authenticates via the unix_socket plugin automatically.
|
|
mysqldExporterCnf = pkgs.writeText "prometheus-mysqld-exporter.cnf" ''
|
|
[client]
|
|
user=root
|
|
socket=/run/mysqld/mysqld.sock
|
|
'';
|
|
|
|
giteaPort = config.${namespace}.services.gitea.port;
|
|
resticPort = config.${namespace}.services.restic-server.port;
|
|
nextcloudPort = config.${namespace}.services.nextcloud.port;
|
|
|
|
grafanaConfig = lib.${namespace}.mkModule {
|
|
inherit config name;
|
|
description = "grafana";
|
|
options = { };
|
|
moduleConfig = {
|
|
services = {
|
|
prometheus = {
|
|
enable = true;
|
|
# bearer_token_file paths (e.g. Gitea metrics key) are SOPS secrets
|
|
# that only exist at runtime, not in the Nix build sandbox.
|
|
# "syntax-only" still catches config errors without stat-ing the files.
|
|
checkConfig = "syntax-only";
|
|
exporters = {
|
|
node = {
|
|
enable = true;
|
|
enabledCollectors = [
|
|
"filesystem"
|
|
"diskstats"
|
|
"meminfo"
|
|
"cpu"
|
|
"systemd"
|
|
"processes"
|
|
];
|
|
extraFlags = [
|
|
"--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run)($|/)"
|
|
];
|
|
};
|
|
|
|
libvirt = {
|
|
enable = false;
|
|
openFirewall = true;
|
|
};
|
|
|
|
nut = {
|
|
enable = true;
|
|
openFirewall = true;
|
|
passwordPath = config.sops.secrets."jallen-nas/ups_password".path;
|
|
nutUser = upsUser;
|
|
};
|
|
|
|
# PostgreSQL — runs as the local postgres superuser via peer auth
|
|
# (Unix socket, no password required).
|
|
postgres = {
|
|
enable = true;
|
|
runAsLocalSuperUser = true;
|
|
};
|
|
|
|
# Redis — single exporter instance covering all four Redis servers
|
|
# via the multi-target scrape pattern (/scrape?target=<addr>).
|
|
# The exporter needs AF_INET to reach TCP Redis instances.
|
|
redis = {
|
|
enable = true;
|
|
# No fixed --redis.addr: multi-target mode uses ?target= param.
|
|
};
|
|
|
|
# MariaDB — runs as the mysql OS user so it can connect via the
|
|
# Unix socket without a password (unix_socket auth).
|
|
mysqld = {
|
|
enable = true;
|
|
runAsLocalSuperUser = true;
|
|
configFile = mysqldExporterCnf;
|
|
};
|
|
|
|
# Nextcloud — authenticates with the admin account.
|
|
# passwordFile must be readable by the prometheus-nextcloud-exporter
|
|
# user; sops mode 0440 + group keys covers that.
|
|
nextcloud = {
|
|
enable = true;
|
|
url = "http://localhost:${toString nextcloudPort}";
|
|
username = "mjallen";
|
|
passwordFile = config.sops.secrets."jallen-nas/nextcloud/adminpassword".path;
|
|
};
|
|
};
|
|
|
|
scrapeConfigs = [
|
|
# ── System ──────────────────────────────────────────────────────────
|
|
{
|
|
job_name = "node";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString config.services.prometheus.exporters.node.port}" ];
|
|
}
|
|
];
|
|
}
|
|
|
|
# ── UPS (NUT) ────────────────────────────────────────────────────────
|
|
{
|
|
job_name = "nut";
|
|
# DRuggeri's nut_exporter serves UPS metrics at /ups_metrics, not /metrics.
|
|
metrics_path = "/ups_metrics";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString config.services.prometheus.exporters.nut.port}" ];
|
|
}
|
|
];
|
|
}
|
|
|
|
# ── Databases ────────────────────────────────────────────────────────
|
|
{
|
|
job_name = "postgres";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString config.services.prometheus.exporters.postgres.port}" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
# Redis multi-target: one exporter, four Redis instances.
|
|
# The redis_exporter's /scrape?target= endpoint proxies each target
|
|
# so a single exporter process covers all servers.
|
|
job_name = "redis";
|
|
metrics_path = "/scrape";
|
|
static_configs = [
|
|
{
|
|
targets = [
|
|
"redis://localhost:6379" # authentik
|
|
"redis://localhost:6363" # ccache
|
|
"redis://localhost:6380" # manyfold
|
|
"redis://localhost:6381" # onlyoffice
|
|
];
|
|
}
|
|
];
|
|
relabel_configs = [
|
|
{
|
|
source_labels = [ "__address__" ];
|
|
target_label = "__param_target";
|
|
}
|
|
{
|
|
source_labels = [ "__param_target" ];
|
|
target_label = "instance";
|
|
}
|
|
{
|
|
target_label = "__address__";
|
|
replacement = "localhost:${toString config.services.prometheus.exporters.redis.port}";
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "mysqld";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString config.services.prometheus.exporters.mysqld.port}" ];
|
|
}
|
|
];
|
|
}
|
|
|
|
# ── Application services ─────────────────────────────────────────────
|
|
{
|
|
# Caddy exposes its built-in Prometheus endpoint on port 2019.
|
|
job_name = "caddy";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:2019" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
# Gitea's /metrics endpoint is protected by a Bearer token.
|
|
job_name = "gitea";
|
|
metrics_path = "/metrics";
|
|
bearer_token_file = config.sops.secrets."jallen-nas/gitea/metrics-key".path;
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString giteaPort}" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
# restic REST server exposes Prometheus metrics at /metrics.
|
|
job_name = "restic";
|
|
metrics_path = "/metrics";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString resticPort}" ];
|
|
}
|
|
];
|
|
}
|
|
{
|
|
job_name = "nextcloud";
|
|
static_configs = [
|
|
{
|
|
targets = [ "localhost:${toString config.services.prometheus.exporters.nextcloud.port}" ];
|
|
}
|
|
];
|
|
}
|
|
];
|
|
};
|
|
|
|
grafana = {
|
|
enable = true;
|
|
settings = {
|
|
server = {
|
|
http_port = cfg.port;
|
|
http_addr = "0.0.0.0";
|
|
};
|
|
security = {
|
|
# Read the secret key from a SOPS-managed file at runtime so it
|
|
# never appears in the Nix store. The "$__file{}" syntax is
|
|
# Grafana's built-in file provider.
|
|
secret_key = "$__file{${config.sops.secrets."jallen-nas/grafana/secret-key".path}}";
|
|
};
|
|
# Grafana 12 enables kubernetesDashboards by default, which uses a
|
|
# new storage backend that validates datasource refs in dashboard
|
|
# files concurrently with datasource provisioning, causing a race
|
|
# that always fails on a clean install. Disable it to use the
|
|
# classic file provisioner that tolerates missing datasource refs.
|
|
"feature_toggles" = {
|
|
kubernetesDashboards = false;
|
|
};
|
|
|
|
# Grafana 12 introduced permitted_provisioning_paths as a security
|
|
# allowlist. The NixOS module stores all provisioning files in the
|
|
# Nix store, which is not in the default allowlist, causing the
|
|
# provisioner to silently refuse to load any files and then error
|
|
# with "data source not found".
|
|
paths.permitted_provisioning_paths = "/nix/store";
|
|
};
|
|
|
|
dataDir = "${cfg.configDir}/grafana";
|
|
|
|
provision = {
|
|
enable = true;
|
|
# Use path instead of settings to avoid the NixOS serializer
|
|
# writing `secureJsonData: null` which Grafana 12 chokes on.
|
|
datasources.path = pkgs.writeTextDir "datasource.yaml" ''
|
|
apiVersion: 1
|
|
datasources:
|
|
- name: Prometheus
|
|
uid: prometheus
|
|
type: prometheus
|
|
access: proxy
|
|
orgId: 1
|
|
url: http://localhost:${toString config.services.prometheus.port}
|
|
editable: false
|
|
jsonData:
|
|
httpMethod: POST
|
|
timeInterval: 15s
|
|
'';
|
|
# ---------------------------------------------------------------------------
|
|
# Alerting provisioning
|
|
# ---------------------------------------------------------------------------
|
|
# TEMPORARILY DISABLED - template format incompatible with Grafana 12
|
|
/*
|
|
alerting = {
|
|
# ── Contact points ──────────────────────────────────────────────────
|
|
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
|
|
# body; ntfy accepts any body as the message text. We use the
|
|
# message template below to format it nicely.
|
|
#
|
|
# Credentials are injected via Grafana's $__env{} provider, which
|
|
# reads from the process environment. The GRAFANA_NTFY_USER and
|
|
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
|
|
# grafana.env EnvironmentFile on the grafana.service unit.
|
|
#
|
|
# Note: $__file{} only works in grafana.ini settings, not in
|
|
# provisioning YAML files — using it here causes a parse error.
|
|
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
|
|
apiVersion: 1
|
|
contactPoints:
|
|
- name: ntfy
|
|
receivers:
|
|
- uid: ntfy-webhook
|
|
type: webhook
|
|
disableResolveMessage: false
|
|
settings:
|
|
url: https://ntfy.mjallen.dev/grafana-alerts
|
|
httpMethod: POST
|
|
username: $__env{GRAFANA_NTFY_USER}
|
|
password: $__env{GRAFANA_NTFY_PASSWORD}
|
|
httpHeaders:
|
|
Tags: "chart,bell"
|
|
'';
|
|
|
|
# ── Notification message template ───────────────────────────────────
|
|
# Grafana sends the rendered template body as the POST body.
|
|
# ntfy treats the body as the message text.
|
|
templates.settings = {
|
|
apiVersion = 1;
|
|
templates = [
|
|
{
|
|
name = "ntfy_message";
|
|
template = ''
|
|
{{ define "ntfy_message" -}}
|
|
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
|
|
{{ range .Alerts -}}
|
|
Status: {{ .Status | title }}
|
|
Alert: {{ .Labels.alertname }}
|
|
Severity: {{ .Labels.severity | default "unknown" }}
|
|
Instance: {{ .Labels.instance | default "unknown" }}
|
|
{{ if .Annotations.description -}}
|
|
Details: {{ .Annotations.description }}
|
|
{{ end -}}
|
|
{{ end -}}
|
|
{{ end }}
|
|
'';
|
|
}
|
|
];
|
|
};
|
|
|
|
# ── Notification routing policy ─────────────────────────────────────
|
|
policies.settings = {
|
|
apiVersion = 1;
|
|
policies = [
|
|
{
|
|
receiver = "ntfy";
|
|
group_by = [
|
|
"alertname"
|
|
"severity"
|
|
];
|
|
group_wait = "30s";
|
|
group_interval = "5m";
|
|
repeat_interval = "4h";
|
|
routes = [
|
|
# Critical alerts: repeat every 1h, no grouping wait
|
|
{
|
|
receiver = "ntfy";
|
|
matchers = [ "severity = critical" ];
|
|
group_wait = "0s";
|
|
repeat_interval = "1h";
|
|
}
|
|
];
|
|
}
|
|
];
|
|
};
|
|
|
|
# ── Alert rules ─────────────────────────────────────────────────────
|
|
rules.settings = {
|
|
apiVersion = 1;
|
|
groups = [
|
|
{
|
|
name = "nas-system";
|
|
folder = "NAS Alerts";
|
|
interval = "1m";
|
|
rules = [
|
|
# Disk usage > 85% warning, > 95% critical
|
|
{
|
|
uid = "nas-disk-warning";
|
|
title = "Disk usage high";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = ''
|
|
(
|
|
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
|
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
|
)
|
|
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
|
* 100
|
|
'';
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "gt";
|
|
params = [ 85 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "NoData";
|
|
execErrState = "Error";
|
|
for = "5m";
|
|
annotations = {
|
|
summary = "Disk usage above 85%";
|
|
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
|
|
};
|
|
labels = {
|
|
severity = "warning";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
|
|
# Memory usage > 90%
|
|
{
|
|
uid = "nas-memory-high";
|
|
title = "Memory usage high";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = ''
|
|
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
|
|
'';
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "gt";
|
|
params = [ 90 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "NoData";
|
|
execErrState = "Error";
|
|
for = "5m";
|
|
annotations = {
|
|
summary = "Memory usage above 90%";
|
|
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
|
|
};
|
|
labels = {
|
|
severity = "warning";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
|
|
# CPU > 90% sustained for 10m
|
|
{
|
|
uid = "nas-cpu-high";
|
|
title = "CPU usage sustained high";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = ''
|
|
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
|
'';
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "gt";
|
|
params = [ 90 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "NoData";
|
|
execErrState = "Error";
|
|
for = "10m";
|
|
annotations = {
|
|
summary = "CPU sustained above 90%";
|
|
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
|
|
};
|
|
labels = {
|
|
severity = "warning";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
|
|
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
|
|
{
|
|
uid = "nas-ups-onbatt";
|
|
title = "UPS on battery";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = "network_ups_tools_ups_status";
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
# status 0 = OB (on battery), 1 = OL (online)
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "lt";
|
|
params = [ 1 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "NoData";
|
|
execErrState = "Error";
|
|
for = "1m";
|
|
annotations = {
|
|
summary = "UPS is running on battery";
|
|
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
|
|
};
|
|
labels = {
|
|
severity = "critical";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
|
|
# UPS battery charge < 30%
|
|
{
|
|
uid = "nas-ups-lowbatt";
|
|
title = "UPS battery low";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = "network_ups_tools_battery_charge";
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "lt";
|
|
params = [ 30 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "NoData";
|
|
execErrState = "Error";
|
|
for = "2m";
|
|
annotations = {
|
|
summary = "UPS battery charge below 30%";
|
|
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
|
|
};
|
|
labels = {
|
|
severity = "critical";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
|
|
# PostgreSQL not responding
|
|
{
|
|
uid = "nas-postgres-down";
|
|
title = "PostgreSQL down";
|
|
condition = "C";
|
|
data = [
|
|
{
|
|
refId = "A";
|
|
datasourceUid = "prometheus";
|
|
model = {
|
|
expr = "pg_up";
|
|
intervalMs = 60000;
|
|
maxDataPoints = 43200;
|
|
refId = "A";
|
|
};
|
|
}
|
|
{
|
|
refId = "B";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "reduce";
|
|
refId = "B";
|
|
expression = "A";
|
|
reducer = "last";
|
|
};
|
|
}
|
|
{
|
|
refId = "C";
|
|
datasourceUid = "__expr__";
|
|
model = {
|
|
type = "threshold";
|
|
refId = "C";
|
|
expression = "B";
|
|
conditions = [
|
|
{
|
|
evaluator = {
|
|
type = "lt";
|
|
params = [ 1 ];
|
|
};
|
|
}
|
|
];
|
|
};
|
|
}
|
|
];
|
|
noDataState = "Alerting";
|
|
execErrState = "Error";
|
|
for = "2m";
|
|
annotations = {
|
|
summary = "PostgreSQL is down";
|
|
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
|
|
};
|
|
labels = {
|
|
severity = "critical";
|
|
};
|
|
isPaused = false;
|
|
}
|
|
];
|
|
}
|
|
];
|
|
};
|
|
|
|
muteTimings.settings = {
|
|
apiVersion = 1;
|
|
muteTimes = [ ];
|
|
};
|
|
};
|
|
*/
|
|
|
|
dashboards.settings.providers = [
|
|
{
|
|
name = "community";
|
|
orgId = 1;
|
|
type = "file";
|
|
disableDeletion = true;
|
|
updateIntervalSeconds = 60;
|
|
allowUiUpdates = false;
|
|
options.path = communityDashboards;
|
|
}
|
|
{
|
|
name = "custom";
|
|
orgId = 1;
|
|
type = "file";
|
|
disableDeletion = true;
|
|
updateIntervalSeconds = 60;
|
|
allowUiUpdates = false;
|
|
options.path = customDashboards;
|
|
}
|
|
];
|
|
};
|
|
};
|
|
};
|
|
|
|
# Inject ntfy credentials into Grafana's environment so the $__env{}
|
|
# provider in contactPoints.yaml can resolve them at runtime.
|
|
# The grafana.env template is managed by SOPS and owned by grafana:grafana.
|
|
sops.templates."grafana.env" = {
|
|
content = ''
|
|
GRAFANA_NTFY_USER=${config.sops.placeholder."jallen-nas/ntfy/user"}
|
|
GRAFANA_NTFY_PASSWORD=${config.sops.placeholder."jallen-nas/ntfy/password"}
|
|
'';
|
|
mode = "0400";
|
|
owner = "grafana";
|
|
restartUnits = [ "grafana.service" ];
|
|
};
|
|
|
|
systemd.services.grafana.serviceConfig = {
|
|
EnvironmentFile = config.sops.templates."grafana.env".path;
|
|
# Grafana downloads plugins at runtime and occasionally creates subdirectories
|
|
# with overly restrictive permissions (e.g. 0700 for locales/*), which causes
|
|
# the next startup to fail with "permission denied" during plugin discovery.
|
|
# Fix any such directories before Grafana starts.
|
|
ExecStartPre = [
|
|
(
|
|
"+"
|
|
+ pkgs.writeShellScript "grafana-fix-plugin-perms" ''
|
|
pluginDir="${cfg.configDir}/grafana/plugins"
|
|
if [ -d "$pluginDir" ]; then
|
|
${pkgs.coreutils}/bin/chmod -R a+rX "$pluginDir"
|
|
fi
|
|
''
|
|
)
|
|
];
|
|
};
|
|
|
|
# The redis exporter needs AF_INET to reach TCP Redis instances.
|
|
# The default systemd hardening only allows AF_UNIX.
|
|
systemd.services.prometheus-redis-exporter.serviceConfig.RestrictAddressFamilies = [
|
|
"AF_UNIX"
|
|
"AF_INET"
|
|
"AF_INET6"
|
|
];
|
|
};
|
|
};
|
|
|
|
upsUser = "nas-admin";
|
|
in
|
|
{
|
|
imports = [ grafanaConfig ];
|
|
}
|