xtr temp
This commit is contained in:
@@ -395,460 +395,463 @@ let
|
||||
# ---------------------------------------------------------------------------
|
||||
# Alerting provisioning
|
||||
# ---------------------------------------------------------------------------
|
||||
alerting = {
|
||||
# ── Contact points ──────────────────────────────────────────────────
|
||||
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
|
||||
# body; ntfy accepts any body as the message text. We use the
|
||||
# message template below to format it nicely.
|
||||
#
|
||||
# Credentials are injected via Grafana's $__env{} provider, which
|
||||
# reads from the process environment. The GRAFANA_NTFY_USER and
|
||||
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
|
||||
# grafana.env EnvironmentFile on the grafana.service unit.
|
||||
#
|
||||
# Note: $__file{} only works in grafana.ini settings, not in
|
||||
# provisioning YAML files — using it here causes a parse error.
|
||||
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
|
||||
apiVersion: 1
|
||||
contactPoints:
|
||||
- name: ntfy
|
||||
receivers:
|
||||
- uid: ntfy-webhook
|
||||
type: webhook
|
||||
disableResolveMessage: false
|
||||
settings:
|
||||
url: https://ntfy.mjallen.dev/grafana-alerts
|
||||
httpMethod: POST
|
||||
username: $__env{GRAFANA_NTFY_USER}
|
||||
password: $__env{GRAFANA_NTFY_PASSWORD}
|
||||
httpHeaders:
|
||||
Tags: "chart,bell"
|
||||
'';
|
||||
# TEMPORARILY DISABLED - template format incompatible with Grafana 12
|
||||
/*
|
||||
alerting = {
|
||||
# ── Contact points ──────────────────────────────────────────────────
|
||||
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
|
||||
# body; ntfy accepts any body as the message text. We use the
|
||||
# message template below to format it nicely.
|
||||
#
|
||||
# Credentials are injected via Grafana's $__env{} provider, which
|
||||
# reads from the process environment. The GRAFANA_NTFY_USER and
|
||||
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
|
||||
# grafana.env EnvironmentFile on the grafana.service unit.
|
||||
#
|
||||
# Note: $__file{} only works in grafana.ini settings, not in
|
||||
# provisioning YAML files — using it here causes a parse error.
|
||||
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
|
||||
apiVersion: 1
|
||||
contactPoints:
|
||||
- name: ntfy
|
||||
receivers:
|
||||
- uid: ntfy-webhook
|
||||
type: webhook
|
||||
disableResolveMessage: false
|
||||
settings:
|
||||
url: https://ntfy.mjallen.dev/grafana-alerts
|
||||
httpMethod: POST
|
||||
username: $__env{GRAFANA_NTFY_USER}
|
||||
password: $__env{GRAFANA_NTFY_PASSWORD}
|
||||
httpHeaders:
|
||||
Tags: "chart,bell"
|
||||
'';
|
||||
|
||||
# ── Notification message template ───────────────────────────────────
|
||||
# Grafana sends the rendered template body as the POST body.
|
||||
# ntfy treats the body as the message text.
|
||||
templates.settings = {
|
||||
apiVersion = 1;
|
||||
templates = [
|
||||
{
|
||||
name = "ntfy_message";
|
||||
template = ''
|
||||
{{ define "ntfy_message" -}}
|
||||
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
|
||||
{{ range .Alerts -}}
|
||||
Status: {{ .Status | title }}
|
||||
Alert: {{ .Labels.alertname }}
|
||||
Severity: {{ .Labels.severity | default "unknown" }}
|
||||
Instance: {{ .Labels.instance | default "unknown" }}
|
||||
{{ if .Annotations.description -}}
|
||||
Details: {{ .Annotations.description }}
|
||||
{{ end -}}
|
||||
{{ end -}}
|
||||
{{ end }}
|
||||
'';
|
||||
}
|
||||
];
|
||||
# ── Notification message template ───────────────────────────────────
|
||||
# Grafana sends the rendered template body as the POST body.
|
||||
# ntfy treats the body as the message text.
|
||||
templates.settings = {
|
||||
apiVersion = 1;
|
||||
templates = [
|
||||
{
|
||||
name = "ntfy_message";
|
||||
template = ''
|
||||
{{ define "ntfy_message" -}}
|
||||
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
|
||||
{{ range .Alerts -}}
|
||||
Status: {{ .Status | title }}
|
||||
Alert: {{ .Labels.alertname }}
|
||||
Severity: {{ .Labels.severity | default "unknown" }}
|
||||
Instance: {{ .Labels.instance | default "unknown" }}
|
||||
{{ if .Annotations.description -}}
|
||||
Details: {{ .Annotations.description }}
|
||||
{{ end -}}
|
||||
{{ end -}}
|
||||
{{ end }}
|
||||
'';
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
# ── Notification routing policy ─────────────────────────────────────
|
||||
policies.settings = {
|
||||
apiVersion = 1;
|
||||
policies = [
|
||||
{
|
||||
receiver = "ntfy";
|
||||
group_by = [
|
||||
"alertname"
|
||||
"severity"
|
||||
];
|
||||
group_wait = "30s";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
routes = [
|
||||
# Critical alerts: repeat every 1h, no grouping wait
|
||||
{
|
||||
receiver = "ntfy";
|
||||
matchers = [ "severity = critical" ];
|
||||
group_wait = "0s";
|
||||
repeat_interval = "1h";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
# ── Alert rules ─────────────────────────────────────────────────────
|
||||
rules.settings = {
|
||||
apiVersion = 1;
|
||||
groups = [
|
||||
{
|
||||
name = "nas-system";
|
||||
folder = "NAS Alerts";
|
||||
interval = "1m";
|
||||
rules = [
|
||||
# Disk usage > 85% -> warning (only this threshold is implemented; no > 95% critical rule is defined in this group)
|
||||
{
|
||||
uid = "nas-disk-warning";
|
||||
title = "Disk usage high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
(
|
||||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
)
|
||||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
* 100
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 85 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "Disk usage above 85%";
|
||||
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# Memory usage > 90%
|
||||
{
|
||||
uid = "nas-memory-high";
|
||||
title = "Memory usage high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 90 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "Memory usage above 90%";
|
||||
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# CPU > 90% sustained for 10m
|
||||
{
|
||||
uid = "nas-cpu-high";
|
||||
title = "CPU usage sustained high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 90 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "10m";
|
||||
annotations = {
|
||||
summary = "CPU sustained above 90%";
|
||||
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
|
||||
{
|
||||
uid = "nas-ups-onbatt";
|
||||
title = "UPS on battery";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "network_ups_tools_ups_status";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
# status 0 = OB (on battery), 1 = OL (online)
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 1 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "1m";
|
||||
annotations = {
|
||||
summary = "UPS is running on battery";
|
||||
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# UPS battery charge < 30%
|
||||
{
|
||||
uid = "nas-ups-lowbatt";
|
||||
title = "UPS battery low";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "network_ups_tools_battery_charge";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 30 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "2m";
|
||||
annotations = {
|
||||
summary = "UPS battery charge below 30%";
|
||||
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# PostgreSQL not responding
|
||||
{
|
||||
uid = "nas-postgres-down";
|
||||
title = "PostgreSQL down";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "pg_up";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 1 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "Alerting";
|
||||
execErrState = "Error";
|
||||
for = "2m";
|
||||
annotations = {
|
||||
summary = "PostgreSQL is down";
|
||||
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
muteTimings.settings = {
|
||||
apiVersion = 1;
|
||||
muteTimes = [ ];
|
||||
};
|
||||
};
|
||||
|
||||
# ── Notification routing policy ─────────────────────────────────────
|
||||
policies.settings = {
|
||||
apiVersion = 1;
|
||||
policies = [
|
||||
{
|
||||
receiver = "ntfy";
|
||||
group_by = [
|
||||
"alertname"
|
||||
"severity"
|
||||
];
|
||||
group_wait = "30s";
|
||||
group_interval = "5m";
|
||||
repeat_interval = "4h";
|
||||
routes = [
|
||||
# Critical alerts: repeat every 1h, no grouping wait
|
||||
{
|
||||
receiver = "ntfy";
|
||||
matchers = [ "severity = critical" ];
|
||||
group_wait = "0s";
|
||||
repeat_interval = "1h";
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
# ── Alert rules ─────────────────────────────────────────────────────
|
||||
rules.settings = {
|
||||
apiVersion = 1;
|
||||
groups = [
|
||||
{
|
||||
name = "nas-system";
|
||||
folder = "NAS Alerts";
|
||||
interval = "1m";
|
||||
rules = [
|
||||
# Disk usage > 85% -> warning (only this threshold is implemented; no > 95% critical rule is defined in this group)
|
||||
{
|
||||
uid = "nas-disk-warning";
|
||||
title = "Disk usage high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
(
|
||||
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
)
|
||||
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
|
||||
* 100
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 85 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "Disk usage above 85%";
|
||||
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# Memory usage > 90%
|
||||
{
|
||||
uid = "nas-memory-high";
|
||||
title = "Memory usage high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 90 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "5m";
|
||||
annotations = {
|
||||
summary = "Memory usage above 90%";
|
||||
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# CPU > 90% sustained for 10m
|
||||
{
|
||||
uid = "nas-cpu-high";
|
||||
title = "CPU usage sustained high";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = ''
|
||||
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
|
||||
'';
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "gt";
|
||||
params = [ 90 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "10m";
|
||||
annotations = {
|
||||
summary = "CPU sustained above 90%";
|
||||
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
|
||||
};
|
||||
labels = {
|
||||
severity = "warning";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
|
||||
{
|
||||
uid = "nas-ups-onbatt";
|
||||
title = "UPS on battery";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "network_ups_tools_ups_status";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
# status 0 = OB (on battery), 1 = OL (online)
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 1 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "1m";
|
||||
annotations = {
|
||||
summary = "UPS is running on battery";
|
||||
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# UPS battery charge < 30%
|
||||
{
|
||||
uid = "nas-ups-lowbatt";
|
||||
title = "UPS battery low";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "network_ups_tools_battery_charge";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 30 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "NoData";
|
||||
execErrState = "Error";
|
||||
for = "2m";
|
||||
annotations = {
|
||||
summary = "UPS battery charge below 30%";
|
||||
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
|
||||
# PostgreSQL not responding
|
||||
{
|
||||
uid = "nas-postgres-down";
|
||||
title = "PostgreSQL down";
|
||||
condition = "C";
|
||||
data = [
|
||||
{
|
||||
refId = "A";
|
||||
datasourceUid = "prometheus";
|
||||
model = {
|
||||
expr = "pg_up";
|
||||
intervalMs = 60000;
|
||||
maxDataPoints = 43200;
|
||||
refId = "A";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "B";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "reduce";
|
||||
refId = "B";
|
||||
expression = "A";
|
||||
reducer = "last";
|
||||
};
|
||||
}
|
||||
{
|
||||
refId = "C";
|
||||
datasourceUid = "__expr__";
|
||||
model = {
|
||||
type = "threshold";
|
||||
refId = "C";
|
||||
expression = "B";
|
||||
conditions = [
|
||||
{
|
||||
evaluator = {
|
||||
type = "lt";
|
||||
params = [ 1 ];
|
||||
};
|
||||
}
|
||||
];
|
||||
};
|
||||
}
|
||||
];
|
||||
noDataState = "Alerting";
|
||||
execErrState = "Error";
|
||||
for = "2m";
|
||||
annotations = {
|
||||
summary = "PostgreSQL is down";
|
||||
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
|
||||
};
|
||||
labels = {
|
||||
severity = "critical";
|
||||
};
|
||||
isPaused = false;
|
||||
}
|
||||
];
|
||||
}
|
||||
];
|
||||
};
|
||||
|
||||
muteTimings.settings = {
|
||||
apiVersion = 1;
|
||||
muteTimes = [ ];
|
||||
};
|
||||
};
|
||||
*/
|
||||
|
||||
dashboards.settings.providers = [
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user