This commit is contained in:
mjallen18
2026-03-25 22:24:19 -05:00
parent ab81e78b60
commit e119ffaabb
7 changed files with 1637 additions and 517 deletions

View File

@@ -15,6 +15,7 @@ in
home.packages = with pkgs.kdePackages; [
plasma-browser-integration
kdeplasma-addons
kvantum
];
programs.plasma = {

View File

@@ -0,0 +1,103 @@
{
  lib,
  pkgs,
  config,
  namespace,
  ...
}:
let
  inherit (lib) mkEnableOption mkIf optionals;
  inherit (lib.${namespace}) mkBoolOpt;

  cfg = config.${namespace}.hardware.npu;
in
{
  # AMD XDNA 2 NPU support.
  #
  # Wires up the amdxdna kernel driver together with the XRT userspace
  # runtime (libxrt_coreutil + the XDNA shim plugin) built from packages/xrt.
  #
  # Prerequisites:
  #   * Linux >= 6.14 (amdxdna in-tree) OR linux-firmware >= 20260221
  #     for the NPU firmware blobs. CachyOS kernels >= 6.14 satisfy this.
  #   * AMD XDNA 2 NPU silicon (Strix Point, Strix Halo, Kraken Point,
  #     Gorgon Point — Ryzen AI 300-series and later).
  #
  # Effects when enabled:
  #   1. xrt (libxrt_coreutil, libxrt_driver_xdna, xrt-smi) from the local
  #      flake package is added to the system environment.
  #   2. The amdxdna kernel driver is requested at boot.
  #   3. The per-process memlock limit is raised (needed for NPU DMA buffers).
  #   4. fastflowlm is optionally installed system-wide.
  #
  # Example (NixOS config):
  #   ${namespace}.hardware.npu.enable = true;
  #   # Enable FLM system-wide if you also run the lemonade service:
  #   ${namespace}.hardware.npu.fastflowlm.enable = true;
  options.${namespace}.hardware.npu = {
    enable = mkEnableOption "AMD XDNA 2 NPU support (XRT + amdxdna driver)";

    fastflowlm.enable = mkBoolOpt false ''
      Install FastFlowLM (flm) system-wide.
      FastFlowLM runs LLMs directly on the AMD XDNA 2 NPU.
      Enable this when you also run the lemonade service with an NPU backend,
      or want standalone `flm` access.
    '';
  };

  config = mkIf cfg.enable {
    # The XDNA driver and XRT builds only exist for x86_64.
    assertions = [
      {
        assertion = pkgs.stdenv.hostPlatform.isx86_64;
        message = "${namespace}.hardware.npu: AMD XDNA NPU support is only available on x86_64-linux.";
      }
    ];

    # Kernel driver: amdxdna has been in-tree since Linux 6.14. On older
    # kernels (e.g. CachyOS 6.12/6.13) this explicit load request picks up
    # the DKMS module when it is installed; if the driver is already
    # built-in, the request is silently ignored.
    boot.kernelModules = [ "amdxdna" ];

    # XRT userspace runtime, plus flm when requested.
    environment.systemPackages =
      [ pkgs.${namespace}.xrt ]
      ++ optionals cfg.fastflowlm.enable [ pkgs.${namespace}.fastflowlm ];

    # NPU workloads lock large memory regions for DMA. Without an unlimited
    # memlock limit the NPU refuses to allocate buffers.
    security.pam.loginLimits = [
      {
        domain = "*";
        type = "-";
        item = "memlock";
        value = "unlimited";
      }
    ];

    # The PAM limit above does not cover systemd services (e.g. lemonade,
    # fastflowlm) — they would need LimitMEMLOCK in their unit. Setting a
    # manager-wide default gives every service unlimited memlock unless a
    # unit explicitly overrides it.
    systemd.settings.Manager.DefaultLimitMEMLOCK = "infinity";

    # amdxdna exposes the NPU as /dev/accel/accel0 (DRM accelerator device);
    # this udev rule lets members of the "render" group open it without root.
    services.udev.extraRules = ''
      # AMD XDNA 2 NPU grant access to the render group
      SUBSYSTEM=="accel", KERNEL=="accel*", GROUP="render", MODE="0660"
    '';

    # Make sure the render group exists.
    users.groups.render = { };

    # Firmware: the NPU blobs ship in linux-firmware >= 20260221.
    # hardware.enableAllFirmware (set by modules/nixos/hardware/common)
    # already pulls in the full firmware set; listing it here is an explicit
    # belt-and-braces measure.
    hardware.firmware = [ pkgs.linux-firmware ];
  };
}

View File

@@ -395,460 +395,463 @@ let
# ---------------------------------------------------------------------------
# Alerting provisioning
# ---------------------------------------------------------------------------
alerting = {
# ── Contact points ──────────────────────────────────────────────────
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
# body; ntfy accepts any body as the message text. We use the
# message template below to format it nicely.
#
# Credentials are injected via Grafana's $__env{} provider, which
# reads from the process environment. The GRAFANA_NTFY_USER and
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
# grafana.env EnvironmentFile on the grafana.service unit.
#
# Note: $__file{} only works in grafana.ini settings, not in
# provisioning YAML files — using it here causes a parse error.
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
apiVersion: 1
contactPoints:
- name: ntfy
receivers:
- uid: ntfy-webhook
type: webhook
disableResolveMessage: false
settings:
url: https://ntfy.mjallen.dev/grafana-alerts
httpMethod: POST
username: $__env{GRAFANA_NTFY_USER}
password: $__env{GRAFANA_NTFY_PASSWORD}
httpHeaders:
Tags: "chart,bell"
'';
# TEMPORARILY DISABLED - template format incompatible with Grafana 12
/*
alerting = {
# Contact points
# ntfy via the Grafana webhook contact point. Grafana POSTs a JSON
# body; ntfy accepts any body as the message text. We use the
# message template below to format it nicely.
#
# Credentials are injected via Grafana's $__env{} provider, which
# reads from the process environment. The GRAFANA_NTFY_USER and
# GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
# grafana.env EnvironmentFile on the grafana.service unit.
#
# Note: $__file{} only works in grafana.ini settings, not in
# provisioning YAML files using it here causes a parse error.
contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
apiVersion: 1
contactPoints:
- name: ntfy
receivers:
- uid: ntfy-webhook
type: webhook
disableResolveMessage: false
settings:
url: https://ntfy.mjallen.dev/grafana-alerts
httpMethod: POST
username: $__env{GRAFANA_NTFY_USER}
password: $__env{GRAFANA_NTFY_PASSWORD}
httpHeaders:
Tags: "chart,bell"
'';
# ── Notification message template ───────────────────────────────────
# Grafana sends the rendered template body as the POST body.
# ntfy treats the body as the message text.
templates.settings = {
apiVersion = 1;
templates = [
{
name = "ntfy_message";
template = ''
{{ define "ntfy_message" -}}
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
{{ range .Alerts -}}
Status: {{ .Status | title }}
Alert: {{ .Labels.alertname }}
Severity: {{ .Labels.severity | default "unknown" }}
Instance: {{ .Labels.instance | default "unknown" }}
{{ if .Annotations.description -}}
Details: {{ .Annotations.description }}
{{ end -}}
{{ end -}}
{{ end }}
'';
}
];
# Notification message template
# Grafana sends the rendered template body as the POST body.
# ntfy treats the body as the message text.
templates.settings = {
apiVersion = 1;
templates = [
{
name = "ntfy_message";
template = ''
{{ define "ntfy_message" -}}
{{ .CommonAnnotations.summary | default .GroupLabels.alertname }}
{{ range .Alerts -}}
Status: {{ .Status | title }}
Alert: {{ .Labels.alertname }}
Severity: {{ .Labels.severity | default "unknown" }}
Instance: {{ .Labels.instance | default "unknown" }}
{{ if .Annotations.description -}}
Details: {{ .Annotations.description }}
{{ end -}}
{{ end -}}
{{ end }}
'';
}
];
};
# Notification routing policy
policies.settings = {
apiVersion = 1;
policies = [
{
receiver = "ntfy";
group_by = [
"alertname"
"severity"
];
group_wait = "30s";
group_interval = "5m";
repeat_interval = "4h";
routes = [
# Critical alerts: repeat every 1h, no grouping wait
{
receiver = "ntfy";
matchers = [ "severity = critical" ];
group_wait = "0s";
repeat_interval = "1h";
}
];
}
];
};
# Alert rules
rules.settings = {
apiVersion = 1;
groups = [
{
name = "nas-system";
folder = "NAS Alerts";
interval = "1m";
rules = [
# Disk usage > 85% warning, > 95% critical
{
uid = "nas-disk-warning";
title = "Disk usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
)
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
* 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 85 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Disk usage above 85%";
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# Memory usage > 90%
{
uid = "nas-memory-high";
title = "Memory usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Memory usage above 90%";
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# CPU > 90% sustained for 10m
{
uid = "nas-cpu-high";
title = "CPU usage sustained high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "10m";
annotations = {
summary = "CPU sustained above 90%";
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
{
uid = "nas-ups-onbatt";
title = "UPS on battery";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_ups_status";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
# status 0 = OB (on battery), 1 = OL (online)
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "1m";
annotations = {
summary = "UPS is running on battery";
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# UPS battery charge < 30%
{
uid = "nas-ups-lowbatt";
title = "UPS battery low";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_battery_charge";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 30 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "2m";
annotations = {
summary = "UPS battery charge below 30%";
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# PostgreSQL not responding
{
uid = "nas-postgres-down";
title = "PostgreSQL down";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "pg_up";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "Alerting";
execErrState = "Error";
for = "2m";
annotations = {
summary = "PostgreSQL is down";
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
];
}
];
};
muteTimings.settings = {
apiVersion = 1;
muteTimes = [ ];
};
};
# ── Notification routing policy ─────────────────────────────────────
policies.settings = {
apiVersion = 1;
policies = [
{
receiver = "ntfy";
group_by = [
"alertname"
"severity"
];
group_wait = "30s";
group_interval = "5m";
repeat_interval = "4h";
routes = [
# Critical alerts: repeat every 1h, no grouping wait
{
receiver = "ntfy";
matchers = [ "severity = critical" ];
group_wait = "0s";
repeat_interval = "1h";
}
];
}
];
};
# ── Alert rules ─────────────────────────────────────────────────────
rules.settings = {
apiVersion = 1;
groups = [
{
name = "nas-system";
folder = "NAS Alerts";
interval = "1m";
rules = [
# Disk usage > 85% warning, > 95% critical
{
uid = "nas-disk-warning";
title = "Disk usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(
node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
- node_filesystem_avail_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
)
/ node_filesystem_size_bytes{fstype!~"tmpfs|overlay|squashfs",mountpoint!~"/boot.*"}
* 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 85 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Disk usage above 85%";
description = "Filesystem {{ $labels.mountpoint }} is {{ $values.B | printf \"%.1f\" }}% full.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# Memory usage > 90%
{
uid = "nas-memory-high";
title = "Memory usage high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "5m";
annotations = {
summary = "Memory usage above 90%";
description = "Memory usage is {{ $values.B | printf \"%.1f\" }}%.";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# CPU > 90% sustained for 10m
{
uid = "nas-cpu-high";
title = "CPU usage sustained high";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = ''
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
'';
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "gt";
params = [ 90 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "10m";
annotations = {
summary = "CPU sustained above 90%";
description = "CPU usage has been above 90% for 10 minutes (currently {{ $values.B | printf \"%.1f\" }}%).";
};
labels = {
severity = "warning";
};
isPaused = false;
}
# UPS on battery (network_ups_tools_ups_status == 0 means OB/on-battery)
{
uid = "nas-ups-onbatt";
title = "UPS on battery";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_ups_status";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
# status 0 = OB (on battery), 1 = OL (online)
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "1m";
annotations = {
summary = "UPS is running on battery";
description = "Mains power failure detected. UPS battery charge: {{ with query \"network_ups_tools_battery_charge\" }}{{ . | first | value | printf \"%.0f\" }}%{{ end }}.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# UPS battery charge < 30%
{
uid = "nas-ups-lowbatt";
title = "UPS battery low";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "network_ups_tools_battery_charge";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 30 ];
};
}
];
};
}
];
noDataState = "NoData";
execErrState = "Error";
for = "2m";
annotations = {
summary = "UPS battery charge below 30%";
description = "UPS battery is at {{ $values.B | printf \"%.0f\" }}%. Shutdown may be imminent.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
# PostgreSQL not responding
{
uid = "nas-postgres-down";
title = "PostgreSQL down";
condition = "C";
data = [
{
refId = "A";
datasourceUid = "prometheus";
model = {
expr = "pg_up";
intervalMs = 60000;
maxDataPoints = 43200;
refId = "A";
};
}
{
refId = "B";
datasourceUid = "__expr__";
model = {
type = "reduce";
refId = "B";
expression = "A";
reducer = "last";
};
}
{
refId = "C";
datasourceUid = "__expr__";
model = {
type = "threshold";
refId = "C";
expression = "B";
conditions = [
{
evaluator = {
type = "lt";
params = [ 1 ];
};
}
];
};
}
];
noDataState = "Alerting";
execErrState = "Error";
for = "2m";
annotations = {
summary = "PostgreSQL is down";
description = "The PostgreSQL exporter reports pg_up=0. Database may be unavailable.";
};
labels = {
severity = "critical";
};
isPaused = false;
}
];
}
];
};
muteTimings.settings = {
apiVersion = 1;
muteTimes = [ ];
};
};
*/
dashboards.settings.providers = [
{