init xrt and fflm

2026-03-25 20:46:42 -05:00
parent 2013804b17
commit ab81e78b60
4 changed files with 404 additions and 28 deletions
--- a/modules/nixos/services/grafana/default.nix
+++ b/modules/nixos/services/grafana/default.nix
@@ -400,34 +400,30 @@ let
              # ntfy via the Grafana webhook contact point.  Grafana POSTs a JSON
              # body; ntfy accepts any body as the message text.  We use the
              # message template below to format it nicely.
-              # Basic auth credentials are read from the SOPS secret at runtime
-              # via Grafana's $__file{} provider.
-              contactPoints.settings = {
-                apiVersion = 1;
-                contactPoints = [
-                  {
-                    name = "ntfy";
-                    receivers = [
-                      {
-                        uid = "ntfy-webhook";
-                        type = "webhook";
-                        settings = {
-                          url = "https://ntfy.mjallen.dev/grafana-alerts";
-                          httpMethod = "POST";
-                          username = "$__file{${config.sops.secrets."jallen-nas/ntfy/user".path}}";
-                          password = "$__file{${config.sops.secrets."jallen-nas/ntfy/password".path}}";
-                          # Pass alert title and state as ntfy headers via the
-                          # custom message template (defined below).
-                          httpHeaders = {
-                            "Tags" = "chart,bell";
-                          };
-                        };
-                        disableResolveMessage = false;
-                      }
-                    ];
-                  }
-                ];
-              };
+              #
+              # Credentials are injected via Grafana's $__env{} provider, which
+              # reads from the process environment.  The GRAFANA_NTFY_USER and
+              # GRAFANA_NTFY_PASSWORD variables are set via the SOPS-managed
+              # grafana.env EnvironmentFile on the grafana.service unit.
+              #
+              # Note: $__file{} only works in grafana.ini settings, not in
+              # provisioning YAML files — using it here causes a parse error.
+              contactPoints.path = pkgs.writeTextDir "contactPoints.yaml" ''
+                apiVersion: 1
+                contactPoints:
+                  - name: ntfy
+                    receivers:
+                      - uid: ntfy-webhook
+                        type: webhook
+                        disableResolveMessage: false
+                        settings:
+                          url: https://ntfy.mjallen.dev/grafana-alerts
+                          httpMethod: POST
+                          username: $__env{GRAFANA_NTFY_USER}
+                          password: $__env{GRAFANA_NTFY_PASSWORD}
+                          httpHeaders:
+                            Tags: "chart,bell"
+              '';

              # ── Notification message template ───────────────────────────────────
              # Grafana sends the rendered template body as the POST body.
@@ -878,6 +874,11 @@ let
        };
      };

+      # Inject ntfy credentials into Grafana's environment so the $__env{}
+      # provider in contactPoints.yaml can resolve them at runtime.
+      # The grafana.env template is managed by SOPS and owned by grafana:grafana.
+      systemd.services.grafana.serviceConfig.EnvironmentFile = config.sops.templates."grafana.env".path;
+
      # The redis exporter needs AF_INET to reach TCP Redis instances.
      # The default systemd hardening only allows AF_UNIX.
      systemd.services.prometheus-redis-exporter.serviceConfig.RestrictAddressFamilies = [
--- a/packages/fastflowlm/default.nix
+++ b/packages/fastflowlm/default.nix
@@ -0,0 +1,248 @@
+{
+  lib,
+  stdenv,
+  fetchFromGitHub,
+  cmake,
+  ninja,
+  pkg-config,
+  rustPlatform,
+  cargo,
+  rustc,
+  # C++ build-time dependencies
+  boost,
+  curl,
+  openssl,
+  fftw,
+  fftwFloat, # fftw3f (single-precision)
+  fftwLongDouble, # fftw3l (long-double-precision)
+  ffmpeg,
+  readline,
+  libdrm,
+  libuuid,
+  # ELF patching for the bundled proprietary .so files
+  autoPatchelfHook,
+  patchelf,
+  gcc-unwrapped,
+  # Access to other flake packages (packages/xrt)
+  pkgs,
+  namespace,
+}:
+
+# FastFlowLM (FLM) — Ollama-style LLM runtime for AMD Ryzen AI (XDNA 2) NPUs.
+#
+# Build overview
+# ==============
+# The repository contains:
+#   src/                  C++20 CMake project → produces the `flm` binary
+#   third_party/
+#     tokenizers-cpp/     git submodule — builds tokenizers_cpp (C++) +
+#                         libtokenizers_c.a (Rust staticlib via cargo)
+#   src/lib/*.so          Proprietary NPU kernel libraries (pre-built, bundled)
+#   src/xclbins/          AIE bitstreams (pre-built, loaded at runtime by .so)
+#   src/model_list.json   Model registry
+#
+# Runtime prerequisites (managed outside this package):
+#   • Linux >= 6.14 with amdxdna in-tree driver, or amdxdna-dkms on older
+#     kernels
+#   • linux-firmware >= 20260221 (NPU firmware >= 1.1.0.0)
+#   • Memlock = unlimited for the FLM process
+#   • packages/xrt (libxrt_coreutil) built and available
+#
+# To update to a new release
+# ==========================
+#  1. Bump `version` below.
+#  2. Update `src.hash` (run: nix-prefetch-git --url ...FastFlowLM --rev v<X>).
+#  3. If the tokenizers-cpp submodule rev changed (check .gitmodules / git
+#     submodule status), update `tokenizersRev` and `tokenizers-cpp-src.hash`:
+#       nix-prefetch-git --url .../tokenizers-cpp --rev <REV> --fetch-submodules
+#  4. Update `cargoVendorDir.hash`: set to lib.fakeHash, run nix build, copy hash.
+
+let
+  version = "0.9.36";
+
+  # XRT userspace runtime — built from packages/xrt in this flake.
+  xrt = pkgs.${namespace}.xrt;
+
+  # ── tokenizers-cpp submodule ──────────────────────────────────────────────
+  # Pinned to the commit referenced in FastFlowLM v0.9.36 .gitmodules.
+  tokenizersRev = "34885cfd7b9ef27b859c28a41e71413dd31926f5";
+
+  tokenizers-cpp-src = fetchFromGitHub {
+    owner = "mlc-ai";
+    repo = "tokenizers-cpp";
+    rev = tokenizersRev;
+    # Includes sentencepiece + msgpack sub-submodules.
+    hash = "sha256-m3A9OhCXJgvvV9UbVL/ijaUC1zkLHlddnQLqZEA5t4w=";
+    fetchSubmodules = true;
+  };
+
+  # Vendor the Rust crates from tokenizers-cpp/rust/Cargo.toml offline.
+  # This fixed-output derivation has network access; everything else is sandboxed.
+  # To compute the hash:  set to lib.fakeHash → nix build → copy printed hash.
+  cargoVendorDir = rustPlatform.fetchCargoVendor {
+    src = tokenizers-cpp-src;
+    sourceRoot = "source/rust";
+    hash = lib.fakeHash; # FIXME: replace after first successful build attempt
+  };
+
+in
+stdenv.mkDerivation rec {
+  pname = "fastflowlm";
+  inherit version;
+
+  src = fetchFromGitHub {
+    owner = "FastFlowLM";
+    repo = "FastFlowLM";
+    rev = "v${version}";
+    # We do NOT fetch submodules here — tokenizers-cpp is injected separately
+    # (above) so that its Rust deps can be vendored in a fixed-output derivation.
+    hash = "sha256-uq/ZxvJA5HTJbMxofO4Hrz7ULvV1fPC7OHRXulMqwqw=";
+  };
+
+  nativeBuildInputs = [
+    cmake
+    ninja
+    pkg-config
+    cargo
+    rustc
+    autoPatchelfHook
+    patchelf
+  ];
+
+  buildInputs = [
+    boost
+    curl
+    openssl
+    fftw
+    fftwFloat
+    fftwLongDouble
+    ffmpeg
+    readline
+    libdrm
+    libuuid
+    xrt
+    # libstdc++ / libgcc_s needed at runtime by the bundled NPU .so files.
+    gcc-unwrapped.lib
+  ];
+
+  # autoPatchelfHook unconditionally adds runtimeDependencies to the RPATH of
+  # executables (no NEEDED entries are added), covering dlopen()ed libraries.
+  runtimeDependencies = [
+    xrt
+    gcc-unwrapped.lib
+    fftw
+    fftwFloat
+    fftwLongDouble
+    ffmpeg
+    curl
+    openssl
+    boost
+    readline
+    libdrm
+  ];
+
+  # CMakeLists.txt lives in src/; the cmake hook resolves cmakeDir from ./build.
+  cmakeDir = "../src";
+
+  preConfigure = ''
+        # ── 1. Populate the tokenizers-cpp submodule directory ───────────────────
+        # CMakeLists.txt references the submodule as:
+        #   add_subdirectory(''${CMAKE_SOURCE_DIR}/../third_party/tokenizers-cpp ...)
+        # The cmake setup hook unpacks sources to $TMPDIR/source; we write the
+        # submodule content there before cmake is invoked.
+        mkdir -p third_party/tokenizers-cpp
+        cp -r --no-preserve=mode,ownership "${tokenizers-cpp-src}/." \
+              third_party/tokenizers-cpp/
+
+        # ── 2. Configure cargo to use the pre-vendored crates (offline) ──────────
+        mkdir -p third_party/tokenizers-cpp/rust/.cargo
+        cat > third_party/tokenizers-cpp/rust/.cargo/config.toml << EOF
+    [source.crates-io]
+    replace-with = "vendored-sources"
+
+    [source.vendored-sources]
+    directory = "${cargoVendorDir}"
+    EOF
+  '';
+
+  cmakeFlags = [
+    # The build system requires these two version strings (checked at configure).
+    "-DFLM_VERSION=${version}"
+    "-DNPU_VERSION=32.0.203.311"
+    "-DCMAKE_BUILD_TYPE=Release"
+    # Override the default XRT install prefix (/opt/xilinx/xrt).
+    "-DXRT_INCLUDE_DIR=${xrt}/include"
+    "-DXRT_LIB_DIR=${xrt}/lib"
+    # xclbins/ path baked into the binary via CMAKE_XCLBIN_PREFIX.
+    "-DCMAKE_XCLBIN_PREFIX=${placeholder "out"}/share/flm"
+  ];
+
+  installPhase = ''
+    runHook preInstall
+
+    cmake --install . --prefix "$out"
+
+    # ── Copy bundled proprietary NPU kernel .so files ─────────────────────────
+    # The upstream CMakeLists installs them via:
+    #   file(GLOB so_libs "''${CMAKE_SOURCE_DIR}/lib/*.so")
+    #   install(FILES ''${so_libs} DESTINATION lib)
+    # and sets RPATH=$ORIGIN/../lib on the flm binary.
+    # We reproduce that layout: $out/lib/lib*.so alongside $out/bin/flm.
+    mkdir -p "$out/lib"
+    for so in "$src/src/lib"/lib*.so; do
+      install -m755 "$so" "$out/lib/"
+    done
+
+    runHook postInstall
+  '';
+
+  # autoPatchelfHook runs automatically and patches the bundled .so files.
+  # We additionally fix the RPATH on the flm binary to include both:
+  #   • $out/lib          (bundled NPU .so files)
+  #   • system libs path  (XRT, ffmpeg, boost, …)
+  postFixup = ''
+    patchelf \
+      --set-rpath "${lib.makeLibraryPath buildInputs}:$out/lib" \
+      "$out/bin/flm"
+  '';
+
+  meta = with lib; {
+    description = "LLM runtime for AMD Ryzen AI XDNA 2 NPUs";
+    longDescription = ''
+      FastFlowLM (FLM) runs large language models on AMD Ryzen AI (XDNA 2)
+      NPU silicon — Strix Point, Strix Halo, Kraken Point, Gorgon Point.
+      It provides an Ollama-compatible REST API (port 52625) and a CLI.
+
+      Models are stored in ~/.config/flm/ by default;
+      override with the FLM_MODEL_PATH environment variable.
+
+      Usage:
+        flm validate           # check NPU driver + firmware health
+        flm run llama3.2:1b    # interactive chat (downloads model on first run)
+        flm serve llama3.2:1b  # OpenAI-compatible server on port 52625
+        flm list               # list available models
+        flm pull <model>       # pre-download a model
+
+      System requirements:
+        • Linux >= 6.14 (amdxdna in-tree) or amdxdna-dkms on older kernels
+        • linux-firmware >= 20260221  (NPU firmware >= 1.1.0.0)
+        • Unlimited memlock for the flm process, e.g. in NixOS:
+            security.pam.loginLimits = [{
+              domain = "*"; type = "-";
+              item = "memlock"; value = "unlimited";
+            }];
+
+      License note: CLI/orchestration code is MIT.  The bundled NPU kernel
+      shared libraries are proprietary (free for commercial use up to
+      USD 10 M annual revenue).  See LICENSE_BINARY.txt upstream.
+    '';
+    homepage = "https://fastflowlm.com";
+    license = with licenses; [
+      mit
+      unfreeRedistributable
+    ];
+    mainProgram = "flm";
+    platforms = [ "x86_64-linux" ];
+    maintainers = [ ];
+  };
+}
--- a/packages/xrt/default.nix
+++ b/packages/xrt/default.nix
@@ -0,0 +1,114 @@
+{
+  lib,
+  stdenv,
+  fetchFromGitHub,
+  cmake,
+  ninja,
+  pkg-config,
+  python3,
+  boost,
+  curl,
+  openssl,
+  systemd,
+  libdrm,
+  ncurses,
+  protobuf,
+  elfutils,
+  zlib,
+  rapidjson,
+  util-linux, # provides libuuid
+  xz, # provides liblzma
+}:
+
+# AMD XRT (Xilinx Runtime) userspace library for NPU (XDNA 2) devices.
+#
+# This package builds the XRT base library from the commit pinned as a
+# submodule in amd/xdna-driver.  It provides:
+#   $out/lib/libxrt_coreutil.so    — core utility library (linked by flm)
+#   $out/lib/libxrt_core.so        — platform-independent core
+#   $out/include/xrt/              — public C++ headers
+#   $out/include/experimental/
+#
+# The xrt source tree lives under the src/ subdirectory of the Xilinx/XRT
+# repository (see src/CMakeLists.txt which includes CMake/nativeLnx.cmake).
+#
+# XRT version 2.19.0 — pinned to the commit used by amd/xdna-driver main
+# as of 2026-03-25 (xrt @ 481583d).
+#
+# Runtime note: this package only provides the userspace library.  The
+# kernel driver (amdxdna.ko) is a separate concern:
+#   • Linux >= 6.14 ships it in-tree (boot.kernelPackages.linux_latest).
+#   • Older kernels can use hardware.amdxdna.enable (once packaged).
+
+stdenv.mkDerivation rec {
+  pname = "xrt";
+  version = "2.19.0";
+
+  src = fetchFromGitHub {
+    owner = "Xilinx";
+    repo = "XRT";
+    rev = "481583db9a26cb506a37cab7f1881ae7c7de2f32";
+    hash = "sha256-WLZDjuuEGd3i77zXpAJkfQy/AszdSQ9pagy64yGX58Q=";
+    fetchSubmodules = false; # XRT submodules are Windows-only tools
+  };
+
+  nativeBuildInputs = [
+    cmake
+    ninja
+    pkg-config
+    python3
+  ];
+
+  buildInputs = [
+    boost
+    curl
+    openssl
+    systemd # for libudev (device enumeration)
+    libdrm
+    ncurses
+    protobuf
+    elfutils # libelf
+    zlib
+    rapidjson
+    util-linux # libuuid
+    xz # liblzma
+  ];
+
+  # XRT's CMakeLists.txt is in src/; cmakeDir is resolved from inside ./build.
+  cmakeDir = "../src";
+
+  cmakeFlags = [
+    "-DCMAKE_BUILD_TYPE=Release"
+    "-DCMAKE_INSTALL_PREFIX=${placeholder "out"}"
+    # Build the NPU/XDNA variant (skips PCIe FPGA-specific components).
+    "-DXRT_NATIVE_BUILD=yes"
+    # Going by the flag name, stops warnings from failing the build (confirm):
+    "-DXRT_ENABLE_WERROR=OFF"
+    # Install libraries to lib/ (some builds default to lib64/).
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+  ];
+
+  # XRT's install target places a setup.sh in the prefix root; we don't need
+  # that for Nix — the binary wrapper / RPATH mechanism handles library lookup.
+  postInstall = ''
+    # Remove the CMake-generated setup.sh — not needed in a Nix env.
+    rm -f "$out"/setup.sh "$out"/setup.csh 2>/dev/null || true
+  '';
+
+  meta = with lib; {
+    description = "AMD XRT (Xilinx Runtime) userspace library for XDNA NPUs";
+    longDescription = ''
+      XRT is the userspace component of AMD's XRT stack for their FPGA and
+      NPU devices.  This package builds only the base library
+      (libxrt_coreutil, libxrt_core) that FastFlowLM links against to
+      communicate with the AMD XDNA 2 NPU via the amdxdna kernel driver.
+
+      The kernel driver (amdxdna.ko) is built in since Linux 6.14.
+      For older kernels it can be loaded via a DKMS package.
+    '';
+    homepage = "https://github.com/Xilinx/XRT";
+    license = licenses.asl20;
+    platforms = [ "x86_64-linux" ];
+    maintainers = [ ];
+  };
+}
--- a/systems/x86_64-linux/jallen-nas/sops.nix
+++ b/systems/x86_64-linux/jallen-nas/sops.nix
@@ -366,6 +366,19 @@ in
        ];
      };

+      # Grafana reads ntfy credentials via systemd EnvironmentFile so the
+      # $__env{} provider works in alerting provisioning YAML.  The file
+      # provider ($__file{}) only works in grafana.ini, not in provisioning.
+      "grafana.env" = {
+        content = ''
+          GRAFANA_NTFY_USER=${config.sops.placeholder."jallen-nas/ntfy/user"}
+          GRAFANA_NTFY_PASSWORD=${config.sops.placeholder."jallen-nas/ntfy/password"}
+        '';
+        mode = "0400";
+        owner = "grafana";
+        restartUnits = [ "grafana.service" ];
+      };
+
      # CrowdSec HTTP notification plugin config with credentials baked in.
      # The plugin process spawned by crowdsec/cscli reads this file directly.
      # Credentials are embedded in the URL using HTTP basic auth so no