# nix-config/packages/fastflowlm/default.nix
# Snapshot: 2026-03-25 20:46:42 -05:00 — 249 lines, 8.0 KiB, Nix
{
  lib,
  stdenv,
  fetchFromGitHub,
  # Build tooling
  cmake,
  ninja,
  pkg-config,
  # Rust toolchain — tokenizers-cpp builds libtokenizers_c.a via cargo
  rustPlatform,
  cargo,
  rustc,
  # C++ link-time dependencies
  boost,
  curl,
  openssl,
  fftw,
  fftwFloat, # fftw3f (single-precision)
  fftwLongDouble, # fftw3l (long-double-precision)
  ffmpeg,
  readline,
  libdrm,
  libuuid,
  # ELF patching for the bundled proprietary .so files
  autoPatchelfHook,
  patchelf,
  gcc-unwrapped,
  # Flake plumbing: access to sibling flake packages (packages/xrt)
  pkgs,
  namespace,
}:
# FastFlowLM (FLM) — Ollama-style LLM runtime for AMD Ryzen AI (XDNA 2) NPUs.
#
# Build overview
# ==============
# The repository contains:
# src/ C++20 CMake project → produces the `flm` binary
# third_party/
# tokenizers-cpp/ git submodule — builds tokenizers_cpp (C++) +
# libtokenizers_c.a (Rust staticlib via cargo)
# src/lib/*.so Proprietary NPU kernel libraries (pre-built, bundled)
# src/xclbins/ AIE bitstreams (pre-built, loaded at runtime by .so)
# src/model_list.json Model registry
#
# Runtime prerequisites (managed outside this package):
# • Linux >= 6.14 with amdxdna in-tree driver, or amdxdna-dkms on older
# kernels
# • linux-firmware >= 20260221 (NPU firmware >= 1.1.0.0)
# • Memlock = unlimited for the FLM process
# • packages/xrt (libxrt_coreutil) built and available
#
# To update to a new release
# ==========================
# 1. Bump `version` below.
# 2. Update `srcHash` (run: nix-prefetch-git --url ...FastFlowLM --rev v<X>).
# 3. If the tokenizers-cpp submodule rev changed (check .gitmodules / git
# submodule status), update `tokenizersRev` and `tokenizersHash`:
# nix-prefetch-git --url .../tokenizers-cpp --rev <REV> --fetch-submodules
# 4. Update `cargoVendorHash`: set to lib.fakeHash, run nix build, copy hash.
let
  version = "0.9.36";

  # XRT userspace runtime — built from packages/xrt in this flake.
  xrt = pkgs.${namespace}.xrt;

  # ── tokenizers-cpp submodule ──────────────────────────────────────────────
  # Pinned to the commit referenced by FastFlowLM v0.9.36 .gitmodules.
  tokenizersRev = "34885cfd7b9ef27b859c28a41e71413dd31926f5";

  tokenizers-cpp-src = fetchFromGitHub {
    owner = "mlc-ai";
    repo = "tokenizers-cpp";
    rev = tokenizersRev;
    # Also pulls in the sentencepiece + msgpack sub-submodules.
    hash = "sha256-m3A9OhCXJgvvV9UbVL/ijaUC1zkLHlddnQLqZEA5t4w=";
    fetchSubmodules = true;
  };

  # Offline vendor tree for the crates in tokenizers-cpp/rust/Cargo.toml.
  # As a fixed-output derivation this step alone gets network access; the
  # rest of the build stays fully sandboxed.
  # To compute the hash: leave lib.fakeHash, run `nix build`, copy the
  # hash printed in the mismatch error.
  cargoVendorDir = rustPlatform.fetchCargoVendor {
    src = tokenizers-cpp-src;
    sourceRoot = "source/rust";
    hash = lib.fakeHash; # FIXME: replace after first successful build attempt
  };
in
stdenv.mkDerivation rec {
  pname = "fastflowlm";
  inherit version;

  src = fetchFromGitHub {
    owner = "FastFlowLM";
    repo = "FastFlowLM";
    rev = "v${version}";
    # Submodules are deliberately NOT fetched here — tokenizers-cpp is
    # injected separately (see the let block) so that its Rust dependencies
    # can be vendored in a fixed-output derivation.
    hash = "sha256-uq/ZxvJA5HTJbMxofO4Hrz7ULvV1fPC7OHRXulMqwqw=";
  };

  nativeBuildInputs = [
    cmake
    ninja
    pkg-config
    cargo
    rustc
    autoPatchelfHook
    patchelf
  ];

  buildInputs = [
    boost
    curl
    openssl
    fftw
    fftwFloat
    fftwLongDouble
    ffmpeg
    readline
    libdrm
    libuuid
    xrt
    # libstdc++ / libgcc_s needed at runtime by the bundled NPU .so files.
    gcc-unwrapped.lib
  ];

  # autoPatchelfHook adds these to the RPATH of patched ELF files, covering
  # libraries that the bundled .so files load at runtime.
  runtimeDependencies = [
    xrt
    gcc-unwrapped.lib
    fftw
    fftwFloat
    fftwLongDouble
    ffmpeg
    curl
    openssl
    boost
    readline
    libdrm
  ];

  # CMakeLists.txt lives in src/, not the repo root.
  cmakeDir = "src";

  preConfigure = ''
    # 1. Populate the tokenizers-cpp submodule directory.
    #    CMakeLists.txt references the submodule as
    #      add_subdirectory(''${CMAKE_SOURCE_DIR}/../third_party/tokenizers-cpp ...)
    #    The cmake setup hook unpacks sources to $TMPDIR/source; we write the
    #    submodule content there before cmake is invoked.
    mkdir -p third_party/tokenizers-cpp
    cp -r --no-preserve=mode,ownership "${tokenizers-cpp-src}/." \
      third_party/tokenizers-cpp/

    # 2. Configure cargo to use the pre-vendored crates (offline build).
    mkdir -p third_party/tokenizers-cpp/rust/.cargo
    cat > third_party/tokenizers-cpp/rust/.cargo/config.toml << EOF
    [source.crates-io]
    replace-with = "vendored-sources"
    [source.vendored-sources]
    directory = "${cargoVendorDir}"
    EOF
  '';

  cmakeFlags = [
    # The build system requires these two version strings (checked at configure).
    "-DFLM_VERSION=${version}"
    "-DNPU_VERSION=32.0.203.311"
    "-DCMAKE_BUILD_TYPE=Release"
    # Override the default XRT install prefix (/opt/xilinx/xrt).
    "-DXRT_INCLUDE_DIR=${xrt}/include"
    "-DXRT_LIB_DIR=${xrt}/lib"
    # xclbins/ path baked into the binary via CMAKE_XCLBIN_PREFIX.
    "-DCMAKE_XCLBIN_PREFIX=${placeholder "out"}/share/flm"
  ];

  installPhase = ''
    runHook preInstall

    cmake --install . --prefix "$out"

    # Reproduce upstream's layout for the bundled proprietary NPU kernel
    # libraries. Upstream CMakeLists installs them via
    #   file(GLOB so_libs "''${CMAKE_SOURCE_DIR}/lib/*.so")
    #   install(FILES ''${so_libs} DESTINATION lib)
    # and sets RPATH=$ORIGIN/../lib on the flm binary, i.e. the .so files
    # must live in $out/lib alongside $out/bin/flm.
    mkdir -p "$out/lib"
    for so in "$src/src/lib"/lib*.so; do
      install -m755 "$so" "$out/lib/"
    done

    runHook postInstall
  '';

  # autoPatchelfHook patches the bundled .so files automatically. On top of
  # that, pin an explicit RPATH on the flm binary covering both the bundled
  # NPU libraries ($out/lib) and all regular library dependencies.
  # NOTE(review): autoPatchelf's own fixup also touches ELF files in $out;
  # this assumes the explicit --set-rpath and the hook run in a compatible
  # order — verify if RPATH issues appear at runtime.
  postFixup = ''
    patchelf \
      --set-rpath "${lib.makeLibraryPath buildInputs}:$out/lib" \
      "$out/bin/flm"
  '';

  meta = with lib; {
    description = "LLM runtime for AMD Ryzen AI XDNA 2 NPUs";
    longDescription = ''
      FastFlowLM (FLM) runs large language models on AMD Ryzen AI (XDNA 2)
      NPU silicon Strix Point, Strix Halo, Kraken Point, Gorgon Point.
      It provides an Ollama-compatible REST API (port 52625) and a CLI.
      Models are stored in ~/.config/flm/ by default;
      override with the FLM_MODEL_PATH environment variable.
      Usage:
      flm validate # check NPU driver + firmware health
      flm run llama3.2:1b # interactive chat (downloads model on first run)
      flm serve llama3.2:1b # OpenAI-compatible server on port 52625
      flm list # list available models
      flm pull <model> # pre-download a model
      System requirements:
      Linux >= 6.14 (amdxdna in-tree) or amdxdna-dkms on older kernels
      linux-firmware >= 20260221 (NPU firmware >= 1.1.0.0)
      Unlimited memlock for the flm process, e.g. in NixOS:
      security.pam.loginLimits = [{
      domain = "*"; type = "-";
      item = "memlock"; value = "unlimited";
      }];
      License note: CLI/orchestration code is MIT. The bundled NPU kernel
      shared libraries are proprietary (free for commercial use up to
      USD 10 M annual revenue). See LICENSE_BINARY.txt upstream.
    '';
    homepage = "https://fastflowlm.com";
    # Dual situation: the orchestration source is MIT, the bundled NPU
    # kernel .so files are proprietary-but-redistributable.
    license = with licenses; [
      mit
      unfreeRedistributable
    ];
    # flm itself is compiled from source, but the package also ships the
    # prebuilt proprietary kernel libraries from src/lib/ — declare both so
    # users relying on source-provenance filtering see the binary blobs.
    sourceProvenance = with sourceTypes; [
      fromSource
      binaryNativeCode
    ];
    mainProgram = "flm";
    platforms = [ "x86_64-linux" ];
    maintainers = [ ];
  };
}