Merge branch 'master' of https://gitea.thecrazyinsanity.win/TheCrazyInsanity/nixos-configv3

2026-03-13 16:22:07 +00:00
parent 000ab8fa93 5c1ef72f19
commit 67c9ce550d
9 changed files with 214 additions and 34 deletions
--- a/machines/homepc/configuration.nix
+++ b/machines/homepc/configuration.nix
@@ -39,10 +39,81 @@
  # };

  # This is here because I don't have another computer that could run local AI, and regardless the packages would be different on every one.
+  # TODO: I currently only have one PC that can run local AI, but that might change in the future,
+  # and this is getting a bit complicated for a single-PC config.
+  # It should be moved into its own module.
  environment.systemPackages = with pkgs; [
    ollama-cuda
+    opencode
+    llama-cpp
+    llama-swap
  ];

+  services.ollama = {
+    enable = true;
+    package = pkgs.ollama-cuda;
+    environmentVariables = {
+      OLLAMA_NUM_PARALLEL = "1";
+      OLLAMA_FLASH_ATTENTION = "1";
+      OLLAMA_KV_CACHE_TYPE = "q4_0";
+      OLLAMA_CONTEXT_LENGTH = "16384";
+    };
+  };
+
+  # Configure llama-swap as a systemd service
+  systemd.services.llama-swap = {
+    description = "llama-swap - OpenAI compatible proxy with automatic model swapping";
+    after = [ "network.target" ];
+    wantedBy = [ "multi-user.target" ];
+
+    serviceConfig = {
+      Type = "simple";
+      User = "laythe";
+      Group = "users";
+      # Load the declarative config installed at /etc/llama-swap/config.yaml by environment.etc below
+      ExecStart = "${pkgs.llama-swap}/bin/llama-swap --config /etc/llama-swap/config.yaml --listen 0.0.0.0:9292 --watch-config";
+      Restart = "always";
+      RestartSec = 10;
+
+      # PATH lets the bare `llama-server` commands in the config resolve; LD_LIBRARY_PATH exposes the GPU driver libraries for CUDA
+      Environment = [
+        "PATH=/run/current-system/sw/bin"
+        "LD_LIBRARY_PATH=/run/opengl-driver/lib:/run/opengl-driver-32/lib"
+      ];
+    };
+  };
+
+  # Declarative model list for llama-swap: llama-server's -hf flag downloads each model on first use if it is not already cached.
+  environment.etc."llama-swap/config.yaml".text = ''
+    models:
+      "Qwen3.5-35B-A3B-GGUF":
+        cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_S --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1  --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on  --slots --slot-save-path /home/laythe/llamapcache  --jinja -kvu --no-mmproj --swa-checkpoints 32
+        ttl: 2400
+      "Qwen3-1.7B-GGUF":
+        cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-1.7B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu
+        ttl: 120
+      "Qwen3-8B-GGUF":
+        cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-8B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1
+        ttl: 120
+      "Qwen3-4B-Claude-Opus-Distill":
+        cmd: llama-server --port ''${PORT} -hf TeichAI/Qwen3-4B-Thinking-2507-Claude-4.5-Opus-High-Reasoning-Distill-GGUF:Q4_K_M --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256
+        ttl: 120
+      "Qwen3.5-9B-Thinking":
+        cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M --ctx-size 32000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --chat-template-kwargs '{\"enable_thinking\": true}' --no-mmproj"
+        ttl: 120
+      "Qwen3.5-4B-Thinking":
+        cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --chat-template-kwargs '{\"enable_thinking\": true}'"
+        ttl: 120
+      "Qwen3.5-9B-Non-Thinking":
+        cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M --ctx-size 32000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --no-mmproj"
+        ttl: 120
+      "Qwen3.5-4B-Non-Thinking":
+        cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256"
+        ttl: 120
+      "Qwen3.5-0.8B-Non-Thinking":
+        cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256"
+        ttl: 120'';
+
  # Set your time zone.
  # time.timeZone = "Europe/Amsterdam";

@@ -113,7 +184,7 @@
  # networking.firewall.allowedTCPPorts = [ ... ];
  # networking.firewall.allowedUDPPorts = [ ... ];
  # Or disable the firewall altogether.
-  # networking.firewall.enable = false;
+  networking.firewall.enable = false;

  # Copy the NixOS configuration file and link it from the resulting system
  # (/run/current-system/configuration.nix). This is useful in case you