Merge branch 'master' of https://gitea.thecrazyinsanity.win/TheCrazyInsanity/nixos-configv3
This commit is contained in:
@@ -39,10 +39,81 @@
|
||||
# };
|
||||
|
||||
# This is here because I don't have another computer that could run local AI, and regardless the packages would be different on every one.
|
||||
# TODO: While I currently only have one PC that can run local AI, that might change in the future.
|
||||
# And this config is getting a bit complicated for a single pc config
|
||||
# Should be moved to its own module.
|
||||
# Packages installed system-wide for running local AI models.
# NOTE(review): the llama-swap unit in this file sets PATH=/run/current-system/sw/bin and its
# config invokes `llama-server`, so that binary presumably comes from llama-cpp here — confirm.
environment.systemPackages = with pkgs; [
|
||||
ollama-cuda
|
||||
opencode
|
||||
llama-cpp
|
||||
llama-swap
|
||||
];
|
||||
|
||||
# Enable the Ollama service, using the same ollama-cuda package that is listed in
# environment.systemPackages, and pass tuning knobs to it as environment variables.
services.ollama = {
|
||||
enable = true;
|
||||
package = pkgs.ollama-cuda;
|
||||
# All values are strings because they are exported as environment variables.
# NOTE(review): meanings below are taken from the variable names / Ollama docs — confirm
# against the Ollama version in use.
environmentVariables = {
|
||||
# Presumably limits each loaded model to one request at a time.
OLLAMA_NUM_PARALLEL = "1";
|
||||
# Presumably turns flash attention on.
OLLAMA_FLASH_ATTENTION = "1";
|
||||
# Presumably selects a 4-bit quantized KV cache.
OLLAMA_KV_CACHE_TYPE = "q4_0";
|
||||
# Presumably the default context window, in tokens.
OLLAMA_CONTEXT_LENGTH = "16384";
|
||||
};
|
||||
};
|
||||
|
||||
# Configure llama-swap as a systemd service
|
||||
# systemd unit that runs llama-swap as user "laythe", started with multi-user.target
# and restarted automatically whenever it exits.
systemd.services.llama-swap = {
|
||||
description = "llama-swap - OpenAI compatible proxy with automatic model swapping";
|
||||
after = [ "network.target" ];
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "simple";
|
||||
User = "laythe";
|
||||
Group = "users";
|
||||
# Uses the config file generated by environment.etc."llama-swap/config.yaml" in this file
# and listens on every interface (0.0.0.0) on port 9292.
# NOTE(review): networking.firewall.enable is set to false further down in this file, so
# this port is reachable from the network — confirm that is intended.
# NOTE(review): --watch-config presumably reloads the config when the file changes — confirm.
|
||||
ExecStart = "${pkgs.llama-swap}/bin/llama-swap --config /etc/llama-swap/config.yaml --listen 0.0.0.0:9292 --watch-config";
|
||||
# Restart on any exit, waiting 10 seconds between attempts.
Restart = "always";
|
||||
RestartSec = 10;
|
||||
|
||||
# Environment for CUDA support: PATH lets the commands in config.yaml (llama-server)
# resolve from the system profile; LD_LIBRARY_PATH points at the /run/opengl-driver
# library directories (presumably where the GPU driver libraries live — confirm).
|
||||
Environment = [
|
||||
"PATH=/run/current-system/sw/bin"
|
||||
"LD_LIBRARY_PATH=/run/opengl-driver/lib:/run/opengl-driver-32/lib"
|
||||
];
|
||||
};
|
||||
};
|
||||
|
||||
# As long as this is here the models are declarative. llama-server will grab them if not downloaded already.
|
||||
# Writes /etc/llama-swap/config.yaml, the file the llama-swap unit passes via --config.
# Each entry under `models:` gives the llama-server command line for one model plus a `ttl`.
# `''${PORT}` is the Nix indented-string escape for a literal `${PORT}`, so the placeholder
# reaches the YAML untouched instead of being interpolated by Nix.
# NOTE(review): presumably llama-swap fills in ${PORT} itself and `ttl` is an idle timeout
# in seconds — confirm against the llama-swap documentation.
environment.etc."llama-swap/config.yaml".text = ''
|
||||
models:
|
||||
"Qwen3.5-35B-A3B-GGUF":
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_S --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32
|
||||
ttl: 2400
|
||||
"Qwen3-1.7B-GGUF":
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-1.7B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu
|
||||
ttl: 120
|
||||
"Qwen3-8B-GGUF":
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-8B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1
|
||||
ttl: 120
|
||||
"Qwen3-4B-Claude-Opus-Distill":
|
||||
cmd: llama-server --port ''${PORT} -hf TeichAI/Qwen3-4B-Thinking-2507-Claude-4.5-Opus-High-Reasoning-Distill-GGUF:Q4_K_M --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256
|
||||
ttl: 120
|
||||
"Qwen3.5-9B-Thinking":
|
||||
cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M --ctx-size 32000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --chat-template-kwargs '{\"enable_thinking\": true}' --no-mmproj"
|
||||
ttl: 120
|
||||
"Qwen3.5-4B-Thinking":
|
||||
cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --chat-template-kwargs '{\"enable_thinking\": true}'"
|
||||
ttl: 120
|
||||
"Qwen3.5-9B-Non-Thinking":
|
||||
cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M --ctx-size 32000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256 --no-mmproj"
|
||||
ttl: 120
|
||||
"Qwen3.5-4B-Non-Thinking":
|
||||
cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256"
|
||||
ttl: 120
|
||||
"Qwen3.5-0.8B-Non-Thinking":
|
||||
cmd: "llama-server --port ''${PORT} -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_M --ctx-size 64000 --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0.00 -fa on --jinja -kvu -np 1 --fit-target 256"
|
||||
ttl: 120'';
|
||||
|
||||
# Set your time zone.
|
||||
# time.timeZone = "Europe/Amsterdam";
|
||||
|
||||
@@ -113,7 +184,7 @@
|
||||
# networking.firewall.allowedTCPPorts = [ ... ];
|
||||
# networking.firewall.allowedUDPPorts = [ ... ];
|
||||
# Or disable the firewall altogether.
|
||||
# networking.firewall.enable = false;
|
||||
# The firewall is turned off entirely.
# NOTE(review): the llama-swap service in this file listens on 0.0.0.0:9292, so with the
# firewall off it is reachable from any network this machine is on — confirm that is
# intended, or re-enable the firewall and open only the required ports.
networking.firewall.enable = false;
|
||||
|
||||
# Copy the NixOS configuration file and link it from the resulting system
|
||||
# (/run/current-system/configuration.nix). This is useful in case you
|
||||
|
||||
Reference in New Issue
Block a user