merge conflicts galore

This commit is contained in:
2026-03-13 18:42:58 -04:00
parent 5c1ef72f19
commit b7e89f4da4
2 changed files with 30 additions and 7 deletions

View File

@@ -33,13 +33,13 @@
blasSupport = true;
}).overrideAttrs
(oldAttrs: rec {
version = "8184";
version = "8209";
src = pkgs.fetchFromGitHub {
owner = "aagit";
owner = "ggml-org";
repo = "llama.cpp";
# tag = "b${version}";
rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970
hash = "sha256-xryajW0Cs1d+WDijspMTW21FDaZP9Grkb+uErMQCQ48=";
tag = "b${version}";
# rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970
hash = "sha256-7z9mQZ/hgNS+doLCVPtax+FBhr6dEfmR9wZJTwtl/pM=";
leaveDotGit = true;
postFetch = ''
git -C "$out" rev-parse --short HEAD > $out/COMMIT
@@ -54,7 +54,30 @@
# for reproducible builds). We sacrifice portability for faster CPU layers.
cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
# Extra CMake options for the llama.cpp build. They are appended to whatever
# cmakeFlags the overridden package already sets (empty list if it sets none).

# --- Host-specific tuning ---
# NOTE(review): with GGML_NATIVE=ON the compiler presumably already targets the
# build host's instruction set, so the explicit AVX/AVX512/AMX toggles further
# down may be redundant or ignored — confirm against ggml's CMake options.
"-DGGML_NATIVE=ON"
# NOTE(review): CMAKE_CUDA_ARCHITECTURES=86 appears twice in this listing (here
# and two entries below, with a shorter comment); presumably these are the old
# and new sides of a diff — only one entry is needed.
"-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090 - needed since sandbox has no GPU
"-DGGML_LTO=ON" # Link Time Optimization for overall binary speed
"-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090

# --- CUDA backend ---
"-DGGML_CUDA=ON"
"-DGGML_CUDA_FA=ON" # FlashAttention kernels (accelerated attention)
"-DGGML_CUDA_FA_ALL_QUANTS=ON" # Support for all KV cache quant types in FA
"-DGGML_CUDA_GRAPHS=ON" # CUDA Graphs for lower overhead inference
# NOTE(review): presumably forces cuBLAS over the custom quantized matmul
# kernels — verify this is actually faster for the quantized models in use.
"-DGGML_CUDA_FORCE_CUBLAS=ON" # cuBLAS optimized prompt processing for Ampere+
"-DGGML_CUDA_PEER_MAX_BATCH_SIZE=256" # Increased for multi-GPU efficiency (split mode)
"-DGGML_CUDA_COMPRESSION_MODE=speed" # Fast binary loading (CUDA 12.8+)

# --- CPU backend ---
"-DGGML_OPENMP=ON" # Optimal multi-threading on CPU
"-DGGML_LLAMAFILE=ON" # Use llamafile sgemm for faster CPU layers
"-DGGML_CPU_REPACK=ON" # Optimize Q4_0 quant handling

# Explicit x86 SIMD feature toggles.
# NOTE(review): confirm the build/runtime CPU actually supports every extension
# enabled here (AVX-512 and AMX in particular); see also the GGML_NATIVE note above.
"-DGGML_AVX=ON"
"-DGGML_AVX2=ON"
"-DGGML_FMA=ON"
"-DGGML_F16C=ON"
"-DGGML_AVX512=ON" # Intel AVX-512 extensions
"-DGGML_AVX512_VNNI=ON" # Vector Neural Network Instructions
"-DGGML_AVX512_BF16=ON" # Bfloat16 support
"-DGGML_AVX_VNNI=ON" # VNNI for processors without AVX-512
"-DGGML_AMX_TILE=ON" # Intel Advanced Matrix Extensions (Sapphire Rapids+)
"-DGGML_AMX_INT8=ON"
"-DGGML_AMX_BF16=ON"

# --- BLAS ---
"-DGGML_BLAS=ON" # Uses internal BLAS provided by Nix (blasSupport=true works)
];
# Disable Nix's NIX_ENFORCE_NO_NATIVE which strips -march=native flags

View File

@@ -87,7 +87,7 @@
environment.etc."llama-swap/config.yaml".text = ''
models:
"Qwen3.5-35B-A3B-GGUF":
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_S --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ4_XS --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32
ttl: 2400
"Qwen3-1.7B-GGUF":
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-1.7B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu