diff --git a/global/default.nix b/global/default.nix index 338fd46..1dfc2a6 100755 --- a/global/default.nix +++ b/global/default.nix @@ -33,13 +33,13 @@ blasSupport = true; }).overrideAttrs (oldAttrs: rec { - version = "8184"; + version = "8209"; src = pkgs.fetchFromGitHub { - owner = "aagit"; + owner = "ggml-org"; repo = "llama.cpp"; - # tag = "b${version}"; - rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970 - hash = "sha256-xryajW0Cs1d+WDijspMTW21FDaZP9Grkb+uErMQCQ48="; + tag = "b${version}"; + # rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970 + hash = "sha256-7z9mQZ/hgNS+doLCVPtax+FBhr6dEfmR9wZJTwtl/pM="; leaveDotGit = true; postFetch = '' git -C "$out" rev-parse --short HEAD > $out/COMMIT @@ -54,7 +54,30 @@ # for reproducible builds). We sacrifice portability for faster CPU layers. cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [ "-DGGML_NATIVE=ON" - "-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090 - needed since sandbox has no GPU + "-DGGML_LTO=ON" # Link Time Optimization for overall binary speed + "-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090 - needed since sandbox has no GPU + "-DGGML_CUDA=ON" + "-DGGML_CUDA_FA=ON" # FlashAttention kernels (accelerated attention) + "-DGGML_CUDA_FA_ALL_QUANTS=ON" # Support for all KV cache quant types in FA + "-DGGML_CUDA_GRAPHS=ON" # CUDA Graphs for lower overhead inference + "-DGGML_CUDA_FORCE_CUBLAS=OFF" # Keep the custom quantized matmul kernels (tuned for RTX 3000/4000); forcing FP16 cuBLAS raises VRAM use and risks numerical overflow + "-DGGML_CUDA_PEER_MAX_BATCH_SIZE=256" # Increased for multi-GPU efficiency (split mode) + "-DGGML_CUDA_COMPRESSION_MODE=speed" # Fast binary loading (CUDA 12.8+) + "-DGGML_OPENMP=ON" # Optimal multi-threading on CPU + "-DGGML_LLAMAFILE=ON" # Use llamafile sgemm for faster CPU layers + "-DGGML_CPU_REPACK=ON" # Optimize Q4_0 quant handling + "-DGGML_AVX=ON" # NOTE(review): GGML_NATIVE=ON is also set; confirm ggml's CMake still consults the explicit ISA toggles from here on, and if it does, that the host CPU really has AVX-512 and AMX + "-DGGML_AVX2=ON" + "-DGGML_FMA=ON" + "-DGGML_F16C=ON" + "-DGGML_AVX512=ON" # Intel AVX-512 extensions + "-DGGML_AVX512_VNNI=ON" # Vector Neural Network 
Instructions + "-DGGML_AVX512_BF16=ON" # Bfloat16 support + "-DGGML_AVX_VNNI=ON" # VNNI for processors without AVX-512 + "-DGGML_AMX_TILE=ON" # Intel Advanced Matrix Extensions (Sapphire Rapids+) + "-DGGML_AMX_INT8=ON" + "-DGGML_AMX_BF16=ON" + "-DGGML_BLAS=ON" # Uses internal BLAS provided by Nix (blasSupport=true works) ]; # Disable Nix's NIX_ENFORCE_NO_NATIVE which strips -march=native flags diff --git a/machines/homepc/configuration.nix b/machines/homepc/configuration.nix index 8be6ba5..10cb166 100755 --- a/machines/homepc/configuration.nix +++ b/machines/homepc/configuration.nix @@ -87,7 +87,7 @@ environment.etc."llama-swap/config.yaml".text = '' models: "Qwen3.5-35B-A3B-GGUF": - cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_S --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32 + cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ4_XS --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32 ttl: 2400 "Qwen3-1.7B-GGUF": cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-1.7B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu