merge conflicts galore
This commit is contained in:
@@ -33,13 +33,13 @@
|
||||
blasSupport = true;
|
||||
}).overrideAttrs
|
||||
(oldAttrs: rec {
|
||||
version = "8184";
|
||||
version = "8209";
|
||||
src = pkgs.fetchFromGitHub {
|
||||
owner = "aagit";
|
||||
owner = "ggml-org";
|
||||
repo = "llama.cpp";
|
||||
# tag = "b${version}";
|
||||
rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970
|
||||
hash = "sha256-xryajW0Cs1d+WDijspMTW21FDaZP9Grkb+uErMQCQ48=";
|
||||
tag = "b${version}";
|
||||
# rev = "6ebf2e0d00d31acfc1a1fa9662e9a7d38bd07bf7"; # https://github.com/ggml-org/llama.cpp/pull/19970
|
||||
hash = "sha256-7z9mQZ/hgNS+doLCVPtax+FBhr6dEfmR9wZJTwtl/pM=";
|
||||
leaveDotGit = true;
|
||||
postFetch = ''
|
||||
git -C "$out" rev-parse --short HEAD > $out/COMMIT
|
||||
@@ -54,7 +54,30 @@
|
||||
# for reproducible builds). We sacrifice portability for faster CPU layers.
|
||||
cmakeFlags = (oldAttrs.cmakeFlags or [ ]) ++ [
|
||||
"-DGGML_NATIVE=ON"
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090 - needed since sandbox has no GPU
|
||||
"-DGGML_LTO=ON" # Link Time Optimization for overall binary speed
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=86" # RTX 3090
|
||||
"-DGGML_CUDA=ON"
|
||||
"-DGGML_CUDA_FA=ON" # FlashAttention kernels (accelerated attention)
|
||||
"-DGGML_CUDA_FA_ALL_QUANTS=ON" # Support for all KV cache quant types in FA
|
||||
"-DGGML_CUDA_GRAPHS=ON" # CUDA Graphs for lower overhead inference
|
||||
"-DGGML_CUDA_FORCE_CUBLAS=ON" # cuBLAS optimized prompt processing for Ampere+
|
||||
"-DGGML_CUDA_PEER_MAX_BATCH_SIZE=256" # Increased for multi-GPU efficiency (split mode)
|
||||
"-DGGML_CUDA_COMPRESSION_MODE=speed" # Fast binary loading (CUDA 12.8+)
|
||||
"-DGGML_OPENMP=ON" # Optimal multi-threading on CPU
|
||||
"-DGGML_LLAMAFILE=ON" # Use llamafile sgemm for faster CPU layers
|
||||
"-DGGML_CPU_REPACK=ON" # Optimize Q4_0 quant handling
|
||||
"-DGGML_AVX=ON"
|
||||
"-DGGML_AVX2=ON"
|
||||
"-DGGML_FMA=ON"
|
||||
"-DGGML_F16C=ON"
|
||||
"-DGGML_AVX512=ON" # Intel AVX-512 extensions
|
||||
"-DGGML_AVX512_VNNI=ON" # Vector Neural Network Instructions
|
||||
"-DGGML_AVX512_BF16=ON" # Bfloat16 support
|
||||
"-DGGML_AVX_VNNI=ON" # VNNI for processors without AVX-512
|
||||
"-DGGML_AMX_TILE=ON" # Intel Advanced Matrix Extensions (Sapphire Rapids+)
|
||||
"-DGGML_AMX_INT8=ON"
|
||||
"-DGGML_AMX_BF16=ON"
|
||||
"-DGGML_BLAS=ON" # Uses internal BLAS provided by Nix (blasSupport=true works)
|
||||
];
|
||||
|
||||
# Disable Nix's NIX_ENFORCE_NO_NATIVE which strips -march=native flags
|
||||
|
||||
@@ -87,7 +87,7 @@
|
||||
environment.etc."llama-swap/config.yaml".text = ''
|
||||
models:
|
||||
"Qwen3.5-35B-A3B-GGUF":
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_S --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ4_XS --ctx-size 128000 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -np 1 --cpu-moe --fit-target 256 --ubatch-size 1024 -fa on --slots --slot-save-path /home/laythe/llamapcache --jinja -kvu --no-mmproj --swa-checkpoints 32
|
||||
ttl: 2400
|
||||
"Qwen3-1.7B-GGUF":
|
||||
cmd: llama-server --port ''${PORT} -hf unsloth/Qwen3-1.7B-GGUF:Q4_K_S --ctx-size 32768 --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 -fa on --jinja -kvu
|
||||
|
||||
Reference in New Issue
Block a user