From 975f1db8f5fbe6c3227216820ce540ea9a2633fd Mon Sep 17 00:00:00 2001 From: Lumpiasty Date: Thu, 5 Mar 2026 20:04:36 +0100 Subject: [PATCH] shorten context for qwen3-vl-2b and lower kv cache quant --- apps/llama/configs/config.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml index 6e2c596..a588465 100644 --- a/apps/llama/configs/config.yaml +++ b/apps/llama/configs/config.yaml @@ -535,8 +535,9 @@ models: cmd: | /app/llama-server -hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M - --ctx-size 16384 - --predict 4096 + --ctx-size 6144 + --cache-type-k q8_0 + --cache-type-v q8_0 --temp 0.7 --top-p 0.8 --top-k 20