From 28365425699fed7ed90677ba0d84f2211fb7534b Mon Sep 17 00:00:00 2001
From: Lumpiasty
Date: Sat, 28 Feb 2026 17:48:20 +0100
Subject: [PATCH] Add always loaded Qwen3-VL-2B-Instruct

---
 apps/llama/configs/config.yaml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml
index cdf84e7..6450a1c 100644
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -2,6 +2,19 @@
 healthCheckTimeout: 600
 logToStdout: "both" # proxy and upstream
 
+hooks:
+  on_startup:
+    preload:
+      - "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M"
+
+groups:
+  qwen-vl-always:
+    persistent: true
+    exclusive: false
+    swap: false
+    members:
+      - "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M"
+
 models:
   "DeepSeek-R1-0528-Qwen3-8B-GGUF":
     ttl: 600
@@ -483,3 +496,18 @@
       --no-warmup
       --port ${PORT}
       --chat-template-kwargs "{\"enable_thinking\": false}"
+
+  "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M":
+    ttl: 0
+    cmd: |
+      /app/llama-server
+      -hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M
+      --ctx-size 16384
+      --predict 4096
+      --temp 0.7
+      --top-p 0.8
+      --top-k 20
+      --min-p 0.0
+      --presence-penalty 1.5
+      --no-warmup
+      --port ${PORT}