From 669beccc3500095e5b374774ee43492359eb414d Mon Sep 17 00:00:00 2001
From: Lumpiasty <arek.dzski@gmail.com>
Date: Sat, 15 Nov 2025 19:49:21 +0100
Subject: [PATCH] Fix looping issue in Qwen3-VL-4B-Instruct-GGUF models

---
 apps/llama/configs/config.yaml | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml
index 074c6d3..fd0c7b9 100644
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -254,18 +254,19 @@ models:
     ttl: 600
     cmd: |
       /app/llama-server
-        -hf unsloth/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
+        -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
         --n-gpu-layers 99
         --ctx-size 12288
         --predict 4096
         --flash-attn auto
         --jinja
-        --top-p 0.95
+        --temp 0.7
+        --top-p 0.85
         --top-k 20
-        --temp 1.0
         --min-p 0.05
-        --repeat-penalty 1.0
-        --presence-penalty 0.0
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
         --no-warmup
         --port ${PORT}
 
@@ -273,17 +274,19 @@ models:
     ttl: 600
     cmd: |
       /app/llama-server
-        -hf unsloth/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
+        -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
         --n-gpu-layers 99
         --ctx-size 12288
         --predict 4096
         --flash-attn auto
         --jinja
+        --temp 0.7
         --top-p 0.8
         --top-k 20
-        --temp 0.7
-        --min-p 0.0
-        --presence-penalty 0.7
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.6
         --no-warmup
         --port ${PORT}
 
@@ -291,7 +294,7 @@ models:
     ttl: 600
     cmd: |
       /app/llama-server
-        -hf unsloth/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
+        -hf Qwen/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
         --n-gpu-layers 99
         --ctx-size 12288
         --predict 4096