Add Qwen3-VL, fix LibreChat overriding sampling settings, and clean up the llama-swap config

This commit is contained in:
2025-11-15 19:09:13 +01:00
parent e3325670de
commit 0b677d0faf
2 changed files with 234 additions and 150 deletions

View File

@@ -57,7 +57,9 @@ spec:
"Qwen3-4B-Thinking-2507-long-ctx", "Qwen3-4B-Thinking-2507-long-ctx",
"Qwen2.5-VL-7B-Instruct-GGUF", "Qwen2.5-VL-7B-Instruct-GGUF",
"Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S", "Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S",
"Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L" "Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L",
"Qwen3-VL-4B-Instruct-GGUF",
"Qwen3-VL-4B-Instruct-GGUF-unslothish"
] ]
titleConvo: true titleConvo: true
titleModel: "gemma3-4b-novision" titleModel: "gemma3-4b-novision"
@@ -65,6 +67,16 @@ spec:
summaryModel: "gemma3-4b-novision" summaryModel: "gemma3-4b-novision"
forcePrompt: false forcePrompt: false
modelDisplayLabel: "Llama.cpp" modelDisplayLabel: "Llama.cpp"
# ✨ IMPORTANT: let llama-swap/llama-server own all these
dropParams:
- "temperature"
- "top_p"
- "top_k"
- "presence_penalty"
- "frequency_penalty"
- "stop"
- "max_tokens"
imageVolume: imageVolume:
enabled: true enabled: true
size: 10G size: 10G

View File

@@ -6,26 +6,33 @@ models:
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M -hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M
-ngl 37 -c 16384 --n-gpu-layers 37
--ctx-size 16384
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-8B-GGUF": "Qwen3-8B-GGUF":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M -hf unsloth/Qwen3-8B-GGUF:Q4_K_M
-ngl 37 -c 16384 --n-gpu-layers 37
--ctx-size 16384
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-8B-GGUF-no-thinking": "Qwen3-8B-GGUF-no-thinking":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M -hf unsloth/Qwen3-8B-GGUF:Q4_K_M
-ngl 37 -c 16384 --n-gpu-layers 37
--jinja --chat-template-file /config/qwen_nothink_chat_template.jinja --ctx-size 16384
--jinja
--chat-template-file /config/qwen_nothink_chat_template.jinja
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3n-e4b": "gemma3n-e4b":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -42,6 +49,7 @@ models:
--top-p 0.95 --top-p 0.95
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3-12b": "gemma3-12b":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -57,6 +65,7 @@ models:
--top-p 0.95 --top-p 0.95
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3-12b-novision": "gemma3-12b-novision":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -73,6 +82,7 @@ models:
--no-mmproj --no-mmproj
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3-12b-q2": "gemma3-12b-q2":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -88,6 +98,7 @@ models:
--top-p 0.95 --top-p 0.95
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3-4b": "gemma3-4b":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -103,6 +114,7 @@ models:
--top-p 0.95 --top-p 0.95
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"gemma3-4b-novision": "gemma3-4b-novision":
ttl: 600 ttl: 600
cmd: | cmd: |
@@ -119,12 +131,15 @@ models:
--no-mmproj --no-mmproj
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-4B-Thinking-2507": "Qwen3-4B-Thinking-2507":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
-ngl 99 -c 16384 --predict 8192 --n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.6 --temp 0.6
--min-p 0.00 --min-p 0.00
--top-p 0.95 --top-p 0.95
@@ -132,27 +147,34 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-4B-Thinking-2507-long-ctx": "Qwen3-4B-Thinking-2507-long-ctx":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
-ngl 99 -c 262144 --predict 81920 --n-gpu-layers 99
--ctx-size 262144
--predict 81920
--temp 0.6 --temp 0.6
--min-p 0.00 --min-p 0.00
--top-p 0.95 --top-p 0.95
--top-k 20 --top-k 20
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--flash-attn --flash-attn auto
--cache-type-k q8_0 --cache-type-v q8_0 --cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT} --port ${PORT}
"Qwen3-4B-Instruct-2507": "Qwen3-4B-Instruct-2507":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
-ngl 99 -c 16384 --predict 8192 --n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7 --temp 0.7
--min-p 0.00 --min-p 0.00
--top-p 0.8 --top-p 0.8
@@ -160,27 +182,34 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-4B-Instruct-2507-long-ctx": "Qwen3-4B-Instruct-2507-long-ctx":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
-ngl 99 -c 262144 --predict 81920 --n-gpu-layers 99
--ctx-size 262144
--predict 81920
--temp 0.7 --temp 0.7
--min-p 0.00 --min-p 0.00
--top-p 0.8 --top-p 0.8
--top-k 20 --top-k 20
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--flash-attn --flash-attn auto
--cache-type-k q8_0 --cache-type-v q8_0 --cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT} --port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S": "Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S -hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S
-ngl 99 -c 16384 --predict 8192 --n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7 --temp 0.7
--min-p 0.00 --min-p 0.00
--top-p 0.8 --top-p 0.8
@@ -188,12 +217,15 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L": "Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L -hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L
-ngl 99 -c 16384 --predict 8192 --n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7 --temp 0.7
--min-p 0.00 --min-p 0.00
--top-p 0.8 --top-p 0.8
@@ -201,12 +233,15 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen2.5-VL-7B-Instruct-GGUF": "Qwen2.5-VL-7B-Instruct-GGUF":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M
-ngl 37 -c 16384 --predict 8192 --n-gpu-layers 37
--ctx-size 16384
--predict 8192
--temp 0.7 --temp 0.7
--min-p 0.00 --min-p 0.00
--top-p 0.8 --top-p 0.8
@@ -214,3 +249,40 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.05
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.8
--top-k 20
--temp 0.7
--min-p 0.0
--presence-penalty 0.7
--no-warmup
--port ${PORT}