Update renovate/renovate Docker tag to v43.59.3

2026-03-08 00:00:29 +00:00
5 changed files with 562 additions and 124 deletions
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -2,24 +2,6 @@
 healthCheckTimeout: 600
 logToStdout: "both" # proxy and upstream

-macros:
-  base_args: "--no-warmup --port ${PORT}"
-  common_args: "--fit-target 1536 --fit-ctx 32768 --no-warmup --port ${PORT}"
-  gemma_sampling: "--prio 2 --temp 1.0 --repeat-penalty 1.0 --min-p 0.00 --top-k 64 --top-p 0.95"
-  qwen35_sampling: "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
-  qwen35_35b_args: "--temp 1.0 --min-p 0.00 --top-p 0.95 --top-k 20"
-  qwen35_35b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf"
-  qwen35_4b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-4B-GGUF_mmproj-F16.gguf"
-  thinking_on: "--chat-template-kwargs '{\"enable_thinking\": true}'"
-  thinking_off: "--chat-template-kwargs '{\"enable_thinking\": false}'"
-
-peers:
-  openrouter:
-    proxy: https://openrouter.ai/api
-    apiKey: ${env.OPENROUTER_API_KEY}
-    models:
-      - z-ai/glm-5
-
 hooks:
  on_startup:
    preload:
@@ -34,39 +16,444 @@ groups:
      - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"

 models:
+  "DeepSeek-R1-0528-Qwen3-8B-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M
+        --ctx-size 16384
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-8B-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-8B-GGUF:Q4_K_M
+        --ctx-size 16384
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-8B-GGUF-no-thinking":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-8B-GGUF:Q4_K_M
+        --ctx-size 16384
+        --jinja
+        --chat-template-file /config/qwen_nothink_chat_template.jinja
+        --no-warmup
+        --port ${PORT}
+
+  "gemma3n-e4b":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
+        --ctx-size 16384
+        --seed 3407
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
+        --no-warmup
+        --port ${PORT}
+
  "gemma3-12b":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
-        ${gemma_sampling}
-        ${common_args}
+        --ctx-size 16384
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
+        --no-warmup
+        --port ${PORT}

  "gemma3-12b-novision":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
-        ${gemma_sampling}
+        --ctx-size 16384
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
        --no-mmproj
-        ${common_args}
+        --no-warmup
+        --port ${PORT}
+
+  "gemma3-12b-q2":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/gemma-3-12b-it-GGUF:Q2_K_L
+        --ctx-size 16384
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
+        --no-warmup
+        --port ${PORT}

  "gemma3-4b":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
-        ${gemma_sampling}
-        ${common_args}
+        --ctx-size 16384
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
+        --no-warmup
+        --port ${PORT}

  "gemma3-4b-novision":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
-        ${gemma_sampling}
+        --ctx-size 16384
+        --prio 2
+        --temp 1.0
+        --repeat-penalty 1.0
+        --min-p 0.00
+        --top-k 64
+        --top-p 0.95
        --no-mmproj
-        ${common_args}
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-4B-Thinking-2507":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
+        --ctx-size 16384
+        --predict 8192
+        --temp 0.6
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-4B-Thinking-2507-long-ctx":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
+        --ctx-size 262144
+        --predict 81920
+        --temp 0.6
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --flash-attn auto
+        --cache-type-k q8_0
+        --cache-type-v q8_0
+        --port ${PORT}
+
+  "Qwen3-4B-Instruct-2507":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
+        --ctx-size 16384
+        --predict 8192
+        --temp 0.7
+        --min-p 0.00
+        --top-p 0.8
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-4B-Instruct-2507-long-ctx":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
+        --ctx-size 262144
+        --predict 81920
+        --temp 0.7
+        --min-p 0.00
+        --top-p 0.8
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --flash-attn auto
+        --cache-type-k q8_0
+        --cache-type-v q8_0
+        --port ${PORT}
+
+  "Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S
+        --ctx-size 16384
+        --predict 8192
+        --temp 0.7
+        --min-p 0.00
+        --top-p 0.8
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L
+        --ctx-size 16384
+        --predict 8192
+        --temp 0.7
+        --min-p 0.00
+        --top-p 0.8
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen2.5-VL-7B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M
+        --ctx-size 16384
+        --predict 8192
+        --temp 0.7
+        --min-p 0.00
+        --top-p 0.8
+        --top-k 20
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-2B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.85
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-4B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.85
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-8B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.85
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-2B-Instruct-GGUF-unslothish":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.8
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.6
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-4B-Instruct-GGUF-unslothish":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.8
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.6
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-8B-Instruct-GGUF-unslothish":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.8
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.6
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-2B-Thinking-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --top-p 0.95
+        --top-k 20
+        --temp 1.0
+        --min-p 0.0
+        --repeat-penalty 1.0
+        --presence-penalty 0.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-4B-Thinking-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --top-p 0.95
+        --top-k 20
+        --temp 1.0
+        --min-p 0.0
+        --repeat-penalty 1.0
+        --presence-penalty 0.0
+        --no-warmup
+        --port ${PORT}
+
+  "Qwen3-VL-8B-Thinking-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --top-p 0.95
+        --top-k 20
+        --temp 1.0
+        --min-p 0.0
+        --repeat-penalty 1.0
+        --presence-penalty 0.0
+        --no-warmup
+        --port ${PORT}
+
+  "Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf noctrex/Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF:Q6_K
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.85
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
+        --no-warmup
+        --port ${PORT}
+
+  "Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf noctrex/Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF:Q6_K
+        --ctx-size 12288
+        --predict 4096
+        --flash-attn auto
+        --jinja
+        --temp 0.7
+        --top-p 0.85
+        --top-k 20
+        --min-p 0.05
+        --repeat-penalty 1.15
+        --frequency-penalty 0.5
+        --presence-penalty 0.4
+        --no-warmup
+        --port ${PORT}

  "Qwen3-Coder-Next-GGUF:Q4_K_M":
    ttl: 600
@@ -74,30 +461,44 @@ models:
      /app/llama-server
        -hf unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M
        --ctx-size 65536
+        --fit-target 1536
        --predict 8192
        --temp 1.0
        --min-p 0.01
        --top-p 0.95
        --top-k 40
        --repeat-penalty 1.0
-        ${common_args}
+        --no-warmup
+        --port ${PORT}

  "Qwen3.5-35B-A3B-GGUF:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
-        ${qwen35_35b_args}
-        ${common_args}
+        --fit-target 2048
+        --ctx-size 16384
+        --temp 1.0
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --no-warmup
+        --port ${PORT}

  "Qwen3.5-35B-A3B-GGUF-nothink:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
-        ${qwen35_35b_args}
-        ${common_args}
-        ${thinking_off}
+        --fit-target 2048
+        --ctx-size 16384
+        --temp 1.0
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"

  # The "heretic" version does not provide the mmproj
  # so providing url to the one from the non-heretic version.
@@ -106,127 +507,197 @@ models:
    cmd: |
      /app/llama-server
        -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
-        ${qwen35_35b_heretic_mmproj}
-        ${qwen35_35b_args}
-        ${common_args}
+        --mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
+        --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
+        --fit-target 2048
+        --ctx-size 16384
+        --temp 1.0
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --no-warmup
+        --port ${PORT}

  "Qwen3.5-35B-A3B-heretic-GGUF-nothink:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
-        ${qwen35_35b_heretic_mmproj}
-        ${qwen35_35b_args}
-        ${common_args}
-        ${thinking_off}
+        --mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
+        --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
+        --fit-target 2048
+        --ctx-size 16384
+        --temp 1.0
+        --min-p 0.00
+        --top-p 0.95
+        --top-k 20
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"
+
+  "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M
+        --ctx-size 6144
+        --cache-type-k q8_0
+        --cache-type-v q8_0
+        --temp 0.7
+        --top-p 0.8
+        --top-k 20
+        --min-p 0.0
+        --presence-penalty 1.5
+        --no-warmup
+        --port ${PORT}
+
+  "gemma-3-270m-it-qat-GGUF:Q4_K_M":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+        -hf unsloth/gemma-3-270m-it-qat-GGUF:Q4_K_M
+        --ctx-size 16384
+        --predict 4096
+        --temp 1.0
+        --min-p 0.01
+        --top-p 0.95
+        --top-k 64
+        --repeat-penalty 1.0
+        --no-warmup
+        --port ${PORT}

  "Qwen3.5-0.8B-GGUF:Q4_K_XL":
    ttl: 0
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
-        ${qwen35_sampling}
-        ${base_args}
-        ${thinking_on}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": true}"

  "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL":
    ttl: 0
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
-        --ctx-size 4096
-        ${qwen35_sampling}
-        ${base_args}
-        ${thinking_off}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"

  "Qwen3.5-2B-GGUF:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_on}
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": true}"

  "Qwen3.5-2B-GGUF-nothink:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_off}
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"

  "Qwen3.5-4B-GGUF:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_on}
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": true}"

  "Qwen3.5-4B-GGUF-nothink:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_off}
-
-  "Qwen3.5-4B-heretic-GGUF:Q4_K_M":
-    ttl: 600
-    cmd: |
-      /app/llama-server
-        -hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
-        ${qwen35_4b_heretic_mmproj}
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_on}
-
-  "Qwen3.5-4B-heretic-GGUF-nothink:Q4_K_M":
-    ttl: 600
-    cmd: |
-      /app/llama-server
-        -hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
-        ${qwen35_4b_heretic_mmproj}
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_off}
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"

  "Qwen3.5-9B-GGUF:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_on}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": true}"

  "Qwen3.5-9B-GGUF-nothink:Q4_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_off}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"

  "Qwen3.5-9B-GGUF:Q3_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_on}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": true}"

  "Qwen3.5-9B-GGUF-nothink:Q3_K_M":
    ttl: 600
    cmd: |
      /app/llama-server
        -hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
-        ${qwen35_sampling}
-        ${common_args}
-        ${thinking_off}
+        --ctx-size 16384
+        --temp 0.6
+        --top-p 0.95
+        --top-k 20
+        --min-p 0.00
+        --no-warmup
+        --port ${PORT}
+        --chat-template-kwargs "{\"enable_thinking\": false}"
--- a/apps/llama/deployment.yaml
+++ b/apps/llama/deployment.yaml
@@ -30,12 +30,6 @@ spec:
            - containerPort: 8080
              name: http
              protocol: TCP
-          env:
-            - name: OPENROUTER_API_KEY
-              valueFrom:
-                secretKeyRef:
-                  name: llama-openrouter
-                  key: OPENROUTER_API_KEY
          volumeMounts:
            - name: models
              mountPath: /root/.cache
--- a/apps/llama/secret.yaml
+++ b/apps/llama/secret.yaml
@@ -36,26 +36,3 @@ spec:
      excludeRaw: true

  vaultAuthRef: llama
---
-apiVersion: secrets.hashicorp.com/v1beta1
-kind: VaultStaticSecret
-metadata:
-  name: llama-openrouter
-  namespace: llama
-spec:
-  type: kv-v2
-
-  mount: secret
-  path: openrouter
-
-  destination:
-    create: true
-    name: llama-openrouter
-    type: Opaque
-    transformation:
-      excludeRaw: true
-      templates:
-        OPENROUTER_API_KEY:
-          text: '{{ get .Secrets "API_KEY" }}'
-
-  vaultAuthRef: llama
--- a/apps/renovate/cronjob.yaml
+++ b/apps/renovate/cronjob.yaml
@@ -15,7 +15,7 @@ spec:
            - name: renovate
              # Update this to the latest available and then enable Renovate on
              # the manifest
-              image: renovate/renovate:43.59.4-full
+              image: renovate/renovate:43.59.3-full
              envFrom:
                - secretRef:
                    name: renovate-gitea-token
--- a/vault/policy/ollama.hcl
+++ b/vault/policy/ollama.hcl
@@ -1,7 +1,3 @@
 path "secret/data/ollama" {
    capabilities = ["read"]
 }
-
-path "secret/data/openrouter" {
-    capabilities = ["read"]
-}