4 Commits

Author SHA1 Message Date
2564b61182 Update renovate/renovate Docker tag to v43.59.4 2026-03-09 00:00:28 +00:00
2df8303905 add qwen3.5 4b heretic 2026-03-08 21:39:53 +01:00
65c11ab4ca add glm-5 from openrouter to llama-swap 2026-03-08 17:58:01 +01:00
55da75f06e clean up llama-swap config 2026-03-08 17:25:44 +01:00
5 changed files with 124 additions and 562 deletions

View File

@@ -2,6 +2,24 @@
healthCheckTimeout: 600 healthCheckTimeout: 600
logToStdout: "both" # proxy and upstream logToStdout: "both" # proxy and upstream
macros:
base_args: "--no-warmup --port ${PORT}"
common_args: "--fit-target 1536 --fit-ctx 32768 --no-warmup --port ${PORT}"
gemma_sampling: "--prio 2 --temp 1.0 --repeat-penalty 1.0 --min-p 0.00 --top-k 64 --top-p 0.95"
qwen35_sampling: "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
qwen35_35b_args: "--temp 1.0 --min-p 0.00 --top-p 0.95 --top-k 20"
qwen35_35b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf"
qwen35_4b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-4B-GGUF_mmproj-F16.gguf"
thinking_on: "--chat-template-kwargs '{\"enable_thinking\": true}'"
thinking_off: "--chat-template-kwargs '{\"enable_thinking\": false}'"
peers:
openrouter:
proxy: https://openrouter.ai/api
apiKey: ${env.OPENROUTER_API_KEY}
models:
- z-ai/glm-5
hooks: hooks:
on_startup: on_startup:
preload: preload:
@@ -16,444 +34,39 @@ groups:
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL" - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
models: models:
"DeepSeek-R1-0528-Qwen3-8B-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--no-warmup
--port ${PORT}
"Qwen3-8B-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--no-warmup
--port ${PORT}
"Qwen3-8B-GGUF-no-thinking":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--jinja
--chat-template-file /config/qwen_nothink_chat_template.jinja
--no-warmup
--port ${PORT}
"gemma3n-e4b":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
--ctx-size 16384
--seed 3407
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
"gemma3-12b": "gemma3-12b":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M -hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384 ${gemma_sampling}
--prio 2 ${common_args}
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
"gemma3-12b-novision": "gemma3-12b-novision":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M -hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384 ${gemma_sampling}
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-mmproj --no-mmproj
--no-warmup ${common_args}
--port ${PORT}
"gemma3-12b-q2":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q2_K_L
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
"gemma3-4b": "gemma3-4b":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M -hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384 ${gemma_sampling}
--prio 2 ${common_args}
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
"gemma3-4b-novision": "gemma3-4b-novision":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M -hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384 ${gemma_sampling}
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-mmproj --no-mmproj
--no-warmup ${common_args}
--port ${PORT}
"Qwen3-4B-Thinking-2507":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.6
--min-p 0.00
--top-p 0.95
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-4B-Thinking-2507-long-ctx":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--ctx-size 262144
--predict 81920
--temp 0.6
--min-p 0.00
--top-p 0.95
--top-k 20
--repeat-penalty 1.0
--no-warmup
--flash-attn auto
--cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT}
"Qwen3-4B-Instruct-2507":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-4B-Instruct-2507-long-ctx":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--ctx-size 262144
--predict 81920
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--flash-attn auto
--cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen2.5-VL-7B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF:Q6_K
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF:Q6_K
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-Coder-Next-GGUF:Q4_K_M": "Qwen3-Coder-Next-GGUF:Q4_K_M":
ttl: 600 ttl: 600
@@ -461,44 +74,30 @@ models:
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M -hf unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M
--ctx-size 65536 --ctx-size 65536
--fit-target 1536
--predict 8192 --predict 8192
--temp 1.0 --temp 1.0
--min-p 0.01 --min-p 0.01
--top-p 0.95 --top-p 0.95
--top-k 40 --top-k 40
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup ${common_args}
--port ${PORT}
"Qwen3.5-35B-A3B-GGUF:Q4_K_M": "Qwen3.5-35B-A3B-GGUF:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
--fit-target 2048 ${qwen35_35b_args}
--ctx-size 16384 ${common_args}
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
"Qwen3.5-35B-A3B-GGUF-nothink:Q4_K_M": "Qwen3.5-35B-A3B-GGUF-nothink:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
--fit-target 2048 ${qwen35_35b_args}
--ctx-size 16384 ${common_args}
--temp 1.0 ${thinking_off}
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
# The "heretic" version does not provide the mmproj # The "heretic" version does not provide the mmproj
# so providing url to the one from the non-heretic version. # so providing url to the one from the non-heretic version.
@@ -507,197 +106,127 @@ models:
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf ${qwen35_35b_heretic_mmproj}
--mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf ${qwen35_35b_args}
--fit-target 2048 ${common_args}
--ctx-size 16384
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
"Qwen3.5-35B-A3B-heretic-GGUF-nothink:Q4_K_M": "Qwen3.5-35B-A3B-heretic-GGUF-nothink:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf ${qwen35_35b_heretic_mmproj}
--mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf ${qwen35_35b_args}
--fit-target 2048 ${common_args}
--ctx-size 16384 ${thinking_off}
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
"Qwen3-VL-2B-Instruct-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M
--ctx-size 6144
--cache-type-k q8_0
--cache-type-v q8_0
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.0
--presence-penalty 1.5
--no-warmup
--port ${PORT}
"gemma-3-270m-it-qat-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-270m-it-qat-GGUF:Q4_K_M
--ctx-size 16384
--predict 4096
--temp 1.0
--min-p 0.01
--top-p 0.95
--top-k 64
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3.5-0.8B-GGUF:Q4_K_XL": "Qwen3.5-0.8B-GGUF:Q4_K_XL":
ttl: 0 ttl: 0
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
--ctx-size 16384 ${qwen35_sampling}
--temp 0.6 ${base_args}
--top-p 0.95 ${thinking_on}
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": true}"
"Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL": "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL":
ttl: 0 ttl: 0
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL -hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
--ctx-size 16384 --ctx-size 4096
--temp 0.6 ${qwen35_sampling}
--top-p 0.95 ${base_args}
--top-k 20 ${thinking_off}
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
"Qwen3.5-2B-GGUF:Q4_K_M": "Qwen3.5-2B-GGUF:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
--temp 0.6 ${qwen35_sampling}
--top-p 0.95 ${common_args}
--top-k 20 ${thinking_on}
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": true}"
"Qwen3.5-2B-GGUF-nothink:Q4_K_M": "Qwen3.5-2B-GGUF-nothink:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
--temp 0.6 ${qwen35_sampling}
--top-p 0.95 ${common_args}
--top-k 20 ${thinking_off}
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
"Qwen3.5-4B-GGUF:Q4_K_M": "Qwen3.5-4B-GGUF:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
--temp 0.6 ${qwen35_sampling}
--top-p 0.95 ${common_args}
--top-k 20 ${thinking_on}
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": true}"
"Qwen3.5-4B-GGUF-nothink:Q4_K_M": "Qwen3.5-4B-GGUF-nothink:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
--temp 0.6 ${qwen35_sampling}
--top-p 0.95 ${common_args}
--top-k 20 ${thinking_off}
--min-p 0.00
--no-warmup "Qwen3.5-4B-heretic-GGUF:Q4_K_M":
--port ${PORT} ttl: 600
--chat-template-kwargs "{\"enable_thinking\": false}" cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
${qwen35_4b_heretic_mmproj}
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-4B-heretic-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
${qwen35_4b_heretic_mmproj}
${qwen35_sampling}
${common_args}
${thinking_off}
"Qwen3.5-9B-GGUF:Q4_K_M": "Qwen3.5-9B-GGUF:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
--ctx-size 16384 ${qwen35_sampling}
--temp 0.6 ${common_args}
--top-p 0.95 ${thinking_on}
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": true}"
"Qwen3.5-9B-GGUF-nothink:Q4_K_M": "Qwen3.5-9B-GGUF-nothink:Q4_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M -hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
--ctx-size 16384 ${qwen35_sampling}
--temp 0.6 ${common_args}
--top-p 0.95 ${thinking_off}
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
"Qwen3.5-9B-GGUF:Q3_K_M": "Qwen3.5-9B-GGUF:Q3_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M -hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
--ctx-size 16384 ${qwen35_sampling}
--temp 0.6 ${common_args}
--top-p 0.95 ${thinking_on}
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": true}"
"Qwen3.5-9B-GGUF-nothink:Q3_K_M": "Qwen3.5-9B-GGUF-nothink:Q3_K_M":
ttl: 600 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M -hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
--ctx-size 16384 ${qwen35_sampling}
--temp 0.6 ${common_args}
--top-p 0.95 ${thinking_off}
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"

View File

@@ -30,6 +30,12 @@ spec:
- containerPort: 8080 - containerPort: 8080
name: http name: http
protocol: TCP protocol: TCP
env:
- name: OPENROUTER_API_KEY
valueFrom:
secretKeyRef:
name: llama-openrouter
key: OPENROUTER_API_KEY
volumeMounts: volumeMounts:
- name: models - name: models
mountPath: /root/.cache mountPath: /root/.cache

View File

@@ -36,3 +36,26 @@ spec:
excludeRaw: true excludeRaw: true
vaultAuthRef: llama vaultAuthRef: llama
---
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
name: llama-openrouter
namespace: llama
spec:
type: kv-v2
mount: secret
path: openrouter
destination:
create: true
name: llama-openrouter
type: Opaque
transformation:
excludeRaw: true
templates:
OPENROUTER_API_KEY:
text: '{{ get .Secrets "API_KEY" }}'
vaultAuthRef: llama

View File

@@ -15,7 +15,7 @@ spec:
- name: renovate - name: renovate
# Update this to the latest available and then enable Renovate on # Update this to the latest available and then enable Renovate on
# the manifest # the manifest
image: renovate/renovate:43.59.3-full image: renovate/renovate:43.59.4-full
envFrom: envFrom:
- secretRef: - secretRef:
name: renovate-gitea-token name: renovate-gitea-token

View File

@@ -1,3 +1,7 @@
path "secret/data/ollama" { path "secret/data/ollama" {
capabilities = ["read"] capabilities = ["read"]
} }
path "secret/data/openrouter" {
capabilities = ["read"]
}