15 Commits

5 changed files with 183 additions and 475 deletions

View File

@@ -2,500 +2,102 @@
healthCheckTimeout: 600
logToStdout: "both" # proxy and upstream
macros:
base_args: "--no-warmup --port ${PORT}"
common_args: "--fit-target 1536 --fit-ctx 32768 --no-warmup --port ${PORT}"
gemma_sampling: "--prio 2 --temp 1.0 --repeat-penalty 1.0 --min-p 0.00 --top-k 64 --top-p 0.95"
qwen35_sampling: "--temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00"
qwen35_35b_args: "--temp 1.0 --min-p 0.00 --top-p 0.95 --top-k 20"
qwen35_35b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf"
qwen35_4b_heretic_mmproj: "--mmproj-url https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/mmproj-F16.gguf --mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-4B-GGUF_mmproj-F16.gguf"
thinking_on: "--chat-template-kwargs '{\"enable_thinking\": true}'"
thinking_off: "--chat-template-kwargs '{\"enable_thinking\": false}'"
peers:
openrouter:
proxy: https://openrouter.ai/api
apiKey: ${env.OPENROUTER_API_KEY}
models:
- z-ai/glm-5
hooks:
on_startup:
preload:
- "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M"
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
groups:
qwen-vl-always:
always:
persistent: true
exclusive: false
swap: false
members:
- "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M"
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
models:
"DeepSeek-R1-0528-Qwen3-8B-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--no-warmup
--port ${PORT}
"Qwen3-8B-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--no-warmup
--port ${PORT}
"Qwen3-8B-GGUF-no-thinking":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--ctx-size 16384
--jinja
--chat-template-file /config/qwen_nothink_chat_template.jinja
--no-warmup
--port ${PORT}
"gemma3n-e4b":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
--ctx-size 16384
--seed 3407
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
"gemma3-12b":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
${gemma_sampling}
${common_args}
"gemma3-12b-novision":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
${gemma_sampling}
--no-mmproj
--no-warmup
--port ${PORT}
"gemma3-12b-q2":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q2_K_L
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
${common_args}
"gemma3-4b":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
--no-warmup
--port ${PORT}
${gemma_sampling}
${common_args}
"gemma3-4b-novision":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384
--prio 2
--temp 1.0
--repeat-penalty 1.0
--min-p 0.00
--top-k 64
--top-p 0.95
${gemma_sampling}
--no-mmproj
--no-warmup
--port ${PORT}
"Qwen3-4B-Thinking-2507":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.6
--min-p 0.00
--top-p 0.95
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-4B-Thinking-2507-long-ctx":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--ctx-size 262144
--predict 81920
--temp 0.6
--min-p 0.00
--top-p 0.95
--top-k 20
--repeat-penalty 1.0
--no-warmup
--flash-attn auto
--cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT}
"Qwen3-4B-Instruct-2507":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-4B-Instruct-2507-long-ctx":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--ctx-size 262144
--predict 81920
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--flash-attn auto
--cache-type-k q8_0
--cache-type-v q8_0
--port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen2.5-VL-7B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M
--ctx-size 16384
--predict 8192
--temp 0.7
--min-p 0.00
--top-p 0.8
--top-k 20
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Instruct-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.6
--no-warmup
--port ${PORT}
"Qwen3-VL-2B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Qwen3-VL-4B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Qwen3-VL-8B-Thinking-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--top-p 0.95
--top-k 20
--temp 1.0
--min-p 0.0
--repeat-penalty 1.0
--presence-penalty 0.0
--no-warmup
--port ${PORT}
"Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF:Q6_K
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
"Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF":
ttl: 600
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF:Q6_K
--ctx-size 12288
--predict 4096
--flash-attn auto
--jinja
--temp 0.7
--top-p 0.85
--top-k 20
--min-p 0.05
--repeat-penalty 1.15
--frequency-penalty 0.5
--presence-penalty 0.4
--no-warmup
--port ${PORT}
${common_args}
"Qwen3-Coder-Next-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3-Coder-Next-GGUF:Q4_K_M
--ctx-size 32768
--ctx-size 65536
--predict 8192
--temp 1.0
--min-p 0.01
--top-p 0.95
--top-k 40
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
${common_args}
"Qwen3.5-35B-A3B-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
--ctx-size 16384
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
${qwen35_35b_args}
${common_args}
"Qwen3.5-35B-A3B-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M
--ctx-size 16384
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
${qwen35_35b_args}
${common_args}
${thinking_off}
# The "heretic" version does not provide the mmproj
# so providing url to the one from the non-heretic version.
@@ -504,56 +106,127 @@ models:
cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
--ctx-size 16384
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
${qwen35_35b_heretic_mmproj}
${qwen35_35b_args}
${common_args}
"Qwen3.5-35B-A3B-heretic-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
--ctx-size 16384
--temp 1.0
--min-p 0.00
--top-p 0.95
--top-k 20
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"
${qwen35_35b_heretic_mmproj}
${qwen35_35b_args}
${common_args}
${thinking_off}
"Qwen3-VL-2B-Instruct-GGUF:Q4_K_M":
"Qwen3.5-0.8B-GGUF:Q4_K_XL":
ttl: 0
cmd: |
/app/llama-server
-hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M
--ctx-size 16384
--predict 4096
--temp 0.7
--top-p 0.8
--top-k 20
--min-p 0.0
--presence-penalty 1.5
--no-warmup
--port ${PORT}
-hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
${qwen35_sampling}
${base_args}
${thinking_on}
"gemma-3-270m-it-qat-GGUF:Q4_K_M":
"Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL":
ttl: 0
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
--ctx-size 4096
${qwen35_sampling}
${base_args}
${thinking_off}
"Qwen3.5-2B-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/gemma-3-270m-it-qat-GGUF:Q4_K_M
--ctx-size 16384
--predict 4096
--temp 1.0
--min-p 0.01
--top-p 0.95
--top-k 64
--repeat-penalty 1.0
--no-warmup
--port ${PORT}
-hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-2B-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-2B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_off}
"Qwen3.5-4B-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-4B-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-4B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_off}
"Qwen3.5-4B-heretic-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
${qwen35_4b_heretic_mmproj}
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-4B-heretic-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf mradermacher/Qwen3.5-4B-heretic-GGUF:Q4_K_M
${qwen35_4b_heretic_mmproj}
${qwen35_sampling}
${common_args}
${thinking_off}
"Qwen3.5-9B-GGUF:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-9B-GGUF-nothink:Q4_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q4_K_M
${qwen35_sampling}
${common_args}
${thinking_off}
"Qwen3.5-9B-GGUF:Q3_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
${qwen35_sampling}
${common_args}
${thinking_on}
"Qwen3.5-9B-GGUF-nothink:Q3_K_M":
ttl: 600
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-9B-GGUF:Q3_K_M
${qwen35_sampling}
${common_args}
${thinking_off}

View File

@@ -6,6 +6,8 @@ metadata:
namespace: llama
spec:
replicas: 1
strategy:
type: Recreate
selector:
matchLabels:
app: llama-swap
@@ -17,7 +19,7 @@ spec:
containers:
- name: llama-swap
# TODO: make renovate update the image tag
image: ghcr.io/mostlygeek/llama-swap:v195-vulkan-b8148
image: ghcr.io/mostlygeek/llama-swap:v197-vulkan-b8202
imagePullPolicy: IfNotPresent
command:
- /app/llama-swap
@@ -28,6 +30,12 @@ spec:
- containerPort: 8080
name: http
protocol: TCP
env:
- name: OPENROUTER_API_KEY
valueFrom:
secretKeyRef:
name: llama-openrouter
key: OPENROUTER_API_KEY
volumeMounts:
- name: models
mountPath: /root/.cache

View File

@@ -36,3 +36,26 @@ spec:
excludeRaw: true
vaultAuthRef: llama
---
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
name: llama-openrouter
namespace: llama
spec:
type: kv-v2
mount: secret
path: openrouter
destination:
create: true
name: llama-openrouter
type: Opaque
transformation:
excludeRaw: true
templates:
OPENROUTER_API_KEY:
text: '{{ get .Secrets "API_KEY" }}'
vaultAuthRef: llama

View File

@@ -18,7 +18,7 @@ spec:
chart:
spec:
chart: open-webui
version: 12.8.1
version: 12.9.0
sourceRef:
kind: HelmRepository
name: open-webui

View File

@@ -1,3 +1,7 @@
path "secret/data/ollama" {
capabilities = ["read"]
}
path "secret/data/openrouter" {
capabilities = ["read"]
}