5 Commits

3 changed files with 25 additions and 8 deletions

View File

@@ -5,15 +5,15 @@ logToStdout: "both" # proxy and upstream
hooks: hooks:
on_startup: on_startup:
preload: preload:
- "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M" - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
groups: groups:
qwen-vl-always: always:
persistent: true persistent: true
exclusive: false exclusive: false
swap: false swap: false
members: members:
- "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M" - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
models: models:
"DeepSeek-R1-0528-Qwen3-8B-GGUF": "DeepSeek-R1-0528-Qwen3-8B-GGUF":
@@ -505,6 +505,7 @@ models:
/app/llama-server /app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf --mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
--mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
--ctx-size 16384 --ctx-size 16384
--temp 1.0 --temp 1.0
--min-p 0.00 --min-p 0.00
@@ -519,6 +520,7 @@ models:
/app/llama-server /app/llama-server
-hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M -hf mradermacher/Qwen3.5-35B-A3B-heretic-GGUF:Q4_K_M
--mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf --mmproj-url https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F16.gguf
--mmproj /root/.cache/llama.cpp/unsloth_Qwen3.5-35B-A3B-GGUF_mmproj-F16.gguf
--ctx-size 16384 --ctx-size 16384
--temp 1.0 --temp 1.0
--min-p 0.00 --min-p 0.00
@@ -529,12 +531,13 @@ models:
--chat-template-kwargs "{\"enable_thinking\": false}" --chat-template-kwargs "{\"enable_thinking\": false}"
"Qwen3-VL-2B-Instruct-GGUF:Q4_K_M": "Qwen3-VL-2B-Instruct-GGUF:Q4_K_M":
ttl: 0 ttl: 600
cmd: | cmd: |
/app/llama-server /app/llama-server
-hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M -hf unsloth/Qwen3-VL-2B-Instruct-GGUF:Q4_K_M
--ctx-size 16384 --ctx-size 6144
--predict 4096 --cache-type-k q8_0
--cache-type-v q8_0
--temp 0.7 --temp 0.7
--top-p 0.8 --top-p 0.8
--top-k 20 --top-k 20
@@ -557,3 +560,17 @@ models:
--repeat-penalty 1.0 --repeat-penalty 1.0
--no-warmup --no-warmup
--port ${PORT} --port ${PORT}
"Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL":
ttl: 0
cmd: |
/app/llama-server
-hf unsloth/Qwen3.5-0.8B-GGUF:Q4_K_XL
--ctx-size 16384
--temp 0.6
--top-p 0.95
--top-k 20
--min-p 0.00
--no-warmup
--port ${PORT}
--chat-template-kwargs "{\"enable_thinking\": false}"

View File

@@ -17,7 +17,7 @@ spec:
containers: containers:
- name: llama-swap - name: llama-swap
# TODO: make renovate update the image tag # TODO: make renovate update the image tag
image: ghcr.io/mostlygeek/llama-swap:v195-vulkan-b8148 image: ghcr.io/mostlygeek/llama-swap:v197-vulkan-b8202
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
command: command:
- /app/llama-swap - /app/llama-swap

View File

@@ -15,7 +15,7 @@ spec:
- name: renovate - name: renovate
# Update this to the latest available and then enable Renovate on # Update this to the latest available and then enable Renovate on
# the manifest # the manifest
image: renovate/renovate:43.46.6-full image: renovate/renovate:43.56.1-full
envFrom: envFrom:
- secretRef: - secretRef:
name: renovate-gitea-token name: renovate-gitea-token