add tts and stt to llama-swap and openwebui
ci/woodpecker/push/flux-reconcile-source Pipeline was successful
ci/woodpecker/push/flux-reconcile-source Pipeline was successful
This commit is contained in:
@@ -5,6 +5,7 @@ logToStdout: "both" # proxy and upstream
|
|||||||
macros:
|
macros:
|
||||||
base_args: "--no-warmup --port ${PORT}"
|
base_args: "--no-warmup --port ${PORT}"
|
||||||
common_args: "--fit-target 1536 --no-warmup --port ${PORT}"
|
common_args: "--fit-target 1536 --no-warmup --port ${PORT}"
|
||||||
|
cpu_args: "--no-warmup --port ${PORT} -ngl 0"
|
||||||
ctx_64k: "--ctx-size 65536"
|
ctx_64k: "--ctx-size 65536"
|
||||||
ctx_128k: "--ctx-size 131072"
|
ctx_128k: "--ctx-size 131072"
|
||||||
ctx_256k: "--ctx-size 262144"
|
ctx_256k: "--ctx-size 262144"
|
||||||
@@ -18,6 +19,8 @@ hooks:
|
|||||||
on_startup:
|
on_startup:
|
||||||
preload:
|
preload:
|
||||||
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
||||||
|
- "whisper-small"
|
||||||
|
- "outetts-tts"
|
||||||
|
|
||||||
# matrix replaces groups (they are mutually exclusive).
|
# matrix replaces groups (they are mutually exclusive).
|
||||||
# The small 0.8B model runs alongside any LLM.
|
# The small 0.8B model runs alongside any LLM.
|
||||||
@@ -25,6 +28,8 @@ hooks:
|
|||||||
matrix:
|
matrix:
|
||||||
vars:
|
vars:
|
||||||
q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
||||||
|
stt: "whisper-small"
|
||||||
|
tts: "outetts-tts"
|
||||||
flux: "flux2-klein-4b:Q4_K_M"
|
flux: "flux2-klein-4b:Q4_K_M"
|
||||||
coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
|
coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
|
||||||
q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
|
q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
|
||||||
@@ -50,10 +55,10 @@ matrix:
|
|||||||
flux: 10 # large files, slow to reload
|
flux: 10 # large files, slow to reload
|
||||||
|
|
||||||
sets:
|
sets:
|
||||||
# any LLM can run alongside the small always-on model
|
# any LLM can run alongside the small always-on model plus STT and TTS (STT/TTS are CPU-only, so they cost no VRAM)
|
||||||
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8"
|
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts"
|
||||||
# FLUX runs alone — evicts everything including q8
|
# FLUX evicts every LLM, including q8, but keeps STT+TTS loaded so voice still works during image gen
|
||||||
image_gen: "flux"
|
image_gen: "flux & stt & tts"
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"Qwen3-Coder-Next-GGUF:Q4_K_M":
|
"Qwen3-Coder-Next-GGUF:Q4_K_M":
|
||||||
@@ -232,6 +237,35 @@ models:
|
|||||||
--parallel 1
|
--parallel 1
|
||||||
${common_args}
|
${common_args}
|
||||||
|
|
||||||
|
# STT via whisper.cpp (CPU-only, always loaded)
|
||||||
|
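# Model is NOT auto-downloaded by whisper-server: the download-whisper init container fetches ggml-small.bin from HuggingFace into /root/.cache/whisper/ if it is missing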
|
||||||
|
# whisper-small: ~244M params, ~466MB on disk, roughly 850MB RAM when loaded; good accuracy/speed tradeoff on R5 3600
|
||||||
|
"whisper-small":
|
||||||
|
checkEndpoint: none
|
||||||
|
cmd: |
|
||||||
|
whisper-server
|
||||||
|
--port ${PORT}
|
||||||
|
-m /root/.cache/whisper/ggml-small.bin
|
||||||
|
--request-path /v1/audio
|
||||||
|
--inference-path /transcriptions
|
||||||
|
--threads 6
|
||||||
|
--no-gpu
|
||||||
|
|
||||||
|
# TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded)
|
||||||
|
# Models auto-downloaded from HuggingFace on first start
|
||||||
|
# OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM
|
||||||
|
# Exposes /v1/audio/speech compatible with OpenAI TTS API
|
||||||
|
"outetts-tts":
|
||||||
|
checkEndpoint: none
|
||||||
|
cmd: |
|
||||||
|
llama-server
|
||||||
|
-hf OuteAI/OuteTTS-0.3-1B-GGUF
|
||||||
|
-hff OuteTTS-0.3-1B-Q8_0.gguf
|
||||||
|
-hfv ggml-org/WavTokenizer
|
||||||
|
-hffv WavTokenizer-Large-75-F16.gguf
|
||||||
|
-c 4096
|
||||||
|
${cpu_args}
|
||||||
|
|
||||||
# Image generation via stable-diffusion.cpp (sd-server)
|
# Image generation via stable-diffusion.cpp (sd-server)
|
||||||
# Models must be pre-downloaded to /root/.cache/sd/
|
# Models must be pre-downloaded to /root/.cache/sd/
|
||||||
# FLUX.2-klein-4B: fast unified text-to-image and image editing model (Apache 2.0)
|
# FLUX.2-klein-4B: fast unified text-to-image and image editing model (Apache 2.0)
|
||||||
|
|||||||
@@ -16,6 +16,24 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
app: llama-swap
|
app: llama-swap
|
||||||
spec:
|
spec:
|
||||||
|
initContainers:
|
||||||
|
- name: download-whisper
|
||||||
|
image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- |
|
||||||
|
mkdir -p /root/.cache/whisper
|
||||||
|
if [ ! -f /root/.cache/whisper/ggml-small.bin ]; then
|
||||||
|
echo "Downloading whisper-small model..."
|
||||||
|
curl -L -o /root/.cache/whisper/ggml-small.bin \
|
||||||
|
https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
|
||||||
|
else
|
||||||
|
echo "whisper-small model already present, skipping download"
|
||||||
|
fi
|
||||||
|
volumeMounts:
|
||||||
|
- name: models
|
||||||
|
mountPath: /root/.cache
|
||||||
containers:
|
containers:
|
||||||
- name: llama-swap
|
- name: llama-swap
|
||||||
image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
|
image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
|
||||||
|
|||||||
@@ -74,6 +74,28 @@ spec:
|
|||||||
value: "false"
|
value: "false"
|
||||||
- name: OAUTH_MERGE_ACCOUNTS_BY_EMAIL
|
- name: OAUTH_MERGE_ACCOUNTS_BY_EMAIL
|
||||||
value: "true"
|
value: "true"
|
||||||
|
# STT via whisper-server (routed through llama-swap)
|
||||||
|
- name: AUDIO_STT_ENGINE
|
||||||
|
value: "openai"
|
||||||
|
- name: AUDIO_STT_OPENAI_API_BASE_URL
|
||||||
|
value: "http://llama.llama.svc.cluster.local:11434/v1"
|
||||||
|
- name: AUDIO_STT_OPENAI_API_KEY
|
||||||
|
value: "ignored"
|
||||||
|
- name: AUDIO_STT_MODEL
|
||||||
|
value: "whisper-small"
|
||||||
|
- name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
|
||||||
|
value: "audio/wav,audio/wave"
|
||||||
|
# TTS via OuteTTS (routed through llama-swap)
|
||||||
|
- name: AUDIO_TTS_ENGINE
|
||||||
|
value: "openai"
|
||||||
|
- name: AUDIO_TTS_OPENAI_API_BASE_URL
|
||||||
|
value: "http://llama.llama.svc.cluster.local:11434/v1"
|
||||||
|
- name: AUDIO_TTS_OPENAI_API_KEY
|
||||||
|
value: "ignored"
|
||||||
|
- name: AUDIO_TTS_MODEL
|
||||||
|
value: "outetts-tts"
|
||||||
|
- name: AUDIO_TTS_VOICE
|
||||||
|
value: "default"
|
||||||
# Image generation via llama-swap sd-server
|
# Image generation via llama-swap sd-server
|
||||||
- name: ENABLE_IMAGE_GENERATION
|
- name: ENABLE_IMAGE_GENERATION
|
||||||
value: "true"
|
value: "true"
|
||||||
|
|||||||
Reference in New Issue
Block a user