From 611f9f388696b22a89969ab05ce71fb05df24a97 Mon Sep 17 00:00:00 2001
From: Lumpiasty
Date: Thu, 21 May 2026 20:09:15 +0200
Subject: [PATCH] add tts and stt to llama-swap and openwebui

---
 apps/llama/configs/config.yaml | 42 ++++++++++++++++++++++++++++++----
 apps/llama/deployment.yaml     | 24 +++++++++++++++++++
 apps/openwebui/release.yaml    | 22 ++++++++++++++++++
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml
index 7d13bdb..2f222f9 100644
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -5,6 +5,7 @@ logToStdout: "both" # proxy and upstream
 macros:
   base_args: "--no-warmup --port ${PORT}"
   common_args: "--fit-target 1536 --no-warmup --port ${PORT}"
+  cpu_args: "--no-warmup --port ${PORT} -ngl 0"
   ctx_64k: "--ctx-size 65536"
   ctx_128k: "--ctx-size 131072"
   ctx_256k: "--ctx-size 262144"
@@ -18,6 +19,8 @@ hooks:
   on_startup:
     preload:
       - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
+      - "whisper-small"
+      - "outetts-tts"
 
 # matrix replaces groups (they are mutually exclusive).
 # The small 0.8B model runs alongside any LLM.
@@ -25,6 +28,8 @@ hooks:
 matrix:
   vars:
     q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
+    stt: "whisper-small"
+    tts: "outetts-tts"
     flux: "flux2-klein-4b:Q4_K_M"
     coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
     q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
@@ -50,10 +55,10 @@ matrix:
     flux: 10 # large files, slow to reload
 
   sets:
-    # any LLM can run alongside the small always-on model
-    with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8"
-    # FLUX runs alone — evicts everything including q8
-    image_gen: "flux"
+    # any LLM can run alongside the small always-on model + STT + TTS (all CPU, no VRAM cost)
+    with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts"
+    # FLUX runs alone — evicts everything including q8, but keeps STT+TTS for voice during image gen
+    image_gen: "flux & stt & tts"
 
 models:
   "Qwen3-Coder-Next-GGUF:Q4_K_M":
@@ -232,6 +237,35 @@ models:
       --parallel 1
       ${common_args}
 
+  # STT via whisper.cpp (CPU-only, always loaded)
+  # Model is fetched by the download-whisper initContainer (see apps/llama/deployment.yaml)
+  # whisper-small: ~240MB RAM, good accuracy/speed tradeoff on R5 3600
+  "whisper-small":
+    checkEndpoint: none
+    cmd: |
+      whisper-server
+      --port ${PORT}
+      -m /root/.cache/whisper/ggml-small.bin
+      --request-path /v1/audio
+      --inference-path /transcriptions
+      --threads 6
+      --no-gpu
+
+  # TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded)
+  # Models auto-downloaded from HuggingFace on first start
+  # OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM
+  # Exposes /v1/audio/speech compatible with OpenAI TTS API
+  "outetts-tts":
+    checkEndpoint: none
+    cmd: |
+      llama-server
+      -hf OuteAI/OuteTTS-0.3-1B-GGUF
+      -hff OuteTTS-0.3-1B-Q8_0.gguf
+      -hfv ggml-org/WavTokenizer
+      -hffv WavTokenizer-Large-75-F16.gguf
+      -c 4096
+      ${cpu_args}
+
   # Image generation via stable-diffusion.cpp (sd-server)
   # Models must be pre-downloaded to /root/.cache/sd/
   # FLUX.2-klein-4B: fast unified text-to-image and image editing model (Apache 2.0)
diff --git a/apps/llama/deployment.yaml b/apps/llama/deployment.yaml
index 656d268..6936918 100644
--- a/apps/llama/deployment.yaml
+++ b/apps/llama/deployment.yaml
@@ -16,6 +16,30 @@ spec:
       labels:
         app: llama-swap
     spec:
+      initContainers:
+        # Fetches the whisper model into the shared cache volume before llama-swap starts.
+        - name: download-whisper
+          image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
+          command:
+            - sh
+            - -c
+            - |
+              set -eu
+              mkdir -p /root/.cache/whisper
+              if [ ! -f /root/.cache/whisper/ggml-small.bin ]; then
+                echo "Downloading whisper-small model..."
+                # -f fails on HTTP errors; download to .part and rename so a failed or
+                # interrupted transfer is never mistaken for a valid model on restart.
+                curl -fL -o /root/.cache/whisper/ggml-small.bin.part \
+                  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
+                mv /root/.cache/whisper/ggml-small.bin.part \
+                  /root/.cache/whisper/ggml-small.bin
+              else
+                echo "whisper-small model already present, skipping download"
+              fi
+          volumeMounts:
+            - name: models
+              mountPath: /root/.cache
       containers:
         - name: llama-swap
           image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
diff --git a/apps/openwebui/release.yaml b/apps/openwebui/release.yaml
index 7395c05..189b5c0 100644
--- a/apps/openwebui/release.yaml
+++ b/apps/openwebui/release.yaml
@@ -74,6 +74,28 @@ spec:
         value: "false"
       - name: OAUTH_MERGE_ACCOUNTS_BY_EMAIL
         value: "true"
+      # STT via whisper-server (routed through llama-swap)
+      - name: AUDIO_STT_ENGINE
+        value: "openai"
+      - name: AUDIO_STT_OPENAI_API_BASE_URL
+        value: "http://llama.llama.svc.cluster.local:11434/v1"
+      - name: AUDIO_STT_OPENAI_API_KEY
+        value: "ignored"
+      - name: AUDIO_STT_MODEL
+        value: "whisper-small"
+      - name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
+        value: "audio/wav,audio/wave"
+      # TTS via OuteTTS (routed through llama-swap)
+      - name: AUDIO_TTS_ENGINE
+        value: "openai"
+      - name: AUDIO_TTS_OPENAI_API_BASE_URL
+        value: "http://llama.llama.svc.cluster.local:11434/v1"
+      - name: AUDIO_TTS_OPENAI_API_KEY
+        value: "ignored"
+      - name: AUDIO_TTS_MODEL
+        value: "outetts-tts"
+      - name: AUDIO_TTS_VOICE
+        value: "default"
       # Image generation via llama-swap sd-server
       - name: ENABLE_IMAGE_GENERATION
         value: "true"