From 611f9f388696b22a89969ab05ce71fb05df24a97 Mon Sep 17 00:00:00 2001
From: Lumpiasty <arek.dzski@gmail.com>
Date: Thu, 21 May 2026 20:09:15 +0200
Subject: [PATCH] add tts and stt to llama-swap and openwebui

---
 apps/llama/configs/config.yaml | 42 ++++++++++++++++++++++++++++++----
 apps/llama/deployment.yaml     | 18 +++++++++++++++
 apps/openwebui/release.yaml    | 22 ++++++++++++++++++
 3 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml
index 7d13bdb..2f222f9 100644
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -5,6 +5,7 @@ logToStdout: "both" # proxy and upstream
 macros:
   base_args: "--no-warmup --port ${PORT}"
   common_args: "--fit-target 1536 --no-warmup --port ${PORT}"
+  cpu_args: "--no-warmup --port ${PORT} -ngl 0"
   ctx_64k: "--ctx-size 65536"
   ctx_128k: "--ctx-size 131072"
   ctx_256k: "--ctx-size 262144"
@@ -18,6 +19,8 @@ hooks:
   on_startup:
     preload:
       - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
+      - "whisper-small"
+      - "outetts-tts"
 
 # matrix replaces groups (they are mutually exclusive).
 # The small 0.8B model runs alongside any LLM.
@@ -25,6 +28,8 @@ hooks:
 matrix:
   vars:
     q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
+    stt: "whisper-small"
+    tts: "outetts-tts"
     flux: "flux2-klein-4b:Q4_K_M"
     coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
     q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
@@ -50,10 +55,10 @@ matrix:
     flux: 10  # large files, slow to reload
 
   sets:
-    # any LLM can run alongside the small always-on model
-    with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8"
-    # FLUX runs alone — evicts everything including q8
-    image_gen: "flux"
+    # any LLM can run alongside the small always-on model + STT + TTS (STT/TTS are CPU-only, so no VRAM cost)
+    with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts"
+    # FLUX evicts every LLM, including q8, but keeps STT+TTS loaded so voice still works during image gen
+    image_gen: "flux & stt & tts"
 
 models:
   "Qwen3-Coder-Next-GGUF:Q4_K_M":
@@ -232,6 +237,35 @@ models:
       --parallel 1
       ${common_args}
 
+  # STT via whisper.cpp (CPU-only, always loaded)
+  # Model is fetched to this path by the download-whisper initContainer in apps/llama/deployment.yaml
+  # whisper-small: ~240MB RAM, good accuracy/speed tradeoff on R5 3600
+  "whisper-small":
+    checkEndpoint: none
+    cmd: |
+      whisper-server
+        --port ${PORT}
+        -m /root/.cache/whisper/ggml-small.bin
+        --request-path /v1/audio
+        --inference-path /transcriptions
+        --threads 6
+        --no-gpu
+
+  # TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded)
+  # Models auto-downloaded from HuggingFace on first start
+  # OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM
+  # Exposes /v1/audio/speech compatible with OpenAI TTS API
+  "outetts-tts":
+    checkEndpoint: none
+    cmd: |
+      llama-server
+        -hf OuteAI/OuteTTS-0.3-1B-GGUF
+        -hff OuteTTS-0.3-1B-Q8_0.gguf
+        -hfv ggml-org/WavTokenizer
+        -hffv WavTokenizer-Large-75-F16.gguf
+        -c 4096
+        ${cpu_args}
+
   # Image generation via stable-diffusion.cpp (sd-server)
   # Models must be pre-downloaded to /root/.cache/sd/
   # FLUX.2-klein-4B: fast unified text-to-image and image editing model (Apache 2.0)
diff --git a/apps/llama/deployment.yaml b/apps/llama/deployment.yaml
index 656d268..6936918 100644
--- a/apps/llama/deployment.yaml
+++ b/apps/llama/deployment.yaml
@@ -16,6 +16,33 @@ spec:
       labels:
         app: llama-swap
     spec:
+      # Fetch the whisper.cpp model into the "models" volume before llama-swap
+      # starts; the whisper-small entry in config.yaml passes this path via -m.
+      initContainers:
+        - name: download-whisper
+          image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
+          command:
+            - sh
+            - -c
+            - |
+              set -eu
+              model=/root/.cache/whisper/ggml-small.bin
+              mkdir -p /root/.cache/whisper
+              # -s (exists and non-empty) so a zero-byte leftover is re-downloaded
+              if [ ! -s "$model" ]; then
+                echo "Downloading whisper-small model..."
+                # -f makes HTTP errors fail instead of saving the error page;
+                # download to a temp name and rename only on success so an
+                # interrupted transfer never leaves a truncated model behind.
+                curl -fL -o "$model.part" \
+                  https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin
+                mv "$model.part" "$model"
+              else
+                echo "whisper-small model already present, skipping download"
+              fi
+          volumeMounts:
+            - name: models
+              mountPath: /root/.cache
       containers:
         - name: llama-swap
           image: ghcr.io/mostlygeek/llama-swap:unified-vulkan-2026-05-21
diff --git a/apps/openwebui/release.yaml b/apps/openwebui/release.yaml
index 7395c05..189b5c0 100644
--- a/apps/openwebui/release.yaml
+++ b/apps/openwebui/release.yaml
@@ -74,6 +74,28 @@ spec:
         value: "false"
       - name: OAUTH_MERGE_ACCOUNTS_BY_EMAIL
         value: "true"
+      # STT via whisper-server (routed through llama-swap)
+      - name: AUDIO_STT_ENGINE
+        value: "openai"
+      - name: AUDIO_STT_OPENAI_API_BASE_URL
+        value: "http://llama.llama.svc.cluster.local:11434/v1"
+      - name: AUDIO_STT_OPENAI_API_KEY
+        value: "ignored"
+      - name: AUDIO_STT_MODEL
+        value: "whisper-small"
+      - name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
+        value: "audio/wav,audio/wave"
+      # TTS via OuteTTS (routed through llama-swap)
+      - name: AUDIO_TTS_ENGINE
+        value: "openai"
+      - name: AUDIO_TTS_OPENAI_API_BASE_URL
+        value: "http://llama.llama.svc.cluster.local:11434/v1"
+      - name: AUDIO_TTS_OPENAI_API_KEY
+        value: "ignored"
+      - name: AUDIO_TTS_MODEL
+        value: "outetts-tts"
+      - name: AUDIO_TTS_VOICE
+        value: "default"
       # Image generation via llama-swap sd-server
       - name: ENABLE_IMAGE_GENERATION
         value: "true"