From 989732e1b5763f00edb9446afcede80a993daaef Mon Sep 17 00:00:00 2001 From: Lumpiasty Date: Thu, 21 May 2026 21:23:36 +0200 Subject: [PATCH] move kokoro to separate deployment --- apps/llama/configs/config.yaml | 21 ++--------- apps/llama/kokoro.yaml | 65 ++++++++++++++++++++++++++++++++++ apps/llama/kustomization.yaml | 2 ++ apps/llama/pvc-kokoro-ssd.yaml | 46 ++++++++++++++++++++++++ apps/openwebui/release.yaml | 7 ++-- 5 files changed, 120 insertions(+), 21 deletions(-) create mode 100644 apps/llama/kokoro.yaml create mode 100644 apps/llama/pvc-kokoro-ssd.yaml diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml index 9c59140..03dce68 100644 --- a/apps/llama/configs/config.yaml +++ b/apps/llama/configs/config.yaml @@ -20,7 +20,6 @@ hooks: preload: - "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL" - "whisper-small" - - "outetts-tts" # matrix replaces groups (they are mutually exclusive). # The small 0.8B model runs alongside any LLM. @@ -29,7 +28,6 @@ matrix: vars: q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL" stt: "whisper-small" - tts: "outetts-tts" flux: "flux2-klein-4b:Q4_K_M" coder: "Qwen3-Coder-Next-GGUF:Q4_K_M" q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M" @@ -56,9 +54,9 @@ matrix: sets: # any LLM can run alongside the small always-on model + STT + TTS (all CPU, no VRAM cost) - with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts" - # FLUX runs alone — evicts everything including q8, but keeps STT+TTS for voice during image gen - image_gen: "flux & stt & tts" + with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt" + # FLUX runs alone — evicts everything including q8, but keeps STT for voice during image gen + image_gen: "flux & stt" models: "Qwen3-Coder-Next-GGUF:Q4_K_M": @@ -252,19 +250,6 @@ models: --threads 6 --no-gpu - # TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded) - # Models auto-downloaded from HuggingFace on first start - # OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM - # Exposes /v1/audio/speech compatible with OpenAI TTS API - "outetts-tts": - checkEndpoint: none - cmd: | - llama-server - -hf OuteAI/OuteTTS-0.3-1B-GGUF - -hff OuteTTS-0.3-1B-Q8_0.gguf - -mv /root/.cache/huggingface/hub/models--ggml-org--WavTokenizer/snapshots/0c97fdc098158ec9bf4e703cd5f81a5aa20520e6/WavTokenizer-Large-75-F16.gguf - -c 4096 - ${cpu_args} # Image generation via stable-diffusion.cpp (sd-server) # Models must be pre-downloaded to /root/.cache/sd/ diff --git a/apps/llama/kokoro.yaml b/apps/llama/kokoro.yaml new file mode 100644 index 0000000..71e3fc2 --- /dev/null +++ b/apps/llama/kokoro.yaml @@ -0,0 +1,65 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kokoro + namespace: llama +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: kokoro + template: + metadata: + labels: + app: kokoro + spec: + # Prevent Kubernetes from auto-injecting KOKORO_PORT=tcp://... (legacy + # Docker-link-style env vars from same-namespace Services), which Kokoro + # tries to parse as an integer and fails. + enableServiceLinks: false + containers: + - name: kokoro + # OpenAI-compatible Kokoro TTS server, CPU-only PyTorch backend + # Exposes POST /v1/audio/speech with multiple voices and streaming + image: hwdsl2/kokoro-server:latest@sha256:42886b8720e901f7e31aba2050cd03867767eb9f609bbc38fe93852e72f0feeb + ports: + - containerPort: 8880 + name: http + protocol: TCP + env: + # Default voice (can be overridden per-request) + - name: KOKORO_VOICE + value: "af_heart" + volumeMounts: + # Persistent cache for the ~320MB Kokoro model + - name: cache + mountPath: /var/lib/kokoro + resources: + requests: + memory: "2Gi" + cpu: "500m" + limits: + memory: "6Gi" + volumes: + - name: cache + persistentVolumeClaim: + claimName: kokoro-cache-lvmssd +--- +apiVersion: v1 +kind: Service +metadata: + name: kokoro + namespace: llama +spec: + type: ClusterIP + ports: + - name: http + port: 8880 + targetPort: 8880 + protocol: TCP + selector: + app: kokoro + diff --git a/apps/llama/kustomization.yaml b/apps/llama/kustomization.yaml index 6813e0b..74cbd37 100644 --- a/apps/llama/kustomization.yaml +++ b/apps/llama/kustomization.yaml @@ -6,7 +6,9 @@ resources: - auth-proxy.yaml - ingress.yaml - pvc-ssd.yaml + - pvc-kokoro-ssd.yaml - deployment.yaml + - kokoro.yaml configMapGenerator: - name: llama-swap namespace: llama diff --git a/apps/llama/pvc-kokoro-ssd.yaml b/apps/llama/pvc-kokoro-ssd.yaml new file mode 100644 index 0000000..643bf76 --- /dev/null +++ b/apps/llama/pvc-kokoro-ssd.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: local.openebs.io/v1alpha1 +kind: LVMVolume +metadata: + labels: + kubernetes.io/nodename: anapistula-delrosalae + name: kokoro-cache-lvmssd + namespace: openebs +spec: + capacity: "2147483648" + ownerNodeID: anapistula-delrosalae + shared: "yes" + thinProvision: "no" + vgPattern: ^openebs-ssd$ + volGroup: openebs-ssd +--- +kind: PersistentVolume +apiVersion: v1 +metadata: + name: kokoro-cache-lvmssd +spec: + capacity: + storage: 2Gi + accessModes: + - ReadWriteOnce + persistentVolumeReclaimPolicy: Retain + storageClassName: ssd-lvmpv + volumeMode: Filesystem + csi: + driver: local.csi.openebs.io + fsType: btrfs + volumeHandle: kokoro-cache-lvmssd +--- +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: kokoro-cache-lvmssd + namespace: llama +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi + storageClassName: ssd-lvmpv + volumeName: kokoro-cache-lvmssd diff --git a/apps/openwebui/release.yaml b/apps/openwebui/release.yaml index 189b5c0..38ad343 100644 --- a/apps/openwebui/release.yaml +++ b/apps/openwebui/release.yaml @@ -86,16 +86,17 @@ spec: - name: AUDIO_STT_SUPPORTED_CONTENT_TYPES value: "audio/wav,audio/wave" # TTS via OuteTTS (routed through llama-swap) + # TTS via dedicated Kokoro server (CPU-only, separate pod) - name: AUDIO_TTS_ENGINE value: "openai" - name: AUDIO_TTS_OPENAI_API_BASE_URL - value: "http://llama.llama.svc.cluster.local:11434/v1" + value: "http://kokoro.llama.svc.cluster.local:8880/v1" - name: AUDIO_TTS_OPENAI_API_KEY value: "ignored" - name: AUDIO_TTS_MODEL - value: "outetts-tts" + value: "kokoro" - name: AUDIO_TTS_VOICE - value: "default" + value: "af_heart" # Image generation via llama-swap sd-server - name: ENABLE_IMAGE_GENERATION value: "true"