move kokoro to separate deployment
ci/woodpecker/push/flux-reconcile-source Pipeline was successful

This commit is contained in:
2026-05-21 21:23:36 +02:00
parent ab438be629
commit 989732e1b5
5 changed files with 120 additions and 21 deletions
+3 -18
View File
@@ -20,7 +20,6 @@ hooks:
preload:
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
- "whisper-small"
- "outetts-tts"
# matrix replaces groups (they are mutually exclusive).
# The small 0.8B model runs alongside any LLM.
@@ -29,7 +28,6 @@ matrix:
vars:
q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
stt: "whisper-small"
tts: "outetts-tts"
flux: "flux2-klein-4b:Q4_K_M"
coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
@@ -56,9 +54,9 @@ matrix:
sets:
# any LLM can run alongside the small always-on model + STT (both CPU, no VRAM cost); TTS is served by the separate kokoro deployment
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts"
# FLUX runs alone — evicts everything including q8, but keeps STT+TTS for voice during image gen
image_gen: "flux & stt & tts"
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt"
# FLUX runs alone — evicts everything including q8, but keeps STT for voice during image gen
image_gen: "flux & stt"
models:
"Qwen3-Coder-Next-GGUF:Q4_K_M":
@@ -252,19 +250,6 @@ models:
--threads 6
--no-gpu
# TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded)
# Models auto-downloaded from HuggingFace on first start
# OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM
# Exposes /v1/audio/speech compatible with OpenAI TTS API
"outetts-tts":
checkEndpoint: none
cmd: |
llama-server
-hf OuteAI/OuteTTS-0.3-1B-GGUF
-hff OuteTTS-0.3-1B-Q8_0.gguf
-mv /root/.cache/huggingface/hub/models--ggml-org--WavTokenizer/snapshots/0c97fdc098158ec9bf4e703cd5f81a5aa20520e6/WavTokenizer-Large-75-F16.gguf
-c 4096
${cpu_args}
# Image generation via stable-diffusion.cpp (sd-server)
# Models must be pre-downloaded to /root/.cache/sd/
+65
View File
@@ -0,0 +1,65 @@
---
# Dedicated Kokoro TTS server, running as its own pod in the llama namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kokoro
  namespace: llama
spec:
  replicas: 1
  # Recreate stops the old pod before starting the new one; the cache volume
  # mounted below is claimed ReadWriteOnce.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: kokoro
  template:
    metadata:
      labels:
        app: kokoro
    spec:
      # Prevent Kubernetes from auto-injecting KOKORO_PORT=tcp://... (legacy
      # Docker-link-style env vars from same-namespace Services), which Kokoro
      # tries to parse as an integer and fails.
      enableServiceLinks: false
      containers:
        - name: kokoro
          # OpenAI-compatible Kokoro TTS server, CPU-only PyTorch backend
          # Exposes POST /v1/audio/speech with multiple voices and streaming
          # The image is pinned by digest, so the pulled content stays fixed
          # even though the tag is "latest".
          image: hwdsl2/kokoro-server:latest@sha256:42886b8720e901f7e31aba2050cd03867767eb9f609bbc38fe93852e72f0feeb
          ports:
            - containerPort: 8880
              name: http
              protocol: TCP
          env:
            # Default voice (can be overridden per-request)
            - name: KOKORO_VOICE
              value: "af_heart"
          # Keep the pod out of the Service endpoints until the HTTP port
          # accepts connections, so clients are not routed to a server that
          # is still starting. A plain TCP check is used because the image's
          # health endpoint is not known here. No liveness probe is added, so
          # a slow start never triggers a restart.
          readinessProbe:
            tcpSocket:
              port: http
            periodSeconds: 10
            failureThreshold: 3
          volumeMounts:
            # Persistent cache for the ~320MB Kokoro model
            - name: cache
              mountPath: /var/lib/kokoro
          resources:
            requests:
              memory: "2Gi"
              cpu: "500m"
            # Only memory is capped; no CPU limit is set.
            limits:
              memory: "6Gi"
      volumes:
        - name: cache
          persistentVolumeClaim:
            claimName: kokoro-cache-lvmssd
---
# Cluster-internal Service in front of the Kokoro TTS pod.
# With the default cluster domain it resolves as
# kokoro.llama.svc.cluster.local:8880.
apiVersion: v1
kind: Service
metadata:
  namespace: llama
  name: kokoro
spec:
  type: ClusterIP
  # Routes to pods carrying the label set by the kokoro Deployment template.
  selector:
    app: kokoro
  ports:
    - name: http
      protocol: TCP
      port: 8880
      targetPort: 8880
+2
View File
@@ -6,7 +6,9 @@ resources:
- auth-proxy.yaml
- ingress.yaml
- pvc-ssd.yaml
- pvc-kokoro-ssd.yaml
- deployment.yaml
- kokoro.yaml
configMapGenerator:
- name: llama-swap
namespace: llama
+46
View File
@@ -0,0 +1,46 @@
---
# Statically declared OpenEBS LVM volume for the Kokoro cache.
apiVersion: local.openebs.io/v1alpha1
kind: LVMVolume
metadata:
  name: kokoro-cache-lvmssd
  namespace: openebs
  labels:
    kubernetes.io/nodename: anapistula-delrosalae
spec:
  # Node and volume group the logical volume lives on.
  ownerNodeID: anapistula-delrosalae
  volGroup: openebs-ssd
  vgPattern: ^openebs-ssd$
  # 2 GiB expressed in bytes (2 * 1024^3); same size as the 2Gi PV and PVC.
  capacity: "2147483648"
  # Quoted so that "yes"/"no" stay strings instead of becoming YAML booleans.
  shared: "yes"
  thinProvision: "no"
---
# Pre-created PersistentVolume exposing the kokoro-cache-lvmssd LVM volume
# through the OpenEBS local CSI driver.
# NOTE(review): no nodeAffinity is set here, while the backing LVMVolume is
# owned by node anapistula-delrosalae. Confirm that pods using this volume
# are otherwise constrained to that node.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: kokoro-cache-lvmssd
spec:
  storageClassName: ssd-lvmpv
  volumeMode: Filesystem
  accessModes:
    - ReadWriteOnce
  capacity:
    storage: 2Gi
  # Retain: the volume and its data are kept when the claim is deleted.
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: local.csi.openebs.io
    # Same name as the LVMVolume object declared in this file.
    volumeHandle: kokoro-cache-lvmssd
    fsType: btrfs
---
# Claim used by the kokoro Deployment for its cache mount.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: llama
  name: kokoro-cache-lvmssd
spec:
  storageClassName: ssd-lvmpv
  # Bind to the pre-created PersistentVolume of the same name rather than
  # requesting a new volume.
  volumeName: kokoro-cache-lvmssd
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
+4 -3
View File
@@ -86,16 +86,17 @@ spec:
- name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
value: "audio/wav,audio/wave"
# TTS via OuteTTS (routed through llama-swap)
# TTS via dedicated Kokoro server (CPU-only, separate pod)
- name: AUDIO_TTS_ENGINE
value: "openai"
- name: AUDIO_TTS_OPENAI_API_BASE_URL
value: "http://llama.llama.svc.cluster.local:11434/v1"
value: "http://kokoro.llama.svc.cluster.local:8880/v1"
- name: AUDIO_TTS_OPENAI_API_KEY
value: "ignored"
- name: AUDIO_TTS_MODEL
value: "outetts-tts"
value: "kokoro"
- name: AUDIO_TTS_VOICE
value: "default"
value: "af_heart"
# Image generation via llama-swap sd-server
- name: ENABLE_IMAGE_GENERATION
value: "true"