This commit is contained in:
@@ -20,7 +20,6 @@ hooks:
|
|||||||
preload:
|
preload:
|
||||||
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
- "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
||||||
- "whisper-small"
|
- "whisper-small"
|
||||||
- "outetts-tts"
|
|
||||||
|
|
||||||
# matrix replaces groups (they are mutually exclusive).
|
# matrix replaces groups (they are mutually exclusive).
|
||||||
# The small 0.8B model runs alongside any LLM.
|
# The small 0.8B model runs alongside any LLM.
|
||||||
@@ -29,7 +28,6 @@ matrix:
|
|||||||
vars:
|
vars:
|
||||||
q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
q8: "Qwen3.5-0.8B-GGUF-nothink:Q4_K_XL"
|
||||||
stt: "whisper-small"
|
stt: "whisper-small"
|
||||||
tts: "outetts-tts"
|
|
||||||
flux: "flux2-klein-4b:Q4_K_M"
|
flux: "flux2-klein-4b:Q4_K_M"
|
||||||
coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
|
coder: "Qwen3-Coder-Next-GGUF:Q4_K_M"
|
||||||
q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
|
q35t: "Qwen3.5-35B-A3B-GGUF:Q4_K_M"
|
||||||
@@ -56,9 +54,9 @@ matrix:
|
|||||||
|
|
||||||
sets:
|
sets:
|
||||||
# any LLM can run alongside the small always-on model + STT + TTS (all CPU, no VRAM cost)
|
# any LLM can run alongside the small always-on model + STT (both CPU, no VRAM cost)
|
||||||
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt & tts"
|
with_q8: "(coder | q35t | q35nt | q35ht | q35hnt | q4t | q4nt | q4ht | q4hnt | g26xl | g26q2 | ge4xl | ge2xl | q36t | q36nt | haut | haunt | mtpt | mtpnt) & q8 & stt"
|
||||||
# FLUX runs alone — evicts everything including q8, but keeps STT+TTS for voice during image gen
|
# FLUX runs alone — evicts everything including q8, but keeps STT for voice during image gen
|
||||||
image_gen: "flux & stt & tts"
|
image_gen: "flux & stt"
|
||||||
|
|
||||||
models:
|
models:
|
||||||
"Qwen3-Coder-Next-GGUF:Q4_K_M":
|
"Qwen3-Coder-Next-GGUF:Q4_K_M":
|
||||||
@@ -252,19 +250,6 @@ models:
|
|||||||
--threads 6
|
--threads 6
|
||||||
--no-gpu
|
--no-gpu
|
||||||
|
|
||||||
# TTS via OuteTTS 0.3 1B + WavTokenizer vocoder (CPU-only, always loaded)
|
|
||||||
# Models auto-downloaded from HuggingFace on first start
|
|
||||||
# OuteTTS 0.3 1B: ~1GB RAM, WavTokenizer: ~600MB RAM
|
|
||||||
# Exposes /v1/audio/speech compatible with OpenAI TTS API
|
|
||||||
"outetts-tts":
|
|
||||||
checkEndpoint: none
|
|
||||||
cmd: |
|
|
||||||
llama-server
|
|
||||||
-hf OuteAI/OuteTTS-0.3-1B-GGUF
|
|
||||||
-hff OuteTTS-0.3-1B-Q8_0.gguf
|
|
||||||
-mv /root/.cache/huggingface/hub/models--ggml-org--WavTokenizer/snapshots/0c97fdc098158ec9bf4e703cd5f81a5aa20520e6/WavTokenizer-Large-75-F16.gguf
|
|
||||||
-c 4096
|
|
||||||
${cpu_args}
|
|
||||||
|
|
||||||
# Image generation via stable-diffusion.cpp (sd-server)
|
# Image generation via stable-diffusion.cpp (sd-server)
|
||||||
# Models must be pre-downloaded to /root/.cache/sd/
|
# Models must be pre-downloaded to /root/.cache/sd/
|
||||||
|
|||||||
@@ -0,0 +1,65 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: kokoro
|
||||||
|
namespace: llama
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: kokoro
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: kokoro
|
||||||
|
spec:
|
||||||
|
# Prevent Kubernetes from auto-injecting KOKORO_PORT=tcp://... (legacy
|
||||||
|
# Docker-link-style env vars from same-namespace Services), which Kokoro
|
||||||
|
# tries to parse as an integer and fails.
|
||||||
|
enableServiceLinks: false
|
||||||
|
containers:
|
||||||
|
- name: kokoro
|
||||||
|
# OpenAI-compatible Kokoro TTS server, CPU-only PyTorch backend
|
||||||
|
# Exposes POST /v1/audio/speech with multiple voices and streaming
|
||||||
|
image: hwdsl2/kokoro-server:latest@sha256:42886b8720e901f7e31aba2050cd03867767eb9f609bbc38fe93852e72f0feeb
|
||||||
|
ports:
|
||||||
|
- containerPort: 8880
|
||||||
|
name: http
|
||||||
|
protocol: TCP
|
||||||
|
env:
|
||||||
|
# Default voice (can be overridden per-request)
|
||||||
|
- name: KOKORO_VOICE
|
||||||
|
value: "af_heart"
|
||||||
|
volumeMounts:
|
||||||
|
# Persistent cache for the ~320MB Kokoro model
|
||||||
|
- name: cache
|
||||||
|
mountPath: /var/lib/kokoro
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "2Gi"
|
||||||
|
cpu: "500m"
|
||||||
|
limits:
|
||||||
|
memory: "6Gi"
|
||||||
|
volumes:
|
||||||
|
- name: cache
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: kokoro-cache-lvmssd
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: kokoro
|
||||||
|
namespace: llama
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
port: 8880
|
||||||
|
targetPort: 8880
|
||||||
|
protocol: TCP
|
||||||
|
selector:
|
||||||
|
app: kokoro
|
||||||
|
|
||||||
@@ -6,7 +6,9 @@ resources:
|
|||||||
- auth-proxy.yaml
|
- auth-proxy.yaml
|
||||||
- ingress.yaml
|
- ingress.yaml
|
||||||
- pvc-ssd.yaml
|
- pvc-ssd.yaml
|
||||||
|
- pvc-kokoro-ssd.yaml
|
||||||
- deployment.yaml
|
- deployment.yaml
|
||||||
|
- kokoro.yaml
|
||||||
configMapGenerator:
|
configMapGenerator:
|
||||||
- name: llama-swap
|
- name: llama-swap
|
||||||
namespace: llama
|
namespace: llama
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
---
|
||||||
|
apiVersion: local.openebs.io/v1alpha1
|
||||||
|
kind: LVMVolume
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
kubernetes.io/nodename: anapistula-delrosalae
|
||||||
|
name: kokoro-cache-lvmssd
|
||||||
|
namespace: openebs
|
||||||
|
spec:
|
||||||
|
capacity: "2147483648"
|
||||||
|
ownerNodeID: anapistula-delrosalae
|
||||||
|
shared: "yes"
|
||||||
|
thinProvision: "no"
|
||||||
|
vgPattern: ^openebs-ssd$
|
||||||
|
volGroup: openebs-ssd
|
||||||
|
---
|
||||||
|
kind: PersistentVolume
|
||||||
|
apiVersion: v1
|
||||||
|
metadata:
|
||||||
|
name: kokoro-cache-lvmssd
|
||||||
|
spec:
|
||||||
|
capacity:
|
||||||
|
storage: 2Gi
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
persistentVolumeReclaimPolicy: Retain
|
||||||
|
storageClassName: ssd-lvmpv
|
||||||
|
volumeMode: Filesystem
|
||||||
|
csi:
|
||||||
|
driver: local.csi.openebs.io
|
||||||
|
fsType: btrfs
|
||||||
|
volumeHandle: kokoro-cache-lvmssd
|
||||||
|
---
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
apiVersion: v1
|
||||||
|
metadata:
|
||||||
|
name: kokoro-cache-lvmssd
|
||||||
|
namespace: llama
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 2Gi
|
||||||
|
storageClassName: ssd-lvmpv
|
||||||
|
volumeName: kokoro-cache-lvmssd
|
||||||
@@ -86,16 +86,17 @@ spec:
|
|||||||
- name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
|
- name: AUDIO_STT_SUPPORTED_CONTENT_TYPES
|
||||||
value: "audio/wav,audio/wave"
|
value: "audio/wav,audio/wave"
|
||||||
# TTS via OuteTTS (routed through llama-swap)
|
# TTS was previously OuteTTS routed through llama-swap; superseded by the Kokoro settings below
|
||||||
|
# TTS via dedicated Kokoro server (CPU-only, separate pod)
|
||||||
- name: AUDIO_TTS_ENGINE
|
- name: AUDIO_TTS_ENGINE
|
||||||
value: "openai"
|
value: "openai"
|
||||||
- name: AUDIO_TTS_OPENAI_API_BASE_URL
|
- name: AUDIO_TTS_OPENAI_API_BASE_URL
|
||||||
value: "http://llama.llama.svc.cluster.local:11434/v1"
|
value: "http://kokoro.llama.svc.cluster.local:8880/v1"
|
||||||
- name: AUDIO_TTS_OPENAI_API_KEY
|
- name: AUDIO_TTS_OPENAI_API_KEY
|
||||||
value: "ignored"
|
value: "ignored"
|
||||||
- name: AUDIO_TTS_MODEL
|
- name: AUDIO_TTS_MODEL
|
||||||
value: "outetts-tts"
|
value: "kokoro"
|
||||||
- name: AUDIO_TTS_VOICE
|
- name: AUDIO_TTS_VOICE
|
||||||
value: "default"
|
value: "af_heart"
|
||||||
# Image generation via llama-swap sd-server
|
# Image generation via llama-swap sd-server
|
||||||
- name: ENABLE_IMAGE_GENERATION
|
- name: ENABLE_IMAGE_GENERATION
|
||||||
value: "true"
|
value: "true"
|
||||||
|
|||||||
Reference in New Issue
Block a user