move whisper to gpu

switch kokoro to remsky/Kokoro-FastAPI
2026-05-21 22:02:34 +02:00 · 2026-05-21 21:55:34 +02:00
4 changed files with 8 additions and 70 deletions
@@ -235,9 +235,10 @@ models:
      --parallel 1
      ${common_args}
-  # STT via whisper.cpp (CPU-only, always loaded)
+  # STT via whisper.cpp (Vulkan GPU on RX 580, always loaded, ~600MB VRAM)
-  # Model auto-downloaded from HuggingFace on first start
+  # Model auto-downloaded by init container, see deployment.yaml
-  # whisper-small: ~240MB RAM, good accuracy/speed tradeoff on R5 3600
+  # Note: Vulkan whisper on AMD GPUs has known quality issues on some cards;
  # if transcriptions come out as garbage/gibberish, add --no-gpu to fall back to CPU.
  "whisper-small":
    checkEndpoint: none
    cmd: |
@@ -248,7 +249,6 @@ models:
        --inference-path /transcriptions
        --convert
        --threads 6
        --no-gpu
  # Image generation via stable-diffusion.cpp (sd-server)
@@ -16,37 +16,22 @@ spec:
      labels:
        app: kokoro
    spec:
      # Disable service links so Kubernetes does not auto-inject
      # KOKORO_PORT=tcp://... (a legacy Docker-link-style env var created for
      # same-namespace Services); Kokoro tries to parse it as an integer and fails.
      enableServiceLinks: false
      containers:
        - name: kokoro
-          # OpenAI-compatible Kokoro TTS server, CPU-only PyTorch backend
+          # OpenAI-compatible Kokoro-FastAPI TTS server, CPU PyTorch backend.
-          # Exposes POST /v1/audio/speech with multiple voices and streaming
+          # Models baked into the image (no PVC needed).
-          image: hwdsl2/kokoro-server:latest@sha256:42886b8720e901f7e31aba2050cd03867767eb9f609bbc38fe93852e72f0feeb
+          # v0.3.0 includes the fix for the per-request voice tensor memory leak (#459).
          image: ghcr.io/remsky/kokoro-fastapi-cpu:v0.3.0
          ports:
            - containerPort: 8880
              name: http
              protocol: TCP
          env:
            # Default voice (can be overridden per-request)
            - name: KOKORO_VOICE
              value: "af_heart"
          volumeMounts:
            # Persistent cache for the ~320MB Kokoro model
            - name: cache
              mountPath: /var/lib/kokoro
          resources:
            requests:
              memory: "2Gi"
              cpu: "500m"
            limits:
              memory: "6Gi"
      volumes:
        - name: cache
          persistentVolumeClaim:
            claimName: kokoro-cache-lvmssd
 ---
 apiVersion: v1
 kind: Service
@@ -6,7 +6,6 @@ resources:
  - auth-proxy.yaml
  - ingress.yaml
  - pvc-ssd.yaml
  - pvc-kokoro-ssd.yaml
  - deployment.yaml
  - kokoro.yaml
 configMapGenerator:
@@ -1,46 +0,0 @@
 ---
 apiVersion: local.openebs.io/v1alpha1
 kind: LVMVolume
 metadata:
  labels:
    kubernetes.io/nodename: anapistula-delrosalae
  name: kokoro-cache-lvmssd
  namespace: openebs
 spec:
  capacity: "2147483648"
  ownerNodeID: anapistula-delrosalae
  shared: "yes"
  thinProvision: "no"
  vgPattern: ^openebs-ssd$
  volGroup: openebs-ssd
 ---
 kind: PersistentVolume
 apiVersion: v1
 metadata:
  name: kokoro-cache-lvmssd
 spec:
  capacity:
    storage: 2Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ssd-lvmpv
  volumeMode: Filesystem
  csi:
    driver: local.csi.openebs.io
    fsType: btrfs
    volumeHandle: kokoro-cache-lvmssd
 ---
 kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
  name: kokoro-cache-lvmssd
  namespace: llama
 spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 2Gi
  storageClassName: ssd-lvmpv
  volumeName: kokoro-cache-lvmssd
Author	SHA1	Message	Date
Lumpiasty	fc2c15d154	move whisper to gpu (CI: ci/woodpecker/push/flux-reconcile-source — pipeline was successful)	2026-05-21 22:02:34 +02:00
Lumpiasty	02b3ec13b4	switch kokoro to remsky/Kokoro-FastAPI	2026-05-21 21:55:34 +02:00