move whisper to gpu

switch kokoro to remsky/Kokoro-FastAPI
2026-05-21 22:02:34 +02:00 · 2026-05-21 21:55:34 +02:00
4 changed files with 8 additions and 70 deletions
@@ -235,9 +235,10 @@ models:
      --parallel 1
      ${common_args}

-  # STT via whisper.cpp (CPU-only, always loaded)
-  # Model auto-downloaded from HuggingFace on first start
-  # whisper-small: ~240MB RAM, good accuracy/speed tradeoff on R5 3600
+  # STT via whisper.cpp (Vulkan GPU on RX 580, always loaded, ~600MB VRAM)
+  # Model auto-downloaded by init container, see deployment.yaml
+  # Note: whisper.cpp's Vulkan backend has known quality issues on some AMD
+  # GPUs; if transcriptions come out as gibberish, add --no-gpu to use the CPU.
  "whisper-small":
    checkEndpoint: none
    cmd: |
@@ -248,7 +249,6 @@ models:
        --inference-path /transcriptions
        --convert
        --threads 6
-        --no-gpu


  # Image generation via stable-diffusion.cpp (sd-server)
@@ -16,37 +16,22 @@ spec:
      labels:
        app: kokoro
    spec:
-      # Prevent Kubernetes from auto-injecting KOKORO_PORT=tcp://... (legacy
-      # Docker-link-style env vars from same-namespace Services), which Kokoro
-      # tries to parse as an integer and fails.
-      enableServiceLinks: false
      containers:
        - name: kokoro
-          # OpenAI-compatible Kokoro TTS server, CPU-only PyTorch backend
-          # Exposes POST /v1/audio/speech with multiple voices and streaming
-          image: hwdsl2/kokoro-server:latest@sha256:42886b8720e901f7e31aba2050cd03867767eb9f609bbc38fe93852e72f0feeb
+          # OpenAI-compatible Kokoro-FastAPI TTS server, CPU PyTorch backend.
+          # Models baked into the image (no PVC needed).
+          # v0.3.0 includes the fix for the per-request voice tensor memory leak (#459).
+          image: ghcr.io/remsky/kokoro-fastapi-cpu:v0.3.0
          ports:
            - containerPort: 8880
              name: http
              protocol: TCP
-          env:
-            # Default voice (can be overridden per-request)
-            - name: KOKORO_VOICE
-              value: "af_heart"
-          volumeMounts:
-            # Persistent cache for the ~320MB Kokoro model
-            - name: cache
-              mountPath: /var/lib/kokoro
          resources:
            requests:
              memory: "2Gi"
              cpu: "500m"
            limits:
              memory: "6Gi"
-      volumes:
-        - name: cache
-          persistentVolumeClaim:
-            claimName: kokoro-cache-lvmssd
 ---
 apiVersion: v1
 kind: Service
@@ -6,7 +6,6 @@ resources:
  - auth-proxy.yaml
  - ingress.yaml
  - pvc-ssd.yaml
-  - pvc-kokoro-ssd.yaml
  - deployment.yaml
  - kokoro.yaml
 configMapGenerator:
@@ -1,46 +0,0 @@
---
-apiVersion: local.openebs.io/v1alpha1
-kind: LVMVolume
-metadata:
-  labels:
-    kubernetes.io/nodename: anapistula-delrosalae
-  name: kokoro-cache-lvmssd
-  namespace: openebs
-spec:
-  capacity: "2147483648"
-  ownerNodeID: anapistula-delrosalae
-  shared: "yes"
-  thinProvision: "no"
-  vgPattern: ^openebs-ssd$
-  volGroup: openebs-ssd
---
-kind: PersistentVolume
-apiVersion: v1
-metadata:
-  name: kokoro-cache-lvmssd
-spec:
-  capacity:
-    storage: 2Gi
-  accessModes:
-    - ReadWriteOnce
-  persistentVolumeReclaimPolicy: Retain
-  storageClassName: ssd-lvmpv
-  volumeMode: Filesystem
-  csi:
-    driver: local.csi.openebs.io
-    fsType: btrfs
-    volumeHandle: kokoro-cache-lvmssd
---
-kind: PersistentVolumeClaim
-apiVersion: v1
-metadata:
-  name: kokoro-cache-lvmssd
-  namespace: llama
-spec:
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: 2Gi
-  storageClassName: ssd-lvmpv
-  volumeName: kokoro-cache-lvmssd
Author	SHA1	Message	Date
Lumpiasty	fc2c15d154	move whisper to gpu (CI: ci/woodpecker/push/flux-reconcile-source pipeline was successful)	2026-05-21 22:02:34 +02:00
Lumpiasty	02b3ec13b4	switch kokoro to remsky/Kokoro-FastAPI	2026-05-21 21:55:34 +02:00