7 Commits

6 changed files with 213 additions and 29 deletions

View File

@@ -1,4 +1,6 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/mostlygeek/llama-swap/refs/heads/main/config-schema.json
healthCheckTimeout: 600
logToStdout: "both" # proxy and upstream
models:
"DeepSeek-R1-0528-Qwen3-8B-GGUF":
@@ -6,7 +8,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF:Q4_K_M
--n-gpu-layers 37
--ctx-size 16384
--no-warmup
--port ${PORT}
@@ -16,7 +17,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--n-gpu-layers 37
--ctx-size 16384
--no-warmup
--port ${PORT}
@@ -26,7 +26,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-8B-GGUF:Q4_K_M
--n-gpu-layers 37
--ctx-size 16384
--jinja
--chat-template-file /config/qwen_nothink_chat_template.jinja
@@ -39,7 +38,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3n-E4B-it-GGUF:UD-Q4_K_XL
--ctx-size 16384
--n-gpu-layers 99
--seed 3407
--prio 2
--temp 1.0
@@ -56,7 +54,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384
--n-gpu-layers 99
--prio 2
--temp 1.0
--repeat-penalty 1.0
@@ -72,7 +69,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q4_K_M
--ctx-size 16384
--n-gpu-layers 99
--prio 2
--temp 1.0
--repeat-penalty 1.0
@@ -89,7 +85,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3-12b-it-GGUF:Q2_K_L
--ctx-size 16384
--n-gpu-layers 99
--prio 2
--temp 1.0
--repeat-penalty 1.0
@@ -105,7 +100,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384
--n-gpu-layers 99
--prio 2
--temp 1.0
--repeat-penalty 1.0
@@ -121,7 +115,6 @@ models:
/app/llama-server
-hf unsloth/gemma-3-4b-it-GGUF:Q4_K_M
--ctx-size 16384
--n-gpu-layers 99
--prio 2
--temp 1.0
--repeat-penalty 1.0
@@ -137,7 +130,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.6
@@ -153,7 +145,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 262144
--predict 81920
--temp 0.6
@@ -172,7 +163,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7
@@ -188,7 +178,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 262144
--predict 81920
--temp 0.7
@@ -207,7 +196,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:IQ1_S
--n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7
@@ -223,7 +211,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-32B-Instruct-GGUF:Q2_K_L
--n-gpu-layers 99
--ctx-size 16384
--predict 8192
--temp 0.7
@@ -239,7 +226,6 @@ models:
cmd: |
/app/llama-server
-hf unsloth/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M
--n-gpu-layers 37
--ctx-size 16384
--predict 8192
--temp 0.7
@@ -255,7 +241,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -275,7 +260,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -295,7 +279,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -315,7 +298,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -335,7 +317,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -355,7 +336,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -375,7 +355,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -394,7 +373,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-4B-Thinking-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -413,7 +391,6 @@ models:
cmd: |
/app/llama-server
-hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -432,7 +409,6 @@ models:
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Instruct-abliterated-GGUF:Q6_K
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto
@@ -452,7 +428,6 @@ models:
cmd: |
/app/llama-server
-hf noctrex/Huihui-Qwen3-VL-8B-Thinking-abliterated-GGUF:Q6_K
--n-gpu-layers 99
--ctx-size 12288
--predict 4096
--flash-attn auto

View File

@@ -30,7 +30,7 @@ spec:
protocol: TCP
volumeMounts:
- name: models
mountPath: /app/.cache
mountPath: /root/.cache
- mountPath: /dev/kfd
name: kfd
- mountPath: /dev/dri

View File

@@ -15,7 +15,7 @@ spec:
- name: renovate
# Update this to the latest available and then enable Renovate on
# the manifest
image: renovate/renovate:43.39.2-full
image: renovate/renovate:43.43.0-full
envFrom:
- secretRef:
name: renovate-gitea-token

View File

@@ -0,0 +1,200 @@
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: system:coredns
labels:
k8s-app: kube-dns
rules:
- apiGroups: [""]
resources:
- endpoints
- services
- pods
- namespaces
verbs: ["list", "watch"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: system:coredns
labels:
k8s-app: kube-dns
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:coredns
subjects:
- kind: ServiceAccount
name: coredns
namespace: kube-system
---
apiVersion: v1
kind: ConfigMap
metadata:
name: coredns
namespace: kube-system
data:
Corefile: |-
.:53 {
errors
health {
lameduck 5s
}
ready
log . {
class all
}
prometheus :9153
# Return NODATA for AAAA on selected domains to force IPv4.
template IN AAAA {
match "(^|\.)huggingface\.co\.$"
rcode NOERROR
fallthrough
}
kubernetes homelab.lumpiasty.xyz cluster.local in-addr.arpa ip6.arpa {
pods insecure
fallthrough in-addr.arpa ip6.arpa
ttl 30
}
forward . /etc/resolv.conf {
max_concurrent 1000
}
cache 30 {
disable success cluster.local
disable denial cluster.local
}
loop
reload
loadbalance
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: coredns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: CoreDNS
spec:
replicas: 1
selector:
matchLabels:
k8s-app: kube-dns
template:
metadata:
labels:
k8s-app: kube-dns
spec:
priorityClassName: system-cluster-critical
serviceAccountName: coredns
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
nodeSelector:
kubernetes.io/os: linux
containers:
- name: coredns
image: registry.k8s.io/coredns/coredns:v1.14.1
imagePullPolicy: IfNotPresent
args: ["-conf", "/etc/coredns/Corefile"]
ports:
- containerPort: 53
name: dns
protocol: UDP
- containerPort: 53
name: dns-tcp
protocol: TCP
- containerPort: 9153
name: metrics
protocol: TCP
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 5
periodSeconds: 10
readinessProbe:
httpGet:
path: /ready
port: 8181
scheme: HTTP
timeoutSeconds: 1
successThreshold: 1
failureThreshold: 3
periodSeconds: 10
resources:
limits:
memory: 170Mi
requests:
cpu: 0
memory: 70Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
add:
- NET_BIND_SERVICE
drop:
- ALL
readOnlyRootFilesystem: true
volumeMounts:
- name: config-volume
mountPath: /etc/coredns
readOnly: true
dnsPolicy: Default
volumes:
- name: config-volume
configMap:
name: coredns
items:
- key: Corefile
path: Corefile
---
apiVersion: v1
kind: Service
metadata:
name: kube-dns
namespace: kube-system
labels:
k8s-app: kube-dns
kubernetes.io/name: CoreDNS
spec:
type: ClusterIP
clusterIP: 10.43.0.10
clusterIPs:
- 10.43.0.10
- 2001:470:61a3:300::a
ipFamilyPolicy: RequireDualStack
ipFamilies:
- IPv4
- IPv6
selector:
k8s-app: kube-dns
ports:
- name: dns
port: 53
protocol: UDP
targetPort: 53
- name: dns-tcp
port: 53
protocol: TCP
targetPort: 53

View File

@@ -4,6 +4,7 @@ resources:
- controllers/k8up-crd-4.8.3.yaml
- controllers/cilium.yaml
- controllers/nginx-ingress.yaml
- controllers/coredns.yaml
- controllers/dns-public.yaml
- controllers/cert-manager.yaml
- controllers/cert-manager-webhook-ovh.yaml

View File

@@ -14,7 +14,15 @@ machine:
hostDNS:
forwardKubeDNSToHost: false
kubelet:
clusterDNS:
- 10.43.0.10
- 2001:470:61a3:300::a
cluster:
# We're configuring CoreDNS ourselves, so disable the default one
coreDNS:
disabled: true
network:
# Likely redundant, we use Cilium as IPAM with their CRDs
podSubnets: