Add 8B and 2B variants of qwen3-vl

This commit is contained in:
2025-11-15 22:21:10 +01:00
parent 669beccc35
commit f13c3ae3e7
2 changed files with 125 additions and 1 deletion

View File

@@ -250,6 +250,26 @@ models:
--no-warmup
--port ${PORT}
# Qwen3-VL 2B Instruct at Q8_0 quantization.
# NOTE(review): nesting reconstructed — this entry is a child of the top-level
# `models:` mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-2B-Instruct-GGUF":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: each line below is part of the command string — no comments
  # may go inside it. Sampling: temp 0.7 / top-p 0.85 / top-k 20 / min-p 0.05
  # with repetition, frequency and presence penalties applied.
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --temp 0.7
    --top-p 0.85
    --top-k 20
    --min-p 0.05
    --repeat-penalty 1.15
    --frequency-penalty 0.5
    --presence-penalty 0.4
    --no-warmup
    --port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF":
ttl: 600
cmd: |
@@ -270,6 +290,46 @@ models:
--no-warmup
--port ${PORT}
# Qwen3-VL 8B Instruct at Q4_K_M quantization (lower-bit quant than the 2B's
# Q8_0, matching the larger parameter count).
# NOTE(review): nesting reconstructed — child of the top-level `models:`
# mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-8B-Instruct-GGUF":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: lines below are the command string — keep comments out of it.
  # Same sampler settings as the other Instruct entries in this commit
  # (temp 0.7 / top-p 0.85 / top-k 20 / min-p 0.05, with penalties).
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --temp 0.7
    --top-p 0.85
    --top-k 20
    --min-p 0.05
    --repeat-penalty 1.15
    --frequency-penalty 0.5
    --presence-penalty 0.4
    --no-warmup
    --port ${PORT}
# Same 2B Instruct model/quant as "Qwen3-VL-2B-Instruct-GGUF"; only the
# sampler differs: top-p 0.8 (vs 0.85) and presence-penalty 0.6 (vs 0.4).
# NOTE(review): nesting reconstructed — child of the top-level `models:`
# mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-2B-Instruct-GGUF-unslothish":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: lines below are the command string — keep comments out of it.
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --temp 0.7
    --top-p 0.8
    --top-k 20
    --min-p 0.05
    --repeat-penalty 1.15
    --frequency-penalty 0.5
    --presence-penalty 0.6
    --no-warmup
    --port ${PORT}
"Qwen3-VL-4B-Instruct-GGUF-unslothish":
ttl: 600
cmd: |
@@ -290,6 +350,45 @@ models:
--no-warmup
--port ${PORT}
# Same 8B Instruct model/quant as "Qwen3-VL-8B-Instruct-GGUF"; only the
# sampler differs: top-p 0.8 (vs 0.85) and presence-penalty 0.6 (vs 0.4).
# NOTE(review): nesting reconstructed — child of the top-level `models:`
# mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-8B-Instruct-GGUF-unslothish":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: lines below are the command string — keep comments out of it.
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --temp 0.7
    --top-p 0.8
    --top-k 20
    --min-p 0.05
    --repeat-penalty 1.15
    --frequency-penalty 0.5
    --presence-penalty 0.6
    --no-warmup
    --port ${PORT}
# Qwen3-VL 2B Thinking variant at Q8_0. Sampler differs from the Instruct
# entries: temp 1.0, top-p 0.95, min-p 0.0, repeat-penalty 1.0 (i.e. off),
# and no --frequency-penalty flag — presumably the vendor-recommended
# thinking-model settings; confirm against the Qwen3-VL model card.
# NOTE(review): nesting reconstructed — child of the top-level `models:`
# mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-2B-Thinking-GGUF":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: lines below are the command string — keep comments out of it.
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --top-p 0.95
    --top-k 20
    --temp 1.0
    --min-p 0.0
    --repeat-penalty 1.0
    --presence-penalty 0.0
    --no-warmup
    --port ${PORT}
"Qwen3-VL-4B-Thinking-GGUF":
ttl: 600
cmd: |
@@ -308,3 +407,22 @@ models:
--presence-penalty 0.0
--no-warmup
--port ${PORT}
# Qwen3-VL 8B Thinking variant at Q4_K_M. Same sampler as the 2B Thinking
# entry (temp 1.0, top-p 0.95, min-p 0.0, penalties neutral, no
# --frequency-penalty flag).
# NOTE(review): nesting reconstructed — child of the top-level `models:`
# mapping (per the hunk header); confirm depth against the full file.
"Qwen3-VL-8B-Thinking-GGUF":
  # Presumably idle seconds before llama-swap unloads the model — confirm.
  ttl: 600
  # Block scalar: lines below are the command string — keep comments out of it.
  cmd: |
    /app/llama-server
    -hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
    --n-gpu-layers 99
    --ctx-size 12288
    --predict 4096
    --flash-attn auto
    --jinja
    --top-p 0.95
    --top-k 20
    --temp 1.0
    --min-p 0.0
    --repeat-penalty 1.0
    --presence-penalty 0.0
    --no-warmup
    --port ${PORT}