diff --git a/apps/librechat/release.yaml b/apps/librechat/release.yaml
index a7d5e58..04d5272 100644
--- a/apps/librechat/release.yaml
+++ b/apps/librechat/release.yaml
@@ -58,9 +58,15 @@ spec:
             "Qwen2.5-VL-7B-Instruct-GGUF",
             "Qwen2.5-VL-32B-Instruct-GGUF-IQ1_S",
             "Qwen2.5-VL-32B-Instruct-GGUF-Q2_K_L",
+            "Qwen3-VL-2B-Instruct-GGUF",
+            "Qwen3-VL-2B-Instruct-GGUF-unslothish",
+            "Qwen3-VL-2B-Thinking-GGUF",
             "Qwen3-VL-4B-Instruct-GGUF",
             "Qwen3-VL-4B-Instruct-GGUF-unslothish",
-            "Qwen3-VL-4B-Thinking-GGUF"
+            "Qwen3-VL-4B-Thinking-GGUF",
+            "Qwen3-VL-8B-Instruct-GGUF",
+            "Qwen3-VL-8B-Instruct-GGUF-unslothish",
+            "Qwen3-VL-8B-Thinking-GGUF"
           ]
           titleConvo: true
           titleModel: "gemma3-4b-novision"
diff --git a/apps/llama/configs/config.yaml b/apps/llama/configs/config.yaml
index fd0c7b9..50c7d63 100644
--- a/apps/llama/configs/config.yaml
+++ b/apps/llama/configs/config.yaml
@@ -250,6 +250,26 @@ models:
       --no-warmup
       --port ${PORT}
 
+  "Qwen3-VL-2B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --temp 0.7
+      --top-p 0.85
+      --top-k 20
+      --min-p 0.05
+      --repeat-penalty 1.15
+      --frequency-penalty 0.5
+      --presence-penalty 0.4
+      --no-warmup
+      --port ${PORT}
+
   "Qwen3-VL-4B-Instruct-GGUF":
     ttl: 600
     cmd: |
@@ -270,6 +290,46 @@ models:
       --no-warmup
       --port ${PORT}
 
+  "Qwen3-VL-8B-Instruct-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --temp 0.7
+      --top-p 0.85
+      --top-k 20
+      --min-p 0.05
+      --repeat-penalty 1.15
+      --frequency-penalty 0.5
+      --presence-penalty 0.4
+      --no-warmup
+      --port ${PORT}
+
+  "Qwen3-VL-2B-Instruct-GGUF-unslothish":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-2B-Instruct-GGUF:Q8_0
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --temp 0.7
+      --top-p 0.8
+      --top-k 20
+      --min-p 0.05
+      --repeat-penalty 1.15
+      --frequency-penalty 0.5
+      --presence-penalty 0.6
+      --no-warmup
+      --port ${PORT}
+
   "Qwen3-VL-4B-Instruct-GGUF-unslothish":
     ttl: 600
     cmd: |
@@ -290,6 +350,45 @@ models:
       --no-warmup
       --port ${PORT}
 
+  "Qwen3-VL-8B-Instruct-GGUF-unslothish":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --temp 0.7
+      --top-p 0.8
+      --top-k 20
+      --min-p 0.05
+      --repeat-penalty 1.15
+      --frequency-penalty 0.5
+      --presence-penalty 0.6
+      --no-warmup
+      --port ${PORT}
+
+  "Qwen3-VL-2B-Thinking-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-2B-Thinking-GGUF:Q8_0
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --top-p 0.95
+      --top-k 20
+      --temp 1.0
+      --min-p 0.0
+      --repeat-penalty 1.0
+      --presence-penalty 0.0
+      --no-warmup
+      --port ${PORT}
+
   "Qwen3-VL-4B-Thinking-GGUF":
     ttl: 600
     cmd: |
@@ -308,3 +407,22 @@ models:
       --presence-penalty 0.0
       --no-warmup
       --port ${PORT}
+
+  "Qwen3-VL-8B-Thinking-GGUF":
+    ttl: 600
+    cmd: |
+      /app/llama-server
+      -hf Qwen/Qwen3-VL-8B-Thinking-GGUF:Q4_K_M
+      --n-gpu-layers 99
+      --ctx-size 12288
+      --predict 4096
+      --flash-attn auto
+      --jinja
+      --top-p 0.95
+      --top-k 20
+      --temp 1.0
+      --min-p 0.0
+      --repeat-penalty 1.0
+      --presence-penalty 0.0
+      --no-warmup
+      --port ${PORT}