Add Qwen3-4B-2507 models (Thinking and Instruct variants)

2025-08-16 23:26:51 +02:00
parent 83e5cada3f
commit 9c61d47fda
2 changed files with 62 additions and 4 deletions
@@ -72,12 +72,14 @@ spec:
                  "gemma3-12b-q2",
                  "gemma3-12b-novision",
                  "gemma3-4b",
-                  "gemma3-4b-novision"
+                  "gemma3-4b-novision",
                  "Qwen3-4B-Thinking-2507",
                  "Qwen3-4B-Thinking-2507-long-ctx"
                ]
              titleConvo: true
-              titleModel: "current_model"
+              titleModel: "gemma3-4b-novision"
              summarize: false
-              summaryModel: "current_model"
+              summaryModel: "gemma3-4b-novision"
              forcePrompt: false
              modelDisplayLabel: "Llama.cpp"
      imageVolume:
@@ -26,7 +26,7 @@ models:
      --jinja --chat-template-file /config/qwen_nothink_chat_template.jinja
      --no-warmup
      --port ${PORT}
-  "gemma3n-e3b":
+  "gemma3n-e4b":
    ttl: 600
    cmd: |
      /app/llama-server
@@ -119,3 +119,59 @@ models:
      --no-mmproj
      --no-warmup
      --port ${PORT}
  "Qwen3-4B-Thinking-2507":
    ttl: 600
    cmd: |
      /app/llama-server
      -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
      -ngl 99 -c 16384 --predict 8192
      --temp 0.6
      --min-p 0.00
      --top-p 0.95
      --top-k 20
      --repeat-penalty 1.0
      --no-warmup
      --port ${PORT}
  "Qwen3-4B-Thinking-2507-long-ctx":
    ttl: 600
    cmd: |
      /app/llama-server
      -hf unsloth/Qwen3-4B-Thinking-2507-GGUF:Q4_K_M
      -ngl 99 -c 262144 --predict 81920
      --temp 0.6
      --min-p 0.00
      --top-p 0.95
      --top-k 20
      --repeat-penalty 1.0
      --no-warmup
      --flash-attn
      --cache-type-k q8_0 --cache-type-v q8_0
      --port ${PORT}
  "Qwen3-4B-Instruct-2507":
    ttl: 600
    cmd: |
      /app/llama-server
      -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
      -ngl 99 -c 16384 --predict 8192
      --temp 0.7
      --min-p 0.00
      --top-p 0.8
      --top-k 20
      --repeat-penalty 1.0
      --no-warmup
      --port ${PORT}
  "Qwen3-4B-Instruct-2507-long-ctx":
    ttl: 600
    cmd: |
      /app/llama-server
      -hf unsloth/Qwen3-4B-Instruct-2507-GGUF:Q4_K_M
      -ngl 99 -c 262144 --predict 81920
      --temp 0.7
      --min-p 0.00
      --top-p 0.8
      --top-k 20
      --repeat-penalty 1.0
      --no-warmup
      --flash-attn
      --cache-type-k q8_0 --cache-type-v q8_0
      --port ${PORT}