123 changes: 45 additions & 78 deletions docs/docs/concepts/services.md
@@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`

```yaml
type: service
- name: qwen397
+ name: qwen36

image: lmsysorg/sglang:v0.5.10.post1

commands:
- |
sglang serve \
-   --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+   --model-path Qwen/Qwen3.6-27B \
--host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
-   --reasoning-parser qwen3 \
-   --tool-call-parser qwen3_coder \
-   --enable-flashinfer-allreduce-fusion \
-   --mem-fraction-static 0.8
+   --mem-fraction-static 0.8 \
+   --context-length 262144 \
+   --reasoning-parser qwen3

port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B

volumes:
# Optional instance volume for model and runtime caches
@@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
optional: true

resources:
- cpu: x86:96..
- memory: 512GB..
- shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4
```

</div>
@@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`

```yaml
type: service
- name: qwen397
+ name: qwen36

- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
-
- env:
-   - HIP_FORCE_DEV_KERNARG=1
-   - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
-   - SGLANG_DISABLE_CUDNN_CHECK=1
-   - SGLANG_INT4_WEIGHT=0
-   - SGLANG_MOE_PADDING=1
-   - SGLANG_ROCM_DISABLE_LINEARQUANT=0
-   - SGLANG_ROCM_FUSED_DECODE_MLA=1
-   - SGLANG_SET_CPU_AFFINITY=1
-   - SGLANG_USE_AITER=1
-   - SGLANG_USE_ROCM700A=1
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x

commands:
- |
sglang serve \
-   --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+   --model-path Qwen/Qwen3.6-27B \
--host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
-   --reasoning-parser qwen3 \
-   --tool-call-parser qwen3_coder \
--mem-fraction-static 0.8 \
--context-length 262144 \
-   --attention-backend triton \
-   --disable-cuda-graph \
-   --fp8-gemm-backend aiter \
-   --port 30000
+   --reasoning-parser qwen3

port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B

volumes:
# Optional instance volume for model and runtime caches
@@ -101,30 +83,33 @@
optional: true

resources:
- cpu: x86:52..
- memory: 700GB..
+ cpu: 52..
+ memory: 896GB..
shm_size: 16GB
- disk: 600GB..
- gpu: MI300X:192GB:4
+ disk: 450GB..
+ gpu: MI300X:4
```

</div>

+ The first startup on MI300X can take longer while SGLang compiles ROCm kernels.

To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):

<div class="termy">

```shell
$ dstack apply -f .dstack.yml

- Submit the run qwen397? [y/n]: y
+ Submit the run qwen36? [y/n]: y

Provisioning...
---> 100%

Service is published at:
-   http://localhost:3000/proxy/services/main/qwen397/
- Model Qwen/Qwen3.5-397B-A17B-FP8 is published at:
+   http://localhost:3000/proxy/services/main/qwen36/
+ Model Qwen/Qwen3.6-27B is published at:
http://localhost:3000/proxy/models/main/
```
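
The published endpoint is OpenAI-compatible, so it can be exercised from any HTTP client. A minimal stdlib sketch of building such a request (the `DSTACK_TOKEN` environment variable and the `chat_request` helper are illustrative assumptions, not part of this change):

```python
import json
import os
import urllib.request

# Proxy URL layout taken from the `dstack apply` output above.
BASE = "http://localhost:3000/proxy/services/main/qwen36"

def chat_request(prompt: str) -> urllib.request.Request:
    """Build an OpenAI-compatible chat completion request for the service."""
    body = json.dumps({
        "model": "Qwen/Qwen3.6-27B",
        "messages": [{"role": "user", "content": prompt}],
    }).encode("utf-8")
    return urllib.request.Request(
        f"{BASE}/v1/chat/completions",
        data=body,  # setting data makes this a POST
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('DSTACK_TOKEN', '')}",
        },
    )

if __name__ == "__main__":
    # Sending the request requires a running dstack server and service.
    with urllib.request.urlopen(chat_request("Say hello.")) as resp:
        print(json.load(resp)["choices"][0]["message"]["content"])
```

The same request is shown with `curl` below; the Python version only differs in how the token and payload are assembled.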

@@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
<div class="termy">

```shell
- $ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \
+ $ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <dstack token>' \
-d '{
- "model": "Qwen/Qwen3.5-397B-A17B-FP8",
+ "model": "Qwen/Qwen3.6-27B",
"messages": [
{
"role": "user",
@@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules.

```yaml
type: service
- name: qwen397-service
+ name: qwen36-service

image: lmsysorg/sglang:v0.5.10.post1

commands:
- |
sglang serve \
-   --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+   --model-path Qwen/Qwen3.6-27B \
--host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
-   --reasoning-parser qwen3 \
-   --tool-call-parser qwen3_coder \
-   --enable-flashinfer-allreduce-fusion \
-   --mem-fraction-static 0.8
+   --mem-fraction-static 0.8 \
+   --context-length 262144

port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B

volumes:
# Optional instance volume for model and runtime caches
@@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules.
optional: true

resources:
- cpu: x86:96..
- memory: 512GB..
- shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4

replicas: 1..2
scaling:
@@ -258,38 +240,23 @@

```yaml
type: service
- name: qwen397-service
-
- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
+ name: qwen36-service

- env:
-   - HIP_FORCE_DEV_KERNARG=1
-   - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
-   - SGLANG_DISABLE_CUDNN_CHECK=1
-   - SGLANG_INT4_WEIGHT=0
-   - SGLANG_MOE_PADDING=1
-   - SGLANG_ROCM_DISABLE_LINEARQUANT=0
-   - SGLANG_ROCM_FUSED_DECODE_MLA=1
-   - SGLANG_SET_CPU_AFFINITY=1
-   - SGLANG_USE_AITER=1
-   - SGLANG_USE_ROCM700A=1
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x

commands:
- |
sglang serve \
-   --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+   --model-path Qwen/Qwen3.6-27B \
--host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
-   --reasoning-parser qwen3 \
-   --tool-call-parser qwen3_coder \
--mem-fraction-static 0.8 \
-   --context-length 262144 \
-   --attention-backend triton \
-   --disable-cuda-graph \
-   --fp8-gemm-backend aiter \
-   --port 30000
+   --context-length 262144

port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B

volumes:
# Optional instance volume for model and runtime caches
@@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules.
optional: true

resources:
- cpu: x86:52..
- memory: 700GB..
+ cpu: 52..
+ memory: 896GB..
shm_size: 16GB
- disk: 600GB..
- gpu: MI300X:192GB:4
+ disk: 450GB..
+ gpu: MI300X:4

replicas: 1..2
scaling:
53 changes: 25 additions & 28 deletions docs/docs/quickstart.md
@@ -219,27 +219,26 @@ description: Quick guide to creating fleets and submitting runs

```yaml
type: service
- name: llama31-service
-
- # If `image` is not specified, dstack uses its default image
- python: "3.11"
- #image: dstackai/base:py3.13-0.7-cuda-12.1
-
- # Required environment variables
- env:
-   - HF_TOKEN
+ name: qwen36-service

+ image: lmsysorg/sglang:v0.5.10.post1

commands:
- - pip install vllm
- - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096
- # Expose the vllm server port
- port: 8000
+ - |
+   sglang serve \
+     --model-path Qwen/Qwen3.6-27B \
+     --host 0.0.0.0 \
+     --port 30000 \
+     --reasoning-parser qwen3
+ # Expose the SGLang server port
+ port: 30000

# Specify a name if it's an OpenAI-compatible model
- model: meta-llama/Meta-Llama-3.1-8B-Instruct
+ model: Qwen/Qwen3.6-27B

# Required resources
resources:
- gpu: 24GB
+ gpu: H100
```

</div>
@@ -249,22 +248,20 @@ description: Quick guide to creating fleets and submitting runs
<div class="termy">

```shell
- $ HF_TOKEN=...
$ dstack apply -f service.dstack.yml

- #  BACKEND  REGION     INSTANCE       RESOURCES                    SPOT  PRICE
- 1  aws      us-west-2  g5.4xlarge     16xCPU, 64GB, 1xA10G (24GB)  yes   $0.22
- 2  aws      us-east-2  g6.xlarge      4xCPU, 16GB, 1xL4 (24GB)     yes   $0.27
- 3  gcp      us-west1   g2-standard-4  4xCPU, 16GB, 1xL4 (24GB)     yes   $0.27
-
- Submit the run llama31-service? [y/n]: y
-
- Provisioning `llama31-service`...
+ #  BACKEND  REGION     INSTANCE               RESOURCES                      SPOT  PRICE
+ 1  nebius   eu-north1  gpu-h100-sxm           16xCPU, 250GB, 1xH100 (80GB)   no    $2.95
+ 2  runpod   US-CA-2    NVIDIA H100 80GB HBM3  64xCPU, 1004GB, 1xH100 (80GB)  no    $2.99
+
+ Submit the run qwen36-service? [y/n]: y
+
+ Provisioning `qwen36-service`...
---> 100%

Service is published at:
-   http://localhost:3000/proxy/services/main/llama31-service/
- Model meta-llama/Meta-Llama-3.1-8B-Instruct is published at:
+   http://localhost:3000/proxy/services/main/qwen36-service/
+ Model Qwen/Qwen3.6-27B is published at:
http://localhost:3000/proxy/models/main/
```
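
Because the published endpoint is OpenAI-compatible, a quick sanity check after `dstack apply` is to list the served models. A stdlib sketch, under the assumptions that the proxy implements the standard `/v1/models` route and that the user token lives in a `DSTACK_TOKEN` environment variable (both assumptions, not part of this change):

```python
import json
import os
import urllib.request

# Model-proxy URL taken from the run output above.
MODELS_URL = "http://localhost:3000/proxy/models/main/v1/models"

def models_request() -> urllib.request.Request:
    """Build a GET request for the model list behind the dstack proxy."""
    return urllib.request.Request(
        MODELS_URL,
        headers={"Authorization": f"Bearer {os.environ.get('DSTACK_TOKEN', '')}"},
    )

if __name__ == "__main__":
    # Requires a running dstack server with the service deployed.
    with urllib.request.urlopen(models_request()) as resp:
        print([m["id"] for m in json.load(resp)["data"]])
```

If the deployment succeeded, the returned list should contain the `model` name from the configuration.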

21 changes: 18 additions & 3 deletions docs/examples.md
@@ -1,6 +1,6 @@
---
title: Examples
- description: Collection of examples for training, inference, and clusters
+ description: Collection of examples for models, training, inference, and clusters
#template: examples.html
hide:
# - navigation
@@ -153,7 +153,7 @@ hide:
SGLang
</h3>
<p>
- Deploy DeepSeek distilled models with SGLang
+ Deploy Qwen3.6-27B with SGLang
</p>
</a>
<a href="/examples/inference/vllm"
@@ -162,7 +162,7 @@ hide:
vLLM
</h3>
<p>
- Deploy Llama 3.1 with vLLM
+ Deploy Qwen3.6-27B with vLLM
</p>
</a>
<a href="/examples/inference/nim"
@@ -185,6 +185,21 @@ hide:
</a>
</div>

+ ## Models
+
+ <div class="tx-landing__highlights_grid">
+   <a href="/examples/models/qwen36"
+      class="feature-cell">
+     <h3>
+       Qwen 3.6
+     </h3>
+
+     <p>
+       Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD
+     </p>
+   </a>
+ </div>

## Accelerators

<div class="tx-landing__highlights_grid">