diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md
index fd0d2a2dc..f1a88e3dd 100644
--- a/docs/docs/concepts/services.md
+++ b/docs/docs/concepts/services.md
@@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
```yaml
type: service
- name: qwen397
+ name: qwen36
image: lmsysorg/sglang:v0.5.10.post1
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
- --reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
- --enable-flashinfer-allreduce-fusion \
- --mem-fraction-static 0.8
+ --mem-fraction-static 0.8 \
+ --context-length 262144 \
+ --reasoning-parser qwen3
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
# Optional instance volume for model and runtime caches
@@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
optional: true
resources:
- cpu: x86:96..
- memory: 512GB..
shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4
```
@@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
```yaml
type: service
- name: qwen397
+ name: qwen36
- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
-
- env:
- - HIP_FORCE_DEV_KERNARG=1
- - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
- - SGLANG_DISABLE_CUDNN_CHECK=1
- - SGLANG_INT4_WEIGHT=0
- - SGLANG_MOE_PADDING=1
- - SGLANG_ROCM_DISABLE_LINEARQUANT=0
- - SGLANG_ROCM_FUSED_DECODE_MLA=1
- - SGLANG_SET_CPU_AFFINITY=1
- - SGLANG_USE_AITER=1
- - SGLANG_USE_ROCM700A=1
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
--tp $DSTACK_GPUS_NUM \
- --reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
--mem-fraction-static 0.8 \
--context-length 262144 \
- --attention-backend triton \
- --disable-cuda-graph \
- --fp8-gemm-backend aiter \
- --port 30000
+ --reasoning-parser qwen3
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
# Optional instance volume for model and runtime caches
@@ -101,15 +83,18 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml`
optional: true
resources:
- cpu: x86:52..
- memory: 700GB..
+ cpu: 52..
+ memory: 896GB..
shm_size: 16GB
- disk: 600GB..
- gpu: MI300X:192GB:4
+ disk: 450GB..
+ gpu: MI300X:4
```
+The first startup on MI300X can take longer while SGLang compiles ROCm
+kernels.
+
To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
@@ -117,14 +102,14 @@ To run a service, pass the configuration to [`dstack apply`](../reference/cli/ds
```shell
$ dstack apply -f .dstack.yml
-Submit the run qwen397? [y/n]: y
+Submit the run qwen36? [y/n]: y
Provisioning...
---> 100%
Service is published at:
- http://localhost:3000/proxy/services/main/qwen397/
-Model Qwen/Qwen3.5-397B-A17B-FP8 is published at:
+ http://localhost:3000/proxy/services/main/qwen36/
+Model Qwen/Qwen3.6-27B is published at:
http://localhost:3000/proxy/models/main/
```
@@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
```shell
-$ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \
+$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <dstack token>' \
-d '{
- "model": "Qwen/Qwen3.5-397B-A17B-FP8",
+ "model": "Qwen/Qwen3.6-27B",
"messages": [
{
"role": "user",
@@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
```yaml
type: service
- name: qwen397-service
+ name: qwen36-service
image: lmsysorg/sglang:v0.5.10.post1
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
--reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
- --enable-flashinfer-allreduce-fusion \
- --mem-fraction-static 0.8
+ --mem-fraction-static 0.8 \
+ --context-length 262144
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
# Optional instance volume for model and runtime caches
@@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules.
optional: true
resources:
- cpu: x86:96..
- memory: 512GB..
shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4
replicas: 1..2
scaling:
@@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules.
```yaml
type: service
- name: qwen397-service
-
- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
+ name: qwen36-service
- env:
- - HIP_FORCE_DEV_KERNARG=1
- - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
- - SGLANG_DISABLE_CUDNN_CHECK=1
- - SGLANG_INT4_WEIGHT=0
- - SGLANG_MOE_PADDING=1
- - SGLANG_ROCM_DISABLE_LINEARQUANT=0
- - SGLANG_ROCM_FUSED_DECODE_MLA=1
- - SGLANG_SET_CPU_AFFINITY=1
- - SGLANG_USE_AITER=1
- - SGLANG_USE_ROCM700A=1
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
--tp $DSTACK_GPUS_NUM \
--reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
--mem-fraction-static 0.8 \
- --context-length 262144 \
- --attention-backend triton \
- --disable-cuda-graph \
- --fp8-gemm-backend aiter \
- --port 30000
+ --context-length 262144
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
# Optional instance volume for model and runtime caches
@@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules.
optional: true
resources:
- cpu: x86:52..
- memory: 700GB..
+ cpu: 52..
+ memory: 896GB..
shm_size: 16GB
- disk: 600GB..
- gpu: MI300X:192GB:4
+ disk: 450GB..
+ gpu: MI300X:4
replicas: 1..2
scaling:
diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md
index 4cdecae5e..80a98f79b 100644
--- a/docs/docs/quickstart.md
+++ b/docs/docs/quickstart.md
@@ -219,27 +219,26 @@ description: Quick guide to creating fleets and submitting runs
```yaml
type: service
- name: llama31-service
-
- # If `image` is not specified, dstack uses its default image
- python: "3.11"
- #image: dstackai/base:py3.13-0.7-cuda-12.1
-
- # Required environment variables
- env:
- - HF_TOKEN
+ name: qwen36-service
+
+ image: lmsysorg/sglang:v0.5.10.post1
+
commands:
- - pip install vllm
- - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096
- # Expose the vllm server port
- port: 8000
+ - |
+ sglang serve \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --reasoning-parser qwen3
+ # Expose the SGLang server port
+ port: 30000
# Specify a name if it's an OpenAI-compatible model
- model: meta-llama/Meta-Llama-3.1-8B-Instruct
-
+ model: Qwen/Qwen3.6-27B
+
# Required resources
resources:
- gpu: 24GB
+ gpu: H100
```
@@ -249,22 +248,20 @@ description: Quick guide to creating fleets and submitting runs
```shell
- $ HF_TOKEN=...
$ dstack apply -f service.dstack.yml
-
- # BACKEND REGION INSTANCE RESOURCES SPOT PRICE
- 1 aws us-west-2 g5.4xlarge 16xCPU, 64GB, 1xA10G (24GB) yes $0.22
- 2 aws us-east-2 g6.xlarge 4xCPU, 16GB, 1xL4 (24GB) yes $0.27
- 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB) yes $0.27
-
- Submit the run llama31-service? [y/n]: y
-
- Provisioning `llama31-service`...
+
+ # BACKEND REGION INSTANCE RESOURCES SPOT PRICE
+ 1 nebius eu-north1 gpu-h100-sxm 16xCPU, 250GB, 1xH100 (80GB) no $2.95
+ 2 runpod US-CA-2 NVIDIA H100 80GB HBM3 64xCPU, 1004GB, 1xH100 (80GB) no $2.99
+
+ Submit the run qwen36-service? [y/n]: y
+
+ Provisioning `qwen36-service`...
---> 100%
Service is published at:
- http://localhost:3000/proxy/services/main/llama31-service/
- Model meta-llama/Meta-Llama-3.1-8B-Instruct is published at:
+ http://localhost:3000/proxy/services/main/qwen36-service/
+ Model Qwen/Qwen3.6-27B is published at:
http://localhost:3000/proxy/models/main/
```
diff --git a/docs/examples.md b/docs/examples.md
index 04cd5ff0f..b3e3e0d42 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -1,6 +1,6 @@
---
title: Examples
-description: Collection of examples for training, inference, and clusters
+description: Collection of examples for models, training, inference, and clusters
#template: examples.html
hide:
# - navigation
@@ -153,7 +153,7 @@ hide:
SGLang
- Deploy DeepSeek distilled models with SGLang
+ Deploy Qwen3.6-27B with SGLang
- Deploy Llama 3.1 with vLLM
+ Deploy Qwen3.6-27B with vLLM
+## Models
+
+
+
## Accelerators
diff --git a/docs/examples/models/qwen36/index.md b/docs/examples/models/qwen36/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md
index 36be8044e..b35b29c1c 100644
--- a/examples/accelerators/amd/README.md
+++ b/examples/accelerators/amd/README.md
@@ -1,6 +1,6 @@
---
title: AMD
-description: Deploying and fine-tuning models on AMD MI300X GPUs using vLLM, TRL, and Axolotl
+description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, vLLM, TRL, and Axolotl
---
# AMD
@@ -11,8 +11,49 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the
## Deployment
-vLLM supports AMD GPUs. Here's an example of a [service](https://dstack.ai/docs/services) that deploys
-Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html).
+Here are examples of [services](https://dstack.ai/docs/services) that deploy
+`Qwen/Qwen3.6-27B` on AMD MI300X GPUs using
+[SGLang](https://github.com/sgl-project/sglang) and
+[vLLM](https://docs.vllm.ai/en/latest/).
+
+=== "SGLang"
+
+
+
+ ```yaml
+ type: service
+ name: qwen36-service-sglang-amd
+
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
+
+ commands:
+ - |
+ sglang serve \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --tp $DSTACK_GPUS_NUM \
+ --reasoning-parser qwen3 \
+ --mem-fraction-static 0.8 \
+ --context-length 262144
+
+ port: 30000
+ model: Qwen/Qwen3.6-27B
+
+ volumes:
+ - instance_path: /root/.cache
+ path: /root/.cache
+ optional: true
+
+ resources:
+ cpu: 52..
+ memory: 896GB..
+ shm_size: 16GB
+ disk: 450GB..
+ gpu: MI300X:4
+ ```
+
+
=== "vLLM"
@@ -20,65 +61,49 @@ Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_starte
```yaml
type: service
- name: llama31-service-vllm-amd
+ name: qwen36-service-vllm-amd
+
+ image: vllm/vllm-openai-rocm:v0.19.1
- # Using Runpod's ROCm Docker image
- image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04
- # Required environment variables
- env:
- - HF_TOKEN
- - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct
- - MAX_MODEL_LEN=126192
- # Commands of the task
commands:
- - export PATH=/opt/conda/envs/py_3.10/bin:$PATH
- - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip
- - unzip rocm-6.1.0.zip
- - cd hipBLAS-rocm-6.1.0
- - python rmake.py
- - cd ..
- - git clone https://github.com/vllm-project/vllm.git
- - cd vllm
- - pip install triton
- - pip uninstall torch -y
- - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1
- - pip install /opt/rocm/share/amd_smi
- - pip install --upgrade numba scipy huggingface-hub[cli]
- - pip install "numpy<2"
- - pip install -r requirements-rocm.txt
- - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
- - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
- - export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
- - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl
- - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl
- - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000
- # Service port
+ - |
+ vllm serve Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 8000 \
+ --tensor-parallel-size $DSTACK_GPUS_NUM \
+ --max-model-len 262144 \
+ --reasoning-parser qwen3
+
port: 8000
- # Register the model
- model: meta-llama/Meta-Llama-3.1-70B-Instruct
+ model: Qwen/Qwen3.6-27B
- # Uncomment to leverage spot instances
- #spot_policy: auto
+ volumes:
+ - instance_path: /root/.cache
+ path: /root/.cache
+ optional: true
resources:
- gpu: MI300X
- disk: 200GB
+ cpu: 52..
+ memory: 896GB..
+ shm_size: 16GB
+ disk: 450GB..
+ gpu: MI300X:4
```
- Note, maximum size of vLLM’s `KV cache` is 126192, consequently we must set `MAX_MODEL_LEN` to 126192. Adding `/opt/conda/envs/py_3.10/bin` to PATH ensures we use the Python 3.10 environment necessary for the pre-built binaries compiled specifically for this version.
-
- > To speed up the `vLLM-ROCm` installation, this example uses a pre-built binary from S3.
-
!!! info "Docker image"
- If you want to use AMD, specifying `image` is currently required. This must be an image that includes
- ROCm drivers.
+ AMD deployments require specifying an image that already includes ROCm
+ drivers. The SGLang and vLLM examples above use pinned ROCm images.
To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`.
## Fine-tuning
+> If you're planning multi-node AMD training, validate cluster networking first
+> with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/)
+> example.
+
=== "TRL"
Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html)
@@ -189,28 +214,30 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by
## Running a configuration
-Once a configuration is ready, save it to a `.dstack.yml` file, then run
-`dstack apply -f `, and `dstack` will automatically provision the
-cloud resources and run the configuration.
+Once a configuration is ready, save it to a `.dstack.yml` file. If your
+configuration references environment variables such as `HF_TOKEN` or
+`WANDB_API_KEY`, export them first. Then run
+`dstack apply -f `, and `dstack` will automatically
+provision the cloud resources and run the configuration.
```shell
-$ HF_TOKEN=...
-$ WANDB_API_KEY=...
-$ WANDB_PROJECT=...
-$ WANDB_NAME=axolotl-amd-llama31-train
-$ HUB_MODEL_ID=...
-$ dstack apply -f service.dstack.yml
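+# Set any environment variables the configuration references first, e.g.:
+$ HF_TOKEN=...
+$ WANDB_API_KEY=...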
+$ dstack apply -f
```
## What's next?
-1. Browse [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm),
+1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/)
+ and [vLLM](https://dstack.ai/examples/inference/vllm/) examples, plus
[Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl),
- [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and
- [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes)
-2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and
+ [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html),
+ and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes)
+2. For multi-node training, run
+ [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/)
+ to validate AMD cluster networking.
+3. Check [dev environments](https://dstack.ai/docs/dev-environments),
+ [tasks](https://dstack.ai/docs/tasks), and
[services](https://dstack.ai/docs/services).
diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md
index 9d08fe09c..ddf49fd31 100644
--- a/examples/inference/sglang/README.md
+++ b/examples/inference/sglang/README.md
@@ -1,41 +1,41 @@
---
title: SGLang
-description: Deploying Qwen3.5-397B-A17B-FP8 using SGLang on NVIDIA and AMD GPUs
+description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs
---
# SGLang
-This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using
+This example shows how to deploy `Qwen/Qwen3.6-27B` using
[SGLang](https://github.com/sgl-project/sglang) and `dstack`.
## Apply a configuration
Here's an example of a service that deploys
-`Qwen/Qwen3.5-397B-A17B-FP8` using SGLang.
+`Qwen/Qwen3.6-27B` using SGLang.
=== "NVIDIA"
-
+
```yaml
type: service
- name: qwen397
+ name: qwen36
image: lmsysorg/sglang:v0.5.10.post1
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
--port 30000 \
--tp $DSTACK_GPUS_NUM \
--reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
- --enable-flashinfer-allreduce-fusion \
- --mem-fraction-static 0.8
+ --mem-fraction-static 0.8 \
+ --context-length 262144
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
- instance_path: /root/.cache
@@ -43,52 +43,34 @@ Here's an example of a service that deploys
optional: true
resources:
- cpu: x86:96..
- memory: 512GB..
shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4
```
=== "AMD"
-
+
```yaml
type: service
- name: qwen397
-
- image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x
+ name: qwen36
- env:
- - HIP_FORCE_DEV_KERNARG=1
- - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
- - SGLANG_DISABLE_CUDNN_CHECK=1
- - SGLANG_INT4_WEIGHT=0
- - SGLANG_MOE_PADDING=1
- - SGLANG_ROCM_DISABLE_LINEARQUANT=0
- - SGLANG_ROCM_FUSED_DECODE_MLA=1
- - SGLANG_SET_CPU_AFFINITY=1
- - SGLANG_USE_AITER=1
- - SGLANG_USE_ROCM700A=1
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
commands:
- |
sglang serve \
- --model-path Qwen/Qwen3.5-397B-A17B-FP8 \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
--tp $DSTACK_GPUS_NUM \
--reasoning-parser qwen3 \
- --tool-call-parser qwen3_coder \
--mem-fraction-static 0.8 \
- --context-length 262144 \
- --attention-backend triton \
- --disable-cuda-graph \
- --fp8-gemm-backend aiter \
- --port 30000
+ --context-length 262144
port: 30000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
- instance_path: /root/.cache
@@ -96,24 +78,26 @@ Here's an example of a service that deploys
optional: true
resources:
- cpu: x86:52..
- memory: 700GB..
+ cpu: 52..
+ memory: 896GB..
shm_size: 16GB
- disk: 600GB..
- gpu: MI300X:192GB:4
+ disk: 450GB..
+ gpu: MI300X:4
```
-The AMD example uses the exact validated MI300X configuration for this model,
-including the ROCm/AITER settings required for stable FP8 serving.
+The AMD example keeps the deployment close to the upstream Qwen and SGLang
+guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the
+standard `qwen3` reasoning parser without extra ROCm-specific tuning flags.
+The first startup on MI300X can take longer while SGLang compiles ROCm kernels.
-Save one of the configurations above as `qwen397.dstack.yml`, then use the
+Save one of the configurations above as `qwen36.dstack.yml`, then use the
[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell
-$ dstack apply -f qwen397.dstack.yml
+$ dstack apply -f qwen36.dstack.yml
```
@@ -123,26 +107,29 @@ If no gateway is created, the service endpoint will be available at `
```shell
-curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
-X POST \
-H 'Authorization: Bearer <dstack token>' \
-H 'Content-Type: application/json' \
-d '{
- "model": "Qwen/Qwen3.5-397B-A17B-FP8",
+ "model": "Qwen/Qwen3.6-27B",
"messages": [
{
"role": "user",
"content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount."
}
],
- "chat_template_kwargs": {"enable_thinking": true},
"separate_reasoning": true,
"max_tokens": 1024
}'
```
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`.
+Qwen3.6 uses thinking mode by default. To disable thinking, pass
+`"chat_template_kwargs": {"enable_thinking": false}` in the request body. To
+enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command.
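+
+For example, a request that turns thinking off for a single call might look
+like this (same endpoint and token placeholder as above):
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+  -X POST \
+  -H 'Authorization: Bearer <dstack token>' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen3.6-27B",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Summarize the benefits of container images in one sentence."
+      }
+    ],
+    "max_tokens": 256,
+    "chat_template_kwargs": {"enable_thinking": false}
+  }'
+```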
+
+> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`.
## Configuration options
@@ -232,4 +219,4 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics
## What's next?
1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways)
-2. Browse the [Qwen 3.5 SGLang cookbook](https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html)
+2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html)
diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md
index 7497af669..75d6add9b 100644
--- a/examples/inference/vllm/README.md
+++ b/examples/inference/vllm/README.md
@@ -1,39 +1,39 @@
---
title: vLLM
-description: Deploying Qwen3.5-397B-A17B-FP8 using vLLM on NVIDIA GPUs
+description: Deploying Qwen3.6-27B using vLLM on NVIDIA and AMD GPUs
---
# vLLM
-This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using
+This example shows how to deploy `Qwen/Qwen3.6-27B` using
[vLLM](https://docs.vllm.ai/en/latest/) and `dstack`.
## Apply a configuration
Here's an example of a service that deploys
-`Qwen/Qwen3.5-397B-A17B-FP8` using vLLM.
+`Qwen/Qwen3.6-27B` using vLLM.
=== "NVIDIA"
-
+
```yaml
type: service
- name: qwen397
+ name: qwen36
image: vllm/vllm-openai:v0.19.1
commands:
- |
- vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \
+ vllm serve Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size $DSTACK_GPUS_NUM \
--max-model-len 262144 \
- --reasoning-parser qwen3 \
- --language-model-only
+ --reasoning-parser qwen3
port: 8000
- model: Qwen/Qwen3.5-397B-A17B-FP8
+ model: Qwen/Qwen3.6-27B
volumes:
- instance_path: /root/.cache
@@ -41,26 +41,60 @@ Here's an example of a service that deploys
optional: true
resources:
- cpu: x86:96..
- memory: 512GB..
shm_size: 16GB
- disk: 500GB..
- gpu: H100:80GB:8
+ gpu: H100:4
```
-The NVIDIA example serves `Qwen/Qwen3.5-397B-A17B-FP8` on `8x H100` GPUs using
-vLLM with tensor parallelism enabled. It uses `--language-model-only` because
-`Qwen/Qwen3.5-397B-A17B-FP8` is a text-only model.
+=== "AMD"
-Save the configuration above as `qwen397.dstack.yml`, then use the
+
+
+ ```yaml
+ type: service
+ name: qwen36
+
+ image: vllm/vllm-openai-rocm:v0.19.1
+
+ commands:
+ - |
+ vllm serve Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 8000 \
+ --tensor-parallel-size $DSTACK_GPUS_NUM \
+ --max-model-len 262144 \
+ --reasoning-parser qwen3
+
+ port: 8000
+ model: Qwen/Qwen3.6-27B
+
+ volumes:
+ - instance_path: /root/.cache
+ path: /root/.cache
+ optional: true
+
+ resources:
+ cpu: 52..
+ memory: 896GB..
+ shm_size: 16GB
+ disk: 450GB..
+ gpu: MI300X:4
+ ```
+
+
+
+Qwen3.6-27B is a multimodal model. For text-only workloads, add
+`--language-model-only` to free more memory for the KV cache. To enable tool
+calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`.
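+
+For example, assuming the serve command includes those tool-calling flags, a
+request that exposes a hypothetical `get_weather` function could look like
+this:
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+  -X POST \
+  -H 'Authorization: Bearer <dstack token>' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen3.6-27B",
+    "messages": [
+      {"role": "user", "content": "What is the weather in Paris right now?"}
+    ],
+    "tools": [
+      {
+        "type": "function",
+        "function": {
+          "name": "get_weather",
+          "description": "Get the current weather for a city",
+          "parameters": {
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"]
+          }
+        }
+      }
+    ],
+    "tool_choice": "auto"
+  }'
+```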
+
+Save one of the configurations above as `qwen36.dstack.yml`, then use the
[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell
-$ dstack apply -f qwen397.dstack.yml
+$ dstack apply -f qwen36.dstack.yml
```
@@ -70,12 +104,12 @@ If no gateway is created, the service endpoint will be available at `
```shell
-curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
-X POST \
-H 'Authorization: Bearer <dstack token>' \
-H 'Content-Type: application/json' \
-d '{
- "model": "Qwen/Qwen3.5-397B-A17B-FP8",
+ "model": "Qwen/Qwen3.6-27B",
"messages": [
{
"role": "user",
@@ -88,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`.
+> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`.
## What's next?
1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways)
-2. Browse the [SGLang](https://dstack.ai/examples/inference/sglang/) and [NIM](https://dstack.ai/examples/inference/nim/) examples
+2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](https://dstack.ai/examples/inference/sglang/) example
diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md
new file mode 100644
index 000000000..bc92271b2
--- /dev/null
+++ b/examples/models/qwen36/README.md
@@ -0,0 +1,168 @@
+---
+title: Qwen 3.6
+description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs
+---
+
+# Qwen 3.6
+
+This example shows how to deploy `Qwen/Qwen3.6-27B` as a
+[service](https://dstack.ai/docs/services) using
+[SGLang](https://github.com/sgl-project/sglang) and `dstack`.
+
+## Apply a configuration
+
+Save one of the following configurations as `qwen36.dstack.yml`.
+
+=== "NVIDIA"
+
+
+
+ ```yaml
+ type: service
+ name: qwen36
+
+ image: lmsysorg/sglang:v0.5.10.post1
+
+ commands:
+ - |
+ sglang serve \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --tp $DSTACK_GPUS_NUM \
+ --reasoning-parser qwen3 \
+ --mem-fraction-static 0.8 \
+ --context-length 262144
+
+ port: 30000
+ model: Qwen/Qwen3.6-27B
+
+ volumes:
+ - instance_path: /root/.cache
+ path: /root/.cache
+ optional: true
+
+ resources:
+ shm_size: 16GB
+ gpu: H100:4
+ ```
+
+
+
+=== "AMD"
+
+
+
+ ```yaml
+ type: service
+ name: qwen36
+
+ image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
+
+ commands:
+ - |
+ sglang serve \
+ --model-path Qwen/Qwen3.6-27B \
+ --host 0.0.0.0 \
+ --port 30000 \
+ --tp $DSTACK_GPUS_NUM \
+ --reasoning-parser qwen3 \
+ --mem-fraction-static 0.8 \
+ --context-length 262144
+
+ port: 30000
+ model: Qwen/Qwen3.6-27B
+
+ volumes:
+ - instance_path: /root/.cache
+ path: /root/.cache
+ optional: true
+
+ resources:
+ cpu: 52..
+ memory: 896GB..
+ shm_size: 16GB
+ disk: 450GB..
+ gpu: MI300X:4
+ ```
+
+
+
+The NVIDIA and AMD configurations above use pinned SGLang images and the same
+straightforward 4-GPU layout used across the Qwen 3.6 docs and examples.
+
+Apply the configuration with
+[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md).
+
+
+
+```shell
+$ dstack apply -f qwen36.dstack.yml
+```
+
+
+
+If no gateway is created, the service endpoint will be available at
+`/proxy/services///`.
+
+
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+ -X POST \
+ -H 'Authorization: Bearer <dstack token>' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "Qwen/Qwen3.6-27B",
+ "messages": [
+ {
+ "role": "user",
+ "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount."
+ }
+ ],
+ "max_tokens": 1024
+ }'
+```
+
+
+
+## Thinking mode
+
+Qwen3.6 uses thinking mode by default. With SGLang, the reasoning stream is
+returned separately as `reasoning_content`.
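+
+An abridged response then has roughly this shape (illustrative only; values
+truncated):
+
+```json
+{
+  "choices": [
+    {
+      "message": {
+        "role": "assistant",
+        "reasoning_content": "Let me work through the problem step by step...",
+        "content": "$0.05"
+      },
+      "finish_reason": "stop"
+    }
+  ]
+}
+```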
+
+To disable thinking, pass `chat_template_kwargs` in the request body.
+
+
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+ -X POST \
+ -H 'Authorization: Bearer <dstack token>' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "model": "Qwen/Qwen3.6-27B",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Summarize the benefits of container images in one sentence."
+ }
+ ],
+ "max_tokens": 256,
+ "chat_template_kwargs": {
+ "enable_thinking": false
+ }
+ }'
+```
+
+
+
+## What's next?
+
+1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B)
+2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6)
+3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html)
+4. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/)
+ and [vLLM](https://dstack.ai/examples/inference/vllm/) examples
+5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for
+ more AMD deployment and training configurations
diff --git a/mkdocs.yml b/mkdocs.yml
index 8dbe0ad85..1baa53015 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -100,8 +100,10 @@ plugins:
"docs/fleets.md": "docs/concepts/fleets.md"
"docs/examples/llms/llama31.md": "examples/inference/vllm/index.md"
"docs/examples/llms/llama32.md": "examples/inference/vllm/index.md"
+ "docs/examples/llms/qwen36.md": "examples/models/qwen36/index.md"
"examples/llms/llama31/index.md": "examples/inference/vllm/index.md"
"examples/llms/llama32/index.md": "examples/inference/vllm/index.md"
+ "examples/llms/qwen36/index.md": "examples/models/qwen36/index.md"
"docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md"
"docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md"
"docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md"
@@ -308,6 +310,8 @@ nav:
- AMD: examples/accelerators/amd/index.md
- TPU: examples/accelerators/tpu/index.md
- Tenstorrent: examples/accelerators/tenstorrent/index.md
+ - Models:
+ - Qwen 3.6: examples/models/qwen36/index.md
- Blog:
- blog/index.md
- Case studies: blog/case-studies.md