diff --git a/docs/docs/concepts/services.md b/docs/docs/concepts/services.md index fd0d2a2dc..f1a88e3dd 100644 --- a/docs/docs/concepts/services.md +++ b/docs/docs/concepts/services.md @@ -21,23 +21,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 \ + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -46,11 +46,8 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ``` @@ -61,38 +58,23 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` ```yaml type: service - name: qwen397 + name: qwen36 - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x - - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ - --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --reasoning-parser qwen3 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -101,15 +83,18 @@ The filename must end with `.dstack.yml` (e.g. `.dstack.yml` or `dev.dstack.yml` optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ``` +The first startup on MI300X can take longer while SGLang compiles ROCm +kernels. + To run a service, pass the configuration to [`dstack apply`](../reference/cli/dstack/apply.md):
@@ -117,14 +102,14 @@ To run a service, pass the configuration to [`dstack apply`](../reference/cli/ds ```shell $ dstack apply -f .dstack.yml -Submit the run qwen397? [y/n]: y +Submit the run qwen36? [y/n]: y Provisioning... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/qwen397/ -Model Qwen/Qwen3.5-397B-A17B-FP8 is published at: + http://localhost:3000/proxy/services/main/qwen36/ +Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` @@ -138,11 +123,11 @@ If you do not have a [gateway](gateways.md) created, the service endpoint will b
```shell -$ curl http://localhost:3000/proxy/services/main/qwen397/v1/chat/completions \ +$ curl http://localhost:3000/proxy/services/main/qwen36/v1/chat/completions \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer <dstack token>' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -213,23 +198,23 @@ You can configure the number of replicas as well as the auto-scaling rules. ```yaml type: service - name: qwen397-service + name: qwen36-service image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -238,11 +223,8 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 replicas: 1..2 scaling: @@ -258,38 +240,23 @@ You can configure the number of replicas as well as the auto-scaling rules. ```yaml type: service - name: qwen397-service - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36-service - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: # Optional instance volume for model and runtime caches @@ -298,11 +265,11 @@ You can configure the number of replicas as well as the auto-scaling rules. optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. 
+ gpu: MI300X:4 replicas: 1..2 scaling: diff --git a/docs/docs/quickstart.md b/docs/docs/quickstart.md index 4cdecae5e..80a98f79b 100644 --- a/docs/docs/quickstart.md +++ b/docs/docs/quickstart.md @@ -219,27 +219,26 @@ description: Quick guide to creating fleets and submitting runs ```yaml type: service - name: llama31-service - - # If `image` is not specified, dstack uses its default image - python: "3.11" - #image: dstackai/base:py3.13-0.7-cuda-12.1 - - # Required environment variables - env: - - HF_TOKEN + name: qwen36-service + + image: lmsysorg/sglang:v0.5.10.post1 + commands: - - pip install vllm - - vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --max-model-len 4096 - # Expose the vllm server port - port: 8000 + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --reasoning-parser qwen3 + # Expose the SGLang server port + port: 30000 # Specify a name if it's an OpenAI-compatible model - model: meta-llama/Meta-Llama-3.1-8B-Instruct - + model: Qwen/Qwen3.6-27B + # Required resources resources: - gpu: 24GB + gpu: H100 ```
@@ -249,22 +248,20 @@ description: Quick guide to creating fleets and submitting runs
```shell - $ HF_TOKEN=... $ dstack apply -f service.dstack.yml - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 aws us-west-2 g5.4xlarge 16xCPU, 64GB, 1xA10G (24GB) yes $0.22 - 2 aws us-east-2 g6.xlarge 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB) yes $0.27 - - Submit the run llama31-service? [y/n]: y - - Provisioning `llama31-service`... + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 nebius eu-north1 gpu-h100-sxm 16xCPU, 250GB, 1xH100 (80GB) no $2.95 + 2 runpod US-CA-2 NVIDIA H100 80GB HBM3 64xCPU, 1004GB, 1xH100 (80GB) no $2.99 + + Submit the run qwen36-service? [y/n]: y + + Provisioning `qwen36-service`... ---> 100% Service is published at: - http://localhost:3000/proxy/services/main/llama31-service/ - Model meta-llama/Meta-Llama-3.1-8B-Instruct is published at: + http://localhost:3000/proxy/services/main/qwen36-service/ + Model Qwen/Qwen3.6-27B is published at: http://localhost:3000/proxy/models/main/ ``` diff --git a/docs/examples.md b/docs/examples.md index 04cd5ff0f..b3e3e0d42 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,6 +1,6 @@ --- title: Examples -description: Collection of examples for training, inference, and clusters +description: Collection of examples for models, training, inference, and clusters #template: examples.html hide: # - navigation @@ -153,7 +153,7 @@ hide: SGLang

- Deploy DeepSeek distilled models with SGLang + Deploy Qwen3.6-27B with SGLang

- Deploy Llama 3.1 with vLLM + Deploy Qwen3.6-27B with vLLM

+## Models + +
+ +

+ Qwen 3.6 +

+ +

+ Deploy Qwen3.6-27B with SGLang on NVIDIA or AMD +

+
+
+ ## Accelerators
diff --git a/docs/examples/models/qwen36/index.md b/docs/examples/models/qwen36/index.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/accelerators/amd/README.md b/examples/accelerators/amd/README.md
index 36be8044e..b35b29c1c 100644
--- a/examples/accelerators/amd/README.md
+++ b/examples/accelerators/amd/README.md
@@ -1,6 +1,6 @@
 ---
 title: AMD
-description: Deploying and fine-tuning models on AMD MI300X GPUs using vLLM, TRL, and Axolotl
+description: Deploying and fine-tuning models on AMD MI300X GPUs using SGLang, vLLM, TRL, and Axolotl
 ---
 
 # AMD
@@ -11,8 +11,49 @@ with on-prem AMD GPUs or configuring a backend that offers AMD GPUs such as the
 
 ## Deployment
 
-vLLM supports AMD GPUs. Here's an example of a [service](https://dstack.ai/docs/services) that deploys
-Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html).
+Here are examples of [services](https://dstack.ai/docs/services) that deploy
+`Qwen/Qwen3.6-27B` on AMD MI300X GPUs using
+[SGLang](https://github.com/sgl-project/sglang) and
+[vLLM](https://docs.vllm.ai/en/latest/).
+
+=== "SGLang"
+
+ + ```yaml + type: service + name: qwen36-service-sglang-amd + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
=== "vLLM" @@ -20,65 +61,49 @@ Llama 3.1 70B in FP16 using [vLLM](https://docs.vllm.ai/en/latest/getting_starte ```yaml type: service - name: llama31-service-vllm-amd + name: qwen36-service-vllm-amd + + image: vllm/vllm-openai-rocm:v0.19.1 - # Using Runpod's ROCm Docker image - image: runpod/pytorch:2.4.0-py3.10-rocm6.1.0-ubuntu22.04 - # Required environment variables - env: - - HF_TOKEN - - MODEL_ID=meta-llama/Meta-Llama-3.1-70B-Instruct - - MAX_MODEL_LEN=126192 - # Commands of the task commands: - - export PATH=/opt/conda/envs/py_3.10/bin:$PATH - - wget https://github.com/ROCm/hipBLAS/archive/refs/tags/rocm-6.1.0.zip - - unzip rocm-6.1.0.zip - - cd hipBLAS-rocm-6.1.0 - - python rmake.py - - cd .. - - git clone https://github.com/vllm-project/vllm.git - - cd vllm - - pip install triton - - pip uninstall torch -y - - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 - - pip install /opt/rocm/share/amd_smi - - pip install --upgrade numba scipy huggingface-hub[cli] - - pip install "numpy<2" - - pip install -r requirements-rocm.txt - - wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib - - rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* - - export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - - wget https://dstack-binaries.s3.amazonaws.com/vllm-0.6.0%2Brocm614-cp310-cp310-linux_x86_64.whl - - pip install vllm-0.6.0+rocm614-cp310-cp310-linux_x86_64.whl - - vllm serve $MODEL_ID --max-model-len $MAX_MODEL_LEN --port 8000 - # Service port + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + port: 8000 - # Register the model - model: meta-llama/Meta-Llama-3.1-70B-Instruct + model: Qwen/Qwen3.6-27B - # Uncomment to leverage spot instances - #spot_policy: auto + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true resources: - gpu: MI300X - disk: 200GB + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 ```
- Note, maximum size of vLLM’s `KV cache` is 126192, consequently we must set `MAX_MODEL_LEN` to 126192. Adding `/opt/conda/envs/py_3.10/bin` to PATH ensures we use the Python 3.10 environment necessary for the pre-built binaries compiled specifically for this version. - - > To speed up the `vLLM-ROCm` installation, this example uses a pre-built binary from S3. - !!! info "Docker image" - If you want to use AMD, specifying `image` is currently required. This must be an image that includes - ROCm drivers. + AMD deployments require specifying an image that already includes ROCm + drivers. The SGLang and vLLM examples above use pinned ROCm images. To request multiple GPUs, specify the quantity after the GPU name, separated by a colon, e.g., `MI300X:4`. ## Fine-tuning +> If you're planning multi-node AMD training, validate cluster networking first +with the [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) +example. + === "TRL" Below is an example of LoRA fine-tuning Llama 3.1 8B using [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/single-gpu-fine-tuning-and-inference.html) @@ -189,28 +214,30 @@ To request multiple GPUs, specify the quantity after the GPU name, separated by ## Running a configuration -Once a configuration is ready, save it to a `.dstack.yml` file, then run -`dstack apply -f `, and `dstack` will automatically provision the -cloud resources and run the configuration. +Once a configuration is ready, save it to a `.dstack.yml` file. If your +configuration references environment variables such as `HF_TOKEN` or +`WANDB_API_KEY`, export them first. Then run +`dstack apply -f `, and `dstack` will automatically +provision the cloud resources and run the configuration.
 ```shell
-$ HF_TOKEN=...
-$ WANDB_API_KEY=...
-$ WANDB_PROJECT=...
-$ WANDB_NAME=axolotl-amd-llama31-train
-$ HUB_MODEL_ID=...
-$ dstack apply -f service.dstack.yml
+$ dstack apply -f <configuration file>
 ```
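+
+For instance, a fine-tuning configuration that reads `HF_TOKEN` and
+`WANDB_API_KEY` could be launched like this (the `train.dstack.yml` filename
+is only an illustration):
+
+```shell
+# Export the environment variables the configuration references
+$ export HF_TOKEN=...
+$ export WANDB_API_KEY=...
+
+# Provision cloud resources and run the configuration
+$ dstack apply -f train.dstack.yml
+```
+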
## What's next? -1. Browse [vLLM](https://docs.vllm.ai/en/latest/getting_started/amd-installation.html#build-from-source-rocm), +1. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples, plus [Axolotl](https://github.com/ROCm/rocm-blogs/tree/release/blogs/artificial-intelligence/axolotl), - [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html) and - [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) -2. Check [dev environments](https://dstack.ai/docs/dev-environments), [tasks](https://dstack.ai/docs/tasks), and + [TRL](https://rocm.docs.amd.com/en/latest/how-to/llm-fine-tuning-optimization/fine-tuning-and-inference.html), + and [ROCm Bitsandbytes](https://github.com/ROCm/bitsandbytes) +2. For multi-node training, run + [NCCL/RCCL tests](https://dstack.ai/examples/clusters/nccl-rccl-tests/) + to validate AMD cluster networking. +3. Check [dev environments](https://dstack.ai/docs/dev-environments), + [tasks](https://dstack.ai/docs/tasks), and [services](https://dstack.ai/docs/services). diff --git a/examples/inference/sglang/README.md b/examples/inference/sglang/README.md index 9d08fe09c..ddf49fd31 100644 --- a/examples/inference/sglang/README.md +++ b/examples/inference/sglang/README.md @@ -1,41 +1,41 @@ --- title: SGLang -description: Deploying Qwen3.5-397B-A17B-FP8 using SGLang on NVIDIA and AMD GPUs +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs --- # SGLang -This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using +This example shows how to deploy `Qwen/Qwen3.6-27B` using [SGLang](https://github.com/sgl-project/sglang) and `dstack`. ## Apply a configuration Here's an example of a service that deploys -`Qwen/Qwen3.5-397B-A17B-FP8` using SGLang. +`Qwen/Qwen3.6-27B` using SGLang. === "NVIDIA" -
+
```yaml type: service - name: qwen397 + name: qwen36 image: lmsysorg/sglang:v0.5.10.post1 commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ - --enable-flashinfer-allreduce-fusion \ - --mem-fraction-static 0.8 + --mem-fraction-static 0.8 \ + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -43,52 +43,34 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
=== "AMD" -
+
```yaml type: service - name: qwen397 - - image: lmsysorg/sglang:v0.5.10.post1-rocm720-mi30x + name: qwen36 - env: - - HIP_FORCE_DEV_KERNARG=1 - - SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 - - SGLANG_DISABLE_CUDNN_CHECK=1 - - SGLANG_INT4_WEIGHT=0 - - SGLANG_MOE_PADDING=1 - - SGLANG_ROCM_DISABLE_LINEARQUANT=0 - - SGLANG_ROCM_FUSED_DECODE_MLA=1 - - SGLANG_SET_CPU_AFFINITY=1 - - SGLANG_USE_AITER=1 - - SGLANG_USE_ROCM700A=1 + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x commands: - | sglang serve \ - --model-path Qwen/Qwen3.5-397B-A17B-FP8 \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ --tp $DSTACK_GPUS_NUM \ --reasoning-parser qwen3 \ - --tool-call-parser qwen3_coder \ --mem-fraction-static 0.8 \ - --context-length 262144 \ - --attention-backend triton \ - --disable-cuda-graph \ - --fp8-gemm-backend aiter \ - --port 30000 + --context-length 262144 port: 30000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -96,24 +78,26 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:52.. - memory: 700GB.. + cpu: 52.. + memory: 896GB.. shm_size: 16GB - disk: 600GB.. - gpu: MI300X:192GB:4 + disk: 450GB.. + gpu: MI300X:4 ```
-The AMD example uses the exact validated MI300X configuration for this model, -including the ROCm/AITER settings required for stable FP8 serving. +The AMD example keeps the deployment close to the upstream Qwen and SGLang +guidance: a pinned ROCm image, tensor parallelism across all four GPUs, and the +standard `qwen3` reasoning parser without extra ROCm-specific tuning flags. +The first startup on MI300X can take longer while SGLang compiles ROCm kernels. -Save one of the configurations above as `qwen397.dstack.yml`, then use the +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -123,26 +107,29 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." } ], - "chat_template_kwargs": {"enable_thinking": true}, "separate_reasoning": true, "max_tokens": 1024 }' ```
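+
+Qwen3.6 responds in thinking mode by default (see the note below). A variant
+of the request above that disables thinking per request via
+`chat_template_kwargs`, using the same placeholder token:
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+  -X POST \
+  -H 'Authorization: Bearer <dstack token>' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen3.6-27B",
+    "messages": [
+      {
+        "role": "user",
+        "content": "Reply with a single word: ready."
+      }
+    ],
+    "chat_template_kwargs": {"enable_thinking": false},
+    "max_tokens": 32
+  }'
+```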
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +Qwen3.6 uses thinking mode by default. To disable thinking, pass +`"chat_template_kwargs": {"enable_thinking": false}` in the request body. To +enable tool calling, add `--tool-call-parser qwen3_coder` to the serve command. + +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## Configuration options @@ -232,4 +219,4 @@ Currently, auto-scaling only supports `rps` as the metric. TTFT and ITL metrics ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [Qwen 3.5 SGLang cookbook](https://cookbook.sglang.io/autoregressive/Qwen/Qwen3.5) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) +2. Browse the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) and the [SGLang server arguments reference](https://docs.sglang.ai/advanced_features/server_arguments.html) diff --git a/examples/inference/vllm/README.md b/examples/inference/vllm/README.md index 7497af669..75d6add9b 100644 --- a/examples/inference/vllm/README.md +++ b/examples/inference/vllm/README.md @@ -1,39 +1,39 @@ --- title: vLLM -description: Deploying Qwen3.5-397B-A17B-FP8 using vLLM on NVIDIA GPUs +description: Deploying Qwen3.6-27B using vLLM on NVIDIA and AMD GPUs --- # vLLM -This example shows how to deploy `Qwen/Qwen3.5-397B-A17B-FP8` using +This example shows how to deploy `Qwen/Qwen3.6-27B` using [vLLM](https://docs.vllm.ai/en/latest/) and `dstack`. ## Apply a configuration Here's an example of a service that deploys -`Qwen/Qwen3.5-397B-A17B-FP8` using vLLM. +`Qwen/Qwen3.6-27B` using vLLM. === "NVIDIA" -
+
```yaml type: service - name: qwen397 + name: qwen36 image: vllm/vllm-openai:v0.19.1 commands: - | - vllm serve Qwen/Qwen3.5-397B-A17B-FP8 \ + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ --port 8000 \ --tensor-parallel-size $DSTACK_GPUS_NUM \ --max-model-len 262144 \ - --reasoning-parser qwen3 \ - --language-model-only + --reasoning-parser qwen3 port: 8000 - model: Qwen/Qwen3.5-397B-A17B-FP8 + model: Qwen/Qwen3.6-27B volumes: - instance_path: /root/.cache @@ -41,26 +41,60 @@ Here's an example of a service that deploys optional: true resources: - cpu: x86:96.. - memory: 512GB.. shm_size: 16GB - disk: 500GB.. - gpu: H100:80GB:8 + gpu: H100:4 ```
-The NVIDIA example serves `Qwen/Qwen3.5-397B-A17B-FP8` on `8x H100` GPUs using -vLLM with tensor parallelism enabled. It uses `--language-model-only` because -`Qwen/Qwen3.5-397B-A17B-FP8` is a text-only model. +=== "AMD" -Save the configuration above as `qwen397.dstack.yml`, then use the +
+ + ```yaml + type: service + name: qwen36 + + image: vllm/vllm-openai-rocm:v0.19.1 + + commands: + - | + vllm serve Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 8000 \ + --tensor-parallel-size $DSTACK_GPUS_NUM \ + --max-model-len 262144 \ + --reasoning-parser qwen3 + + port: 8000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +Qwen3.6-27B is a multimodal model. For text-only workloads, add +`--language-model-only` to free more memory for the KV cache. To enable tool +calling, add `--enable-auto-tool-choice --tool-call-parser qwen3_coder`. + +Save one of the configurations above as `qwen36.dstack.yml`, then use the [`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md) command.
```shell -$ dstack apply -f qwen397.dstack.yml +$ dstack apply -f qwen36.dstack.yml ```
@@ -70,12 +104,12 @@ If no gateway is created, the service endpoint will be available at ` ```shell -curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \ +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ -X POST \ -H 'Authorization: Bearer <dstack token>' \ -H 'Content-Type: application/json' \ -d '{ - "model": "Qwen/Qwen3.5-397B-A17B-FP8", + "model": "Qwen/Qwen3.6-27B", "messages": [ { "role": "user", @@ -88,9 +122,9 @@ curl http://127.0.0.1:3000/proxy/services/main/qwen397/v1/chat/completions \
-> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen397./`. +> If a [gateway](https://dstack.ai/docs/concepts/gateways/) is configured (e.g. to enable auto-scaling, HTTPS, rate limits, etc.), the service endpoint will be available at `https://qwen36./`. ## What's next? 1. Read about [services](https://dstack.ai/docs/concepts/services) and [gateways](https://dstack.ai/docs/concepts/gateways) -2. Browse the [SGLang](https://dstack.ai/examples/inference/sglang/) and [NIM](https://dstack.ai/examples/inference/nim/) examples +2. Browse the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) and the [SGLang](https://dstack.ai/examples/inference/sglang/) example diff --git a/examples/models/qwen36/README.md b/examples/models/qwen36/README.md new file mode 100644 index 000000000..bc92271b2 --- /dev/null +++ b/examples/models/qwen36/README.md @@ -0,0 +1,168 @@ +--- +title: Qwen 3.6 +description: Deploying Qwen3.6-27B using SGLang on NVIDIA and AMD GPUs +--- + +# Qwen 3.6 + +This example shows how to deploy `Qwen/Qwen3.6-27B` as a +[service](https://dstack.ai/docs/services) using +[SGLang](https://github.com/sgl-project/sglang) and `dstack`. + +## Apply a configuration + +Save one of the following configurations as `qwen36.dstack.yml`. + +=== "NVIDIA" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10.post1 + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + shm_size: 16GB + gpu: H100:4 + ``` + +
+ +=== "AMD" + +
+ + ```yaml + type: service + name: qwen36 + + image: lmsysorg/sglang:v0.5.10-rocm720-mi30x + + commands: + - | + sglang serve \ + --model-path Qwen/Qwen3.6-27B \ + --host 0.0.0.0 \ + --port 30000 \ + --tp $DSTACK_GPUS_NUM \ + --reasoning-parser qwen3 \ + --mem-fraction-static 0.8 \ + --context-length 262144 + + port: 30000 + model: Qwen/Qwen3.6-27B + + volumes: + - instance_path: /root/.cache + path: /root/.cache + optional: true + + resources: + cpu: 52.. + memory: 896GB.. + shm_size: 16GB + disk: 450GB.. + gpu: MI300X:4 + ``` + +
+ +The NVIDIA and AMD configurations above use pinned SGLang images and the same +straightforward 4-GPU layout used across the Qwen 3.6 docs and examples. + +Apply the configuration with +[`dstack apply`](https://dstack.ai/docs/reference/cli/dstack/apply.md). + +
+ +```shell +$ dstack apply -f qwen36.dstack.yml +``` + +
+
+If no gateway is created, the service endpoint will be available at
+`<dstack server URL>/proxy/services/<project name>/<run name>/`.
+
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "A bat and a ball cost $1.10 total. The bat costs $1.00 more than the ball. How much does the ball cost? Answer with just the dollar amount." + } + ], + "max_tokens": 1024 + }' +``` + +
+ +## Thinking mode + +Qwen3.6 uses thinking mode by default. With SGLang, the reasoning stream is +returned separately as `reasoning_content`. + +To disable thinking, pass `chat_template_kwargs` in the request body. + +
+ +```shell +curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \ + -X POST \ + -H 'Authorization: Bearer <dstack token>' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "Qwen/Qwen3.6-27B", + "messages": [ + { + "role": "user", + "content": "Summarize the benefits of container images in one sentence." + } + ], + "max_tokens": 256, + "chat_template_kwargs": { + "enable_thinking": false + } + }' +``` + +
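+
+## Tool calling
+
+The configurations above do not enable tool calling. To use it, add
+`--tool-call-parser qwen3_coder` to the `sglang serve` command and pass tools
+in the standard OpenAI format. The request below is a sketch; the
+`get_weather` function is only an illustration:
+
+```shell
+curl http://127.0.0.1:3000/proxy/services/main/qwen36/v1/chat/completions \
+  -X POST \
+  -H 'Authorization: Bearer <dstack token>' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "Qwen/Qwen3.6-27B",
+    "messages": [
+      {"role": "user", "content": "What is the weather like in Berlin right now?"}
+    ],
+    "tools": [
+      {
+        "type": "function",
+        "function": {
+          "name": "get_weather",
+          "description": "Get the current weather for a city",
+          "parameters": {
+            "type": "object",
+            "properties": {
+              "city": {"type": "string", "description": "City name"}
+            },
+            "required": ["city"]
+          }
+        }
+      }
+    ],
+    "max_tokens": 256
+  }'
+```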
+ +## What's next? + +1. Read the [Qwen/Qwen3.6-27B model card](https://huggingface.co/Qwen/Qwen3.6-27B) +2. Read the [Qwen 3.6 SGLang cookbook](https://docs.sglang.io/cookbook/autoregressive/Qwen/Qwen3.6) +3. Read the [Qwen 3.5 & 3.6 vLLM recipe](https://docs.vllm.ai/projects/recipes/en/latest/Qwen/Qwen3.5.html) +4. Browse the dedicated [SGLang](https://dstack.ai/examples/inference/sglang/) + and [vLLM](https://dstack.ai/examples/inference/vllm/) examples +5. Check the [AMD](https://dstack.ai/examples/accelerators/amd/) example for + more AMD deployment and training configurations diff --git a/mkdocs.yml b/mkdocs.yml index 8dbe0ad85..1baa53015 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -100,8 +100,10 @@ plugins: "docs/fleets.md": "docs/concepts/fleets.md" "docs/examples/llms/llama31.md": "examples/inference/vllm/index.md" "docs/examples/llms/llama32.md": "examples/inference/vllm/index.md" + "docs/examples/llms/qwen36.md": "examples/models/qwen36/index.md" "examples/llms/llama31/index.md": "examples/inference/vllm/index.md" "examples/llms/llama32/index.md": "examples/inference/vllm/index.md" + "examples/llms/qwen36/index.md": "examples/models/qwen36/index.md" "docs/examples/accelerators/amd/index.md": "examples/accelerators/amd/index.md" "docs/examples/deployment/nim/index.md": "examples/inference/nim/index.md" "docs/examples/deployment/vllm/index.md": "examples/inference/vllm/index.md" @@ -308,6 +310,8 @@ nav: - AMD: examples/accelerators/amd/index.md - TPU: examples/accelerators/tpu/index.md - Tenstorrent: examples/accelerators/tenstorrent/index.md + - Models: + - Qwen 3.6: examples/models/qwen36/index.md - Blog: - blog/index.md - Case studies: blog/case-studies.md