From 2df200fcbe4e5e7c950e29ec585e21343370cbde Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Wed, 22 Apr 2026 14:41:16 +0200 Subject: [PATCH 1/2] Clarify Kubernetes backend UI config docs --- docs/docs/concepts/backends.md | 48 ++++++++++- .../server/services/test_backend_configs.py | 86 +++++++++++++++++++ 2 files changed, 131 insertions(+), 3 deletions(-) diff --git a/docs/docs/concepts/backends.md b/docs/docs/concepts/backends.md index 4c5606206..9a1092bc5 100644 --- a/docs/docs/concepts/backends.md +++ b/docs/docs/concepts/backends.md @@ -464,7 +464,7 @@ There are two ways to configure GCP: using a service account or using the defaul ??? info "User interface" - If you are configuring the `gcp` backend on the [project settigns page](projects.md#backends), + If you are configuring the `gcp` backend on the [project settings page](projects.md#backends), specify the contents of the JSON file in `data`:
@@ -699,7 +699,7 @@ projects: ``` ??? info "User interface" - If you are configuring the `nebius` backend on the [project settigns page](projects.md#backends), + If you are configuring the `nebius` backend on the [project settings page](projects.md#backends), specify the contents of the private key file in `private_key_content`:
@@ -1048,8 +1048,10 @@ projects: - name: main backends: - type: kubernetes + kubeconfig: filename: ~/.kube/config + proxy_jump: hostname: 204.12.171.137 port: 32000 @@ -1057,7 +1059,7 @@ projects:
-??? info "Proxy jump" +!!! info "Proxy jump" To allow the `dstack` server and CLI to access runs via SSH, `dstack` requires a node that acts as a jump host to proxy SSH traffic into containers. To configure this node, specify `hostname` and `port` under the `proxy_jump` property: @@ -1067,6 +1069,46 @@ projects: No additional setup is required — `dstack` configures and manages the proxy automatically. +??? info "User interface" + If you are configuring the `kubernetes` backend on the [project settings page](projects.md#backends), + specify the contents of the `kubeconfig` file in `data`: + +
+ + ```yaml + type: kubernetes + + kubeconfig: + data: | + apiVersion: v1 + kind: Config + current-context: kubernetes-admin@gpu-cluster + + clusters: + - name: gpu-cluster + cluster: + server: https://gpu-cluster.internal.example.com:6443 + certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t...LS0tLQo= + + users: + - name: kubernetes-admin + user: + client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0t...LS0tLQo= + client-key-data: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0t...LS0tLQo= + + contexts: + - name: kubernetes-admin@gpu-cluster + context: + cluster: gpu-cluster + user: kubernetes-admin + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + ``` + +
+ ??? info "Required operators" === "NVIDIA" For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the diff --git a/src/tests/_internal/server/services/test_backend_configs.py b/src/tests/_internal/server/services/test_backend_configs.py index 455b38c6e..96b5c998d 100644 --- a/src/tests/_internal/server/services/test_backend_configs.py +++ b/src/tests/_internal/server/services/test_backend_configs.py @@ -1,14 +1,17 @@ import json import sys from pathlib import Path +from textwrap import dedent from unittest.mock import patch import pytest import yaml +from dstack._internal.core.backends.kubernetes.backend import KubernetesBackend from dstack._internal.server import settings from dstack._internal.server.services.config import ( ServerConfigManager, + config_yaml_to_backend_config, file_config_to_config, ) @@ -144,3 +147,86 @@ def test_with_private_key_file(self, tmp_path: Path): assert backend_cfg.creds.service_account_id == "serviceaccount-e00test" assert backend_cfg.creds.public_key_id == "publickey-e00test" assert backend_cfg.creds.private_key_content == "TEST_PRIVATE_KEY" + + +class TestKubernetesBackendConfig: + def test_ui_config_embedded_kubeconfig_initializes_backend(self): + config_yaml = dedent( + """ + type: kubernetes + kubeconfig: + data: | + apiVersion: v1 + kind: Config + current-context: gpu-training + + clusters: + - name: gpu-training + cluster: + server: https://gpu-cluster.internal.example.com:6443 + insecure-skip-tls-verify: true + + users: + - name: ml-engineer + user: + token: test-token + + contexts: + - name: gpu-training + context: + cluster: gpu-training + user: ml-engineer + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + """ + ) + + backend_config = config_yaml_to_backend_config(config_yaml) + backend = KubernetesBackend(backend_config) + + assert backend.compute().api.api_client.configuration.host == ( + "https://gpu-cluster.internal.example.com:6443" + ) + assert 
backend.compute().proxy_jump.hostname == "204.12.171.137" + assert backend.compute().proxy_jump.port == 32000 + + def test_kubeconfig_context_namespace_does_not_set_backend_namespace(self): + config_yaml = dedent( + """ + type: kubernetes + kubeconfig: + data: | + apiVersion: v1 + kind: Config + current-context: gpu-training + + clusters: + - name: gpu-training + cluster: + server: https://gpu-cluster.internal.example.com:6443 + insecure-skip-tls-verify: true + + users: + - name: ml-engineer + user: + token: test-token + + contexts: + - name: gpu-training + context: + cluster: gpu-training + user: ml-engineer + namespace: training-jobs + + proxy_jump: + hostname: 204.12.171.137 + port: 32000 + """ + ) + + backend_config = config_yaml_to_backend_config(config_yaml) + backend = KubernetesBackend(backend_config) + + assert backend.compute().config.namespace == "default" From 6372527040bbd981276c6a139570d63c39e0fc38 Mon Sep 17 00:00:00 2001 From: Andrey Cheptsov Date: Wed, 22 Apr 2026 14:43:56 +0200 Subject: [PATCH 2/2] Remove outdated guide pages --- docs/blog/posts/gpu-health-checks.md | 2 +- docs/blog/posts/kubernetes-beta.md | 2 +- docs/blog/posts/nebius-in-dstack-sky.md | 4 +- docs/docs/guides/clusters.md | 82 ----------------- docs/docs/guides/dstack-sky.md | 44 --------- docs/docs/guides/kubernetes.md | 114 ------------------------ 6 files changed, 4 insertions(+), 244 deletions(-) delete mode 100644 docs/docs/guides/clusters.md delete mode 100644 docs/docs/guides/dstack-sky.md delete mode 100644 docs/docs/guides/kubernetes.md diff --git a/docs/blog/posts/gpu-health-checks.md b/docs/blog/posts/gpu-health-checks.md index b864e7785..9b074023c 100644 --- a/docs/blog/posts/gpu-health-checks.md +++ b/docs/blog/posts/gpu-health-checks.md @@ -68,6 +68,6 @@ If you have experience with GPU reliability or ideas for automated recovery, joi !!! info "What's next?" 1. Check [Quickstart](../../docs/quickstart.md) - 2. 
Explore the [clusters](../../docs/guides/clusters.md) guide + 2. Explore the [clusters](../../examples.md#clusters) examples 3. Learn more about [metrics](../../docs/concepts/metrics.md) 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/kubernetes-beta.md b/docs/blog/posts/kubernetes-beta.md index 6dfc7cd5b..a00a429af 100644 --- a/docs/blog/posts/kubernetes-beta.md +++ b/docs/blog/posts/kubernetes-beta.md @@ -311,5 +311,5 @@ Support for AMD GPUs is coming soon — our team is actively working on it right 2. Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 3. Read the the [clusters](../../docs/guides/clusters.md) guide + 3. Browse the [clusters](../../examples.md#clusters) examples 4. Join [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/blog/posts/nebius-in-dstack-sky.md b/docs/blog/posts/nebius-in-dstack-sky.md index a65a06dcf..dd1617d29 100644 --- a/docs/blog/posts/nebius-in-dstack-sky.md +++ b/docs/blog/posts/nebius-in-dstack-sky.md @@ -104,7 +104,7 @@ $ dstack apply -f my-cluster.dstack.yml Once the fleet is ready, you can run [distributed tasks](../../docs/concepts/tasks.md#distributed-tasks). `dstack` automatically configures drivers, networking, and fast GPU-to-GPU interconnect. -To learn more, see the [clusters](../../docs/guides/clusters.md) guide. +To learn more, see the [Nebius clusters](../../examples/clusters/nebius/index.md) example. With Nebius joining `dstack` Sky, users can now run on-demand and spot GPUs and clusters directly through the marketplace—gaining access to the same production grade infrastrucure Nebius customers use for frontier-scale training, without needing a separate Nebius account. @@ -124,4 +124,4 @@ Our goal is to give teams maximum flexibility while removing the complexity of m 4. 
Explore [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), [services](../../docs/concepts/services.md), and [fleets](../../docs/concepts/fleets.md) - 5. Reaad the the [clusters](../../docs/guides/clusters.md) guide + 5. Read the [Nebius clusters](../../examples/clusters/nebius/index.md) example diff --git a/docs/docs/guides/clusters.md b/docs/docs/guides/clusters.md deleted file mode 100644 index 30bfbee6e..000000000 --- a/docs/docs/guides/clusters.md +++ /dev/null @@ -1,82 +0,0 @@ -# Clusters - -A cluster is a [fleet](../concepts/fleets.md) with its `placement` set to `cluster`. This configuration ensures that the instances within the fleet are interconnected, enabling fast inter-node communication—crucial for tasks such as efficient distributed training. - -## Fleets - -Ensure a fleet is created before you run any distributed task. This can be either an SSH fleet or a cloud fleet. - -### SSH fleets - -[SSH fleets](../concepts/fleets.md#ssh-fleets) can be used to create a fleet out of existing baremetals or VMs, e.g. if they are already pre-provisioned, or set up on-premises. - -> For SSH fleets, fast interconnect is supported provided that the hosts are pre-configured with the appropriate interconnect drivers. - -### Cloud fleets - -[Cloud fleets](../concepts/fleets.md#backend-fleets) allow to provision interconnected clusters across supported backends. -For cloud fleets, fast interconnect is currently supported only on the `aws`, `gcp`, `nebius`, and `runpod` backends. - -=== "AWS" - When you create a cloud fleet with AWS, [Elastic Fabric Adapter](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html) networking is automatically configured if it’s supported for the corresponding instance type. - - !!! info "Backend configuration" - Note, EFA requires the `public_ips` to be set to `false` in the `aws` backend configuration. - Refer to the [AWS](../../examples/clusters/aws/index.md) example for more details. 
- -=== "GCP" - When you create a cloud fleet with GCP, `dstack` automatically configures [GPUDirect-TCPXO and GPUDirect-TCPX](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx-autopilot) networking for the A3 Mega and A3 High instance types, as well as RoCE networking for the A4 instance type. - - !!! info "Backend configuration" - You may need to configure `extra_vpcs` and `roce_vpcs` in the `gcp` backend configuration. - Refer to the [GCP](../../examples/clusters/gcp/index.md) examples for more details. - -=== "Nebius" - When you create a cloud fleet with Nebius, [InfiniBand](https://docs.nebius.com/compute/clusters/gpu) networking is automatically configured if it’s supported for the corresponding instance type. - -=== "Runpod" - When you run multinode tasks in a cluster cloud fleet with Runpod, `dstack` provisions [Runpod Instant Clusters](https://docs.runpod.io/instant-clusters) with InfiniBand networking configured. - -> To request fast interconnect support for other backends, -file an [issue](https://github.com/dstackai/dstack/issues){:target="_ blank"}. - -## Distributed tasks - -A distributed task is a task with `nodes` set to a value greater than `2`. In this case, `dstack` first ensures a -suitable fleet is available, then selects the master node (to obtain its IP) and finally runs jobs on each node. - -Within the task's `commands`, it's possible to use `DSTACK_MASTER_NODE_IP`, `DSTACK_NODES_IPS`, `DSTACK_NODE_RANK`, and other -[system environment variables](../concepts/tasks.md#system-environment-variables) for inter-node communication. - -??? info "MPI" - If want to use MPI, you can set `startup_order` to `workers-first` and `stop_criteria` to `master-done`, and use `DSTACK_MPI_HOSTFILE`. - See the [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests/index.md) examples. - -!!! info "Retry policy" - By default, if any of the nodes fails, `dstack` terminates the entire run. 
Configure a [retry policy](../concepts/tasks.md#retry-policy) to restart the run if any node fails. - -Refer to [distributed tasks](../concepts/tasks.md#distributed-tasks) for an example. - -## NCCL/RCCL tests - -To test the interconnect of a created fleet, ensure you run [NCCL/RCCL tests](../../examples/clusters/nccl-rccl-tests/index.md) tests using MPI. - -## Volumes - -### Instance volumes - -[Instance volumes](../concepts/volumes.md#instance-volumes) enable mounting any folder from the host into the container, allowing data persistence during distributed tasks. - -Instance volumes can be used to mount: - -* Regular folders (data persists only while the fleet exists) -* Folders that are mounts of shared filesystems (e.g., manually mounted shared filesystems). - -### Network volumes - -Currently, no backend supports multi-attach [network volumes](../concepts/volumes.md#network-volumes) for distributed tasks. However, single-attach volumes can be used by leveraging volume name [interpolation syntax](../concepts/volumes.md#distributed-tasks). This approach mounts a separate single-attach volume to each node. - -!!! info "What's next?" - 1. Read about [distributed tasks](../concepts/tasks.md#distributed-tasks), [fleets](../concepts/fleets.md), and [volumes](../concepts/volumes.md) - 2. Browse the [Clusters](../../examples.md#clusters) and [Distributed training](../../examples.md#distributed-training) examples - diff --git a/docs/docs/guides/dstack-sky.md b/docs/docs/guides/dstack-sky.md deleted file mode 100644 index e12f1ccef..000000000 --- a/docs/docs/guides/dstack-sky.md +++ /dev/null @@ -1,44 +0,0 @@ -# dstack Sky - -If you don't want to host the `dstack` server or would like to access GPU from the `dstack` marketplace, -sign up with [dstack Sky](../guides/dstack-sky.md). - -### Set up the CLI - -If you've signed up, open your project settings, and copy the `dstack project add` command to point the CLI to the project. 
- -![](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-project-config.png){ width=800 } - -Then, install the CLI on your machine and use the copied command. - -
- -```shell -$ pip install dstack -$ dstack project add --name peterschmidt85 \ - --url https://sky.dstack.ai \ - --token bbae0f28-d3dd-4820-bf61-8f4bb40815da - -Configuration is updated at ~/.dstack/config.yml -``` - -
- -### Configure clouds - -By default, [dstack Sky](https://sky.dstack.ai) -uses the GPU from its marketplace, which requires a credit card to be attached in your account -settings. - -To use your own cloud accounts, click the settings icon of the corresponding backend and specify credentials: - -![](https://raw.githubusercontent.com/dstackai/static-assets/main/static-assets/images/dstack-sky-edit-backend-config.png){ width=800 } - -For more details on how to configure your own cloud accounts, check -the [server/config.yml reference](../reference/server/config.yml.md). - -## What's next? - -1. Follow [quickstart](../quickstart.md) -2. Browse [examples](https://dstack.ai/examples) -3. Join the community via [Discord](https://discord.gg/u8SmfwPpMd) diff --git a/docs/docs/guides/kubernetes.md b/docs/docs/guides/kubernetes.md deleted file mode 100644 index 85dc22a80..000000000 --- a/docs/docs/guides/kubernetes.md +++ /dev/null @@ -1,114 +0,0 @@ -# Kubernetes - -The [kubernetes](../concepts/backends.md#kubernetes) backend enables `dstack` to run [dev environments](/docs/concepts/dev-environments), [tasks](/docs/concepts/tasks), and [services](/docs/concepts/services) directly on existing Kubernetes clusters. - -If your GPUs are already deployed on Kubernetes and your team relies on its ecosystem and tooling, use this backend to integrate `dstack` with your clusters. - -> If Kubernetes is not required, you can run `dstack` on clouds or on-prem clusters without Kubernetes by using [VM-based](../concepts/backends.md#vm-based), [container-based](../concepts/backends.md#container-based), or [on-prem](../concepts/backends.md#on-prem) backends. - -## Setting up the backend - -To use the `kubernetes` backend with `dstack`, you need to configure it with the path to the kubeconfig file, the IP address of any node in the cluster, and the port that `dstack` will use for proxying SSH traffic. -This configuration is defined in the `~/.dstack/server/config.yml` file: - -
- -```yaml -projects: -- name: main - backends: - - type: kubernetes - kubeconfig: - filename: ~/.kube/config - proxy_jump: - hostname: 204.12.171.137 - port: 32000 -``` - -
- -### Proxy jump - -To allow the `dstack` server and CLI to access runs via SSH, `dstack` requires a node that acts as a jump host to proxy SSH traffic into containers. - -To configure this node, specify `hostname` and `port` under the `proxy_jump` property: - -- `hostname` — the IP address of any cluster node selected as the jump host. Both the `dstack` server and CLI must be able to reach it. This node can be either a GPU node or a CPU-only node — it makes no difference. -- `port` — any accessible port on that node, which `dstack` uses to forward SSH traffic. - -No additional setup is required — `dstack` configures and manages the proxy automatically. - -### NVIDIA GPU Operator - -> For `dstack` to correctly detect GPUs in your Kubernetes cluster, the cluster must have the -[NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/index.html) pre-installed. - -After the backend is set up, you interact with `dstack` just as you would with other backends or SSH fleets. You can run dev environments, tasks, and services. - -## Fleets - -### Clusters - -If you’d like to run [distributed tasks](../concepts/tasks.md#distributed-tasks) with the `kubernetes` backend, you first need to create a fleet with `placement` set to `cluster`: - -
- - ```yaml - type: fleet - # The name is optional; if not specified, one is generated automatically - name: my-k8s-fleet - - # For `kubernetes`, `min` should be set to `0` since it can't pre-provision VMs. - # Optionally, you can set the maximum number of nodes to limit scaling. - nodes: 0.. - - placement: cluster - - backends: [kubernetes] - - resources: - # Specify requirements to filter nodes - gpu: 1..8 - ``` - -
- -Then, create the fleet using the `dstack apply` command: - -
- -```shell -$ dstack apply -f examples/misc/fleets/.dstack.yml - -Provisioning... ----> 100% - - FLEET INSTANCE BACKEND GPU PRICE STATUS CREATED -``` - -
- -Once the fleet is created, you can run [distributed tasks](../concepts/tasks.md#distributed-tasks). `dstack` takes care of orchestration automatically. - -For more details on clusters, see the [corresponding guide](clusters.md). - -> Fleets with `placement` set to `cluster` can be used not only for distributed tasks, but also for dev environments, single-node tasks, and services. -> Since Kubernetes clusters are interconnected by default, you can always set `placement` to `cluster`. - -!!! info "Fleets" - It’s generally recommended to create [fleets](../concepts/fleets.md) even if you don’t plan to run distributed tasks. - -## FAQ - -??? info "Is managed Kubernetes with auto-scaling supported?" - Managed Kubernetes is supported. However, the `kubernetes` backend can only run on pre-provisioned nodes. - Support for auto-scalable Kubernetes clusters is coming soon—you can track progress in the corresponding [issue](https://github.com/dstackai/dstack/issues/3126). - - If on-demand provisioning is important, we recommend using [VM-based](../concepts/backends.md#vm-based) backends as they already support auto-scaling. - -??? info "When should I use the Kubernetes backend?" - Choose the `kubernetes` backend if your GPUs already run on Kubernetes and your team depends on its ecosystem and tooling. - - If your priority is orchestrating cloud GPUs and Kubernetes isn’t a must, [VM-based](../concepts/backends.md#vm-based) backends are a better fit thanks to their native cloud integration. - - For on-prem GPUs where Kubernetes is optional, [SSH fleets](../concepts/fleets.md#ssh-fleets) provide a simpler and more lightweight alternative.