From fa0daa22e06b33243e22a52f7d01fe1b218dbcd5 Mon Sep 17 00:00:00 2001 From: mmoulikk Date: Thu, 16 Apr 2026 18:14:49 +0530 Subject: [PATCH 1/9] fix(nebius): filter ARM64 images from boot disk selection --- v1/providers/nebius/instance.go | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index b474773..8c76f68 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1209,7 +1209,7 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri publicImagesParent := c.getPublicImagesParent() // Skip validation for known-good common families to speed up instance start - knownFamilies := []string{"ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} + knownFamilies := []string{"ubuntu24.04-cuda13.0", "ubuntu24.04-cuda12", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} isKnownFamily := false for _, known := range knownFamilies { if imageFamily == known { @@ -1230,21 +1230,24 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri return baseReq, nil } - // For unknown families, validate first - _, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ + // For unknown families, validate first and check architecture + latestImage, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ ParentId: publicImagesParent, ImageFamily: imageFamily, }) if err == nil { - // Family works, use it - baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ - SourceImageFamily: &compute.SourceImageFamily{ - ImageFamily: imageFamily, - ParentId: publicImagesParent, - }, + isARM64 := latestImage.Spec != nil && latestImage.Spec.GetCpuArchitecture() == 
compute.ImageSpec_ARM64 + if !isARM64 { + baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ + SourceImageFamily: &compute.SourceImageFamily{ + ImageFamily: imageFamily, + ParentId: publicImagesParent, + }, + } + baseReq.Metadata.Labels["image-family"] = imageFamily + return baseReq, nil } - baseReq.Metadata.Labels["image-family"] = imageFamily - return baseReq, nil + // ARM64 family — fall through to getWorkingPublicImageID which filters by architecture } } @@ -1290,6 +1293,10 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma continue } + if image.Spec != nil && image.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { + continue + } + imageName := strings.ToLower(image.Metadata.Name) // Set fallback to first available image @@ -1583,6 +1590,9 @@ func (c *NebiusClient) parseInstanceType(ctx context.Context, instanceTypeID str func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) (string, error) { // Common Nebius image families - if ImageID matches one of these, use it directly commonFamilies := []string{ + "ubuntu24.04-cuda13.0", + "ubuntu24.04-cuda12", + "ubuntu24.04-driverless", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8", From df17220a563f2d4c3a589750abd0c1fcb9e07afb Mon Sep 17 00:00:00 2001 From: abhtripathi Date: Fri, 17 Apr 2026 13:40:56 +0530 Subject: [PATCH 2/9] fix: iptable persistent removed from packages --- v1/providers/nebius/instance.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 8c76f68..9068d76 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1762,7 +1762,6 @@ func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) script := `#cloud-config packages: - ufw - - iptables-persistent ` // Add SSH key configuration if provided @@ -1794,6 +1793,13 @@ packages: 
// accessible from the internet by default. commands = append(commands, generateIPTablesCommands()...) + // Install iptables-persistent here (in runcmd, after UFW is configured) rather than + // in the packages: directive. Installing it as a package would start netfilter-persistent.service + // immediately at first boot, which races with ufw.service — netfilter-persistent flushes + // UFW's rules before UFW finishes loading them (Launchpad bug #1987227). By installing + // it here, the service only starts after UFW is already set up and the drop-in is in place. + commands = append(commands, "DEBIAN_FRONTEND=noninteractive apt-get install -y iptables-persistent") + // Save the complete iptables state (UFW chains + DOCKER-USER rules) so it // survives instance stop/start cycles. Cloud-init runcmd only executes on // first boot; on subsequent boots netfilter-persistent restores this snapshot, From 070bea4bd4fa224201c813d2940ae04aec99962b Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 16:08:22 +0530 Subject: [PATCH 3/9] fix(nebius): score-based image selection in getWorkingPublicImageID --- v1/providers/nebius/instance.go | 104 ++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 45 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 9068d76..65454cb 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1265,11 +1265,9 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri return nil, fmt.Errorf("could not resolve image %s to either a working family or image ID: %w", attrs.ImageID, err) } -// getWorkingPublicImageID gets a working public image ID based on the requested image type -// -//nolint:gocognit,gocyclo // Complex function trying multiple image resolution strategies +// getWorkingPublicImageID gets a working public image ID based on the requested image type. 
+// It scores every non-ARM64 image and returns the highest-scored one, this is done to handle change in ordering of images from nebius api. func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedImage string) (string, error) { - // Get available public images from the correct region publicImagesParent := c.getPublicImagesParent() imagesResp, err := c.sdk.Services().Compute().V1().Image().List(ctx, &compute.ListImagesRequest{ ParentId: publicImagesParent, @@ -1282,67 +1280,83 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma return "", fmt.Errorf("no public images available") } - // Try to find the best match based on the requested image requestedLower := strings.ToLower(requestedImage) - var bestMatch *compute.Image - var fallbackImage *compute.Image + var bestImage *compute.Image + bestScore := -1 for _, image := range imagesResp.GetItems() { if image.Metadata == nil { continue } - if image.Spec != nil && image.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { continue } - imageName := strings.ToLower(image.Metadata.Name) - - // Set fallback to first available image - if fallbackImage == nil { - fallbackImage = image + score := scoreImage(image, requestedLower) + if score > bestScore { + bestScore = score + bestImage = image } + } - // Look for Ubuntu matches - if strings.Contains(requestedLower, "ubuntu") && strings.Contains(imageName, "ubuntu") { - // Prefer specific version matches - //nolint:gocritic // if-else chain is clearer than switch for version matching logic - if strings.Contains(requestedLower, "24.04") || strings.Contains(requestedLower, "24") { - if strings.Contains(imageName, "ubuntu24.04") { - bestMatch = image - break - } - } else if strings.Contains(requestedLower, "22.04") || strings.Contains(requestedLower, "22") { - if strings.Contains(imageName, "ubuntu22.04") { - bestMatch = image - break - } - } else if strings.Contains(requestedLower, "20.04") || 
strings.Contains(requestedLower, "20") { - if strings.Contains(imageName, "ubuntu20.04") { - bestMatch = image - break - } - } + if bestImage == nil { + return "", fmt.Errorf("no suitable public image found") + } + + return bestImage.Metadata.Id, nil +} - // Any Ubuntu image is better than non-Ubuntu - if bestMatch == nil { - bestMatch = image +// scoreImage assigns a priority score to an image. Higher score = better match. +// When requestedImage is empty (default deploy), the function uses a standard +// preference order: Ubuntu 24 CUDA 13 > Ubuntu 24 CUDA 12 > Ubuntu 22 CUDA 12 +// > Ubuntu 24 driverless > any Ubuntu > worker node > anything else. +// When requestedImage is non-empty, exact family/name matches get a bonus. +func scoreImage(image *compute.Image, requestedLower string) int { + family := "" + if image.Spec != nil { + family = strings.ToLower(image.Spec.GetImageFamily()) + } + name := strings.ToLower(image.Metadata.Name) + + isWorkerNode := strings.Contains(family, "mk8s-worker") || strings.Contains(name, "worker-node") + isUbuntu := strings.Contains(family, "ubuntu") || strings.Contains(name, "ubuntu") + hasCuda13 := strings.Contains(family, "cuda13") || strings.Contains(name, "cuda13") + hasCuda12 := strings.Contains(family, "cuda12") || strings.Contains(name, "cuda12") + isUbuntu24 := strings.Contains(family, "ubuntu24") || strings.Contains(name, "ubuntu24") + isUbuntu22 := strings.Contains(family, "ubuntu22") || strings.Contains(name, "ubuntu22") + + score := 1 // baseline for any non-ARM64 image + + if isWorkerNode { + score = 10 + } else if isUbuntu { + score = 50 + if isUbuntu24 { + score = 60 + if hasCuda13 { + score = 100 + } else if hasCuda12 { + score = 90 + } + } else if isUbuntu22 { + score = 55 + if hasCuda12 { + score = 80 } } } - // Use best match if found, otherwise fallback - selectedImage := bestMatch - if selectedImage == nil { - selectedImage = fallbackImage - } - - if selectedImage == nil { - return "", fmt.Errorf("no suitable 
public image found") + // If the caller explicitly requested something, boost images that match the request + if requestedLower != "" { + if strings.Contains(name, requestedLower) || strings.Contains(family, requestedLower) || requestedLower == family { + score += 200 + } else if isUbuntu && strings.Contains(requestedLower, "ubuntu") { + score += 50 + } } - return selectedImage.Metadata.Id, nil + return score } // getPublicImagesParent determines the correct public images parent ID based on project routing code From 3f3e60d7b6a9e345575cdf46af81abda8d5e7ca6 Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 17:10:34 +0530 Subject: [PATCH 4/9] refactor(nebius): extract scoreImage helpers and add unit tests --- v1/providers/nebius/instance.go | 99 +++++++++++++++++----------- v1/providers/nebius/instance_test.go | 81 +++++++++++++++++++++++ 2 files changed, 142 insertions(+), 38 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 65454cb..59537c5 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -18,6 +18,22 @@ const ( platformTypeCPU = "cpu" ) +// image scoring tiers. higher wins. exact-match bonus (200) > max baseline gap +// so explicit requests always override defaults, even for worker-node. 
+const ( + imageScoreBaseline = 1 + imageScoreWorkerNode = 10 + imageScoreUbuntuGeneric = 50 + imageScoreUbuntu22 = 55 + imageScoreUbuntu24 = 60 + imageScoreUbuntu22Cuda = 80 + imageScoreUbuntu24Cuda12 = 90 + imageScoreUbuntu24Cuda13 = 100 + + imageScoreExactMatchBonus = 200 + imageScoreUbuntuHintBonus = 50 +) + //nolint:gocyclo,funlen // Complex instance creation with resource management func (c *NebiusClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { // Track created resources for automatic cleanup on failure @@ -1307,11 +1323,10 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma return bestImage.Metadata.Id, nil } -// scoreImage assigns a priority score to an image. Higher score = better match. -// When requestedImage is empty (default deploy), the function uses a standard -// preference order: Ubuntu 24 CUDA 13 > Ubuntu 24 CUDA 12 > Ubuntu 22 CUDA 12 -// > Ubuntu 24 driverless > any Ubuntu > worker node > anything else. -// When requestedImage is non-empty, exact family/name matches get a bonus. +// scoreImage picks the best public image when Nebius returns a messy list. +// default order: ubuntu24+cuda13 > ubuntu24+cuda12 > ubuntu22+cuda12 > +// ubuntu24 > ubuntu22 > other ubuntu > worker-node > rest. +// request bonus layers on top if the caller asked for something specific. 
func scoreImage(image *compute.Image, requestedLower string) int { family := "" if image.Spec != nil { @@ -1319,44 +1334,52 @@ func scoreImage(image *compute.Image, requestedLower string) int { } name := strings.ToLower(image.Metadata.Name) - isWorkerNode := strings.Contains(family, "mk8s-worker") || strings.Contains(name, "worker-node") - isUbuntu := strings.Contains(family, "ubuntu") || strings.Contains(name, "ubuntu") - hasCuda13 := strings.Contains(family, "cuda13") || strings.Contains(name, "cuda13") - hasCuda12 := strings.Contains(family, "cuda12") || strings.Contains(name, "cuda12") - isUbuntu24 := strings.Contains(family, "ubuntu24") || strings.Contains(name, "ubuntu24") - isUbuntu22 := strings.Contains(family, "ubuntu22") || strings.Contains(name, "ubuntu22") - - score := 1 // baseline for any non-ARM64 image - - if isWorkerNode { - score = 10 - } else if isUbuntu { - score = 50 - if isUbuntu24 { - score = 60 - if hasCuda13 { - score = 100 - } else if hasCuda12 { - score = 90 - } - } else if isUbuntu22 { - score = 55 - if hasCuda12 { - score = 80 - } + return baseImageScore(name, family) + requestMatchBonus(name, family, requestedLower) +} + +// baseImageScore scores based on the image alone. worker-node is checked first +// so an ubuntu24-cuda12.8 worker image doesn't get classified as ubuntu24+cuda12. 
+func baseImageScore(name, family string) int { + has := func(s string) bool { return strings.Contains(name, s) || strings.Contains(family, s) } + + if has("mk8s-worker") || strings.Contains(name, "worker-node") { + return imageScoreWorkerNode + } + if !has("ubuntu") { + return imageScoreBaseline + } + if has("ubuntu24") { + if has("cuda13") { + return imageScoreUbuntu24Cuda13 } + if has("cuda12") { + return imageScoreUbuntu24Cuda12 + } + return imageScoreUbuntu24 } - - // If the caller explicitly requested something, boost images that match the request - if requestedLower != "" { - if strings.Contains(name, requestedLower) || strings.Contains(family, requestedLower) || requestedLower == family { - score += 200 - } else if isUbuntu && strings.Contains(requestedLower, "ubuntu") { - score += 50 + if has("ubuntu22") { + if has("cuda12") { + return imageScoreUbuntu22Cuda } + return imageScoreUbuntu22 } + return imageScoreUbuntuGeneric +} - return score +// requestMatchBonus: +200 if request is a substring of name/family (big enough +// to beat any baseline gap), +50 as a weak nudge when caller said "ubuntu" but +// nothing matched directly. 
+func requestMatchBonus(name, family, requestedLower string) int { + if requestedLower == "" { + return 0 + } + if strings.Contains(name, requestedLower) || strings.Contains(family, requestedLower) || requestedLower == family { + return imageScoreExactMatchBonus + } + if strings.Contains(requestedLower, "ubuntu") && (strings.Contains(name, "ubuntu") || strings.Contains(family, "ubuntu")) { + return imageScoreUbuntuHintBonus + } + return 0 } // getPublicImagesParent determines the correct public images parent ID based on project routing code diff --git a/v1/providers/nebius/instance_test.go b/v1/providers/nebius/instance_test.go index 389dea2..58ffe5b 100644 --- a/v1/providers/nebius/instance_test.go +++ b/v1/providers/nebius/instance_test.go @@ -6,6 +6,8 @@ import ( "time" v1 "github.com/brevdev/cloud/v1" + common "github.com/nebius/gosdk/proto/nebius/common/v1" + compute "github.com/nebius/gosdk/proto/nebius/compute/v1" "github.com/stretchr/testify/assert" ) @@ -417,3 +419,82 @@ func TestParseInstanceTypeFormat(t *testing.T) { }) } } + +func makeTestImage(name, family string) *compute.Image { + return &compute.Image{ + Metadata: &common.ResourceMetadata{Name: name}, + Spec: &compute.ImageSpec{ + ImageFamily: family, + CpuArchitecture: compute.ImageSpec_AMD64, + }, + } +} + +func TestBaseImageScore(t *testing.T) { + tests := []struct { + name, family string + want int + }{ + {"ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0", imageScoreUbuntu24Cuda13}, + {"ubuntu24.04-cuda12.8.0.0.12", "ubuntu24.04-cuda12", imageScoreUbuntu24Cuda12}, + {"ubuntu22.04-cuda12.3.0.0.5", "ubuntu22.04-cuda12", imageScoreUbuntu22Cuda}, + {"ubuntu24.04-driverless-20260401", "ubuntu24.04-driverless", imageScoreUbuntu24}, + {"ubuntu22.04-20260401", "ubuntu22.04", imageScoreUbuntu22}, + {"ubuntu20.04-20260401", "ubuntu20.04", imageScoreUbuntuGeneric}, + {"worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8", imageScoreWorkerNode}, + 
{"debian12-20260301", "debian12", imageScoreBaseline}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := baseImageScore(strings.ToLower(tc.name), strings.ToLower(tc.family)) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestRequestMatchBonus(t *testing.T) { + tests := []struct { + desc, name, family, requested string + want int + }{ + {"empty request, no bonus", "ubuntu24.04-cuda13.0", "ubuntu24.04-cuda13.0", "", 0}, + {"exact family match", "ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0", "ubuntu24.04-cuda13.0", imageScoreExactMatchBonus}, + {"substring match in name", "worker-node-v-1-33-ubuntu24.04-cuda12.8", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8", "worker-node", imageScoreExactMatchBonus}, + {"ubuntu hint without exact match", "ubuntu22.04-cuda12", "ubuntu22.04-cuda12", "ubuntu24.04", imageScoreUbuntuHintBonus}, + {"non-ubuntu request, non-matching image, no bonus", "debian12", "debian12", "ubuntu", 0}, + } + for _, tc := range tests { + t.Run(tc.desc, func(t *testing.T) { + got := requestMatchBonus(strings.ToLower(tc.name), strings.ToLower(tc.family), strings.ToLower(tc.requested)) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestScoreImage_prioritizesUbuntu24Cuda13OverWorkerNode(t *testing.T) { + ubuntu24Cuda13 := makeTestImage("ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0") + workerNode := makeTestImage("worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8") + + // Regression guard for BREV-8794 scenario: default deploy (empty request) + // must prefer ubuntu24-cuda13 over any mk8s worker-node image. + assert.Greater(t, scoreImage(ubuntu24Cuda13, ""), scoreImage(workerNode, "")) + + // Request that happens to contain 'ubuntu24.04' must still prefer + // ubuntu24-cuda13 over worker-node (worker image name contains 'ubuntu24.04' + // as a substring, but the baseline score gap keeps it below). 
+ assert.Greater(t, scoreImage(ubuntu24Cuda13, "ubuntu24.04"), scoreImage(workerNode, "ubuntu24.04")) +} + +func TestScoreImage_exactRequestForWorkerNodeWins(t *testing.T) { + ubuntu24Cuda13 := makeTestImage("ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0") + workerNode := makeTestImage("worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8") + + // If a caller explicitly asks for the worker-node family, it must win. + requested := "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8" + assert.Greater(t, scoreImage(workerNode, requested), scoreImage(ubuntu24Cuda13, requested)) +} + +func TestScoreImage_nilSpecUsesBaseline(t *testing.T) { + img := &compute.Image{Metadata: &common.ResourceMetadata{Name: "unknown-image"}} + assert.Equal(t, imageScoreBaseline, scoreImage(img, "")) +} From dceb545ff119223a934d414bf3d129f636306bae Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 17:38:49 +0530 Subject: [PATCH 5/9] fix(nebius): preseed iptables-persistent to prevent first-boot UFW flush --- v1/providers/nebius/instance.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 59537c5..178f8b4 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1830,12 +1830,18 @@ packages: // accessible from the internet by default. commands = append(commands, generateIPTablesCommands()...) - // Install iptables-persistent here (in runcmd, after UFW is configured) rather than - // in the packages: directive. Installing it as a package would start netfilter-persistent.service - // immediately at first boot, which races with ufw.service — netfilter-persistent flushes - // UFW's rules before UFW finishes loading them (Launchpad bug #1987227). By installing - // it here, the service only starts after UFW is already set up and the drop-in is in place. 
- commands = append(commands, "DEBIAN_FRONTEND=noninteractive apt-get install -y iptables-persistent") + // Install iptables-persistent in runcmd (not packages:) so netfilter-persistent.service + // doesn't race ufw.service on first boot (Launchpad #1987227). + // + // Preseed autosave_v4/v6 before installing. Without this, the postinst with + // DEBIAN_FRONTEND=noninteractive writes empty rules.v4/v6, and the service + // flushes the UFW + DOCKER-USER rules we just applied (Launchpad #1949643). + // With autosave=true, postinst snapshots the currently-applied iptables state. + commands = append(commands, + `echo iptables-persistent iptables-persistent/autosave_v4 boolean true | sudo debconf-set-selections`, + `echo iptables-persistent iptables-persistent/autosave_v6 boolean true | sudo debconf-set-selections`, + "sudo DEBIAN_FRONTEND=noninteractive apt-get install -y iptables-persistent", + ) // Save the complete iptables state (UFW chains + DOCKER-USER rules) so it // survives instance stop/start cycles. 
Cloud-init runcmd only executes on From 758e9339b0b71ec01e014fbea1767cd2fb38985e Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 19:59:51 +0530 Subject: [PATCH 6/9] chore(nebius): adding better logging --- v1/providers/nebius/instance.go | 114 ++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 5 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 178f8b4..9d2c5cf 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1198,6 +1198,12 @@ func (c *NebiusClient) createBootDisk(ctx context.Context, attrs v1.CreateInstan // buildDiskCreateRequest builds a disk creation request, trying image family first, then image ID func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName string, attrs v1.CreateInstanceAttrs) (*compute.CreateDiskRequest, error) { + c.logger.Info(ctx, "buildDiskCreateRequest: start", + v1.LogField("diskName", diskName), + v1.LogField("attrs.ImageID", attrs.ImageID), + v1.LogField("attrs.RefID", attrs.RefID), + v1.LogField("attrs.DiskSize", attrs.DiskSize)) + if attrs.DiskSize == 0 { attrs.DiskSize = 1280 * units.Gibibyte // Defaulted by the Nebius Console } @@ -1221,7 +1227,13 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri } // First, try to resolve and use image family - if imageFamily, err := c.resolveImageFamily(ctx, attrs.ImageID); err == nil { + imageFamily, resolveErr := c.resolveImageFamily(ctx, attrs.ImageID) + c.logger.Info(ctx, "buildDiskCreateRequest: resolveImageFamily result", + v1.LogField("attrs.ImageID", attrs.ImageID), + v1.LogField("resolvedFamily", imageFamily), + v1.LogField("err", fmt.Sprintf("%v", resolveErr))) + + if resolveErr == nil { publicImagesParent := c.getPublicImagesParent() // Skip validation for known-good common families to speed up instance start @@ -1233,8 +1245,14 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri 
break } } + c.logger.Info(ctx, "buildDiskCreateRequest: known-family check", + v1.LogField("imageFamily", imageFamily), + v1.LogField("isKnownFamily", isKnownFamily), + v1.LogField("publicImagesParent", publicImagesParent)) if isKnownFamily { + c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=known-family (skipping validation)", + v1.LogField("imageFamily", imageFamily)) // Use known family without validation baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ SourceImageFamily: &compute.SourceImageFamily{ @@ -1251,9 +1269,29 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri ParentId: publicImagesParent, ImageFamily: imageFamily, }) + latestName, latestID, latestArch := "", "", "" + if latestImage != nil { + if latestImage.Metadata != nil { + latestName = latestImage.Metadata.Name + latestID = latestImage.Metadata.Id + } + if latestImage.Spec != nil { + latestArch = latestImage.Spec.GetCpuArchitecture().String() + } + } + c.logger.Info(ctx, "buildDiskCreateRequest: GetLatestByFamily result", + v1.LogField("imageFamily", imageFamily), + v1.LogField("err", fmt.Sprintf("%v", err)), + v1.LogField("latestImageID", latestID), + v1.LogField("latestImageName", latestName), + v1.LogField("latestImageArch", latestArch)) + if err == nil { isARM64 := latestImage.Spec != nil && latestImage.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 if !isARM64 { + c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=validated-family (non-ARM64)", + v1.LogField("imageFamily", imageFamily), + v1.LogField("latestImageID", latestID)) baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ SourceImageFamily: &compute.SourceImageFamily{ ImageFamily: imageFamily, @@ -1263,12 +1301,20 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri baseReq.Metadata.Labels["image-family"] = imageFamily return baseReq, nil } + c.logger.Info(ctx, "buildDiskCreateRequest: validated-family is ARM64, falling through to scoring", + 
v1.LogField("imageFamily", imageFamily)) // ARM64 family — fall through to getWorkingPublicImageID which filters by architecture } } // Family approach failed, try to use a known working public image ID + c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=scoring (falling back to getWorkingPublicImageID)", + v1.LogField("attrs.ImageID", attrs.ImageID)) publicImageID, err := c.getWorkingPublicImageID(ctx, attrs.ImageID) + c.logger.Info(ctx, "buildDiskCreateRequest: getWorkingPublicImageID result", + v1.LogField("publicImageID", publicImageID), + v1.LogField("err", fmt.Sprintf("%v", err))) + if err == nil { baseReq.Spec.Source = &compute.DiskSpec_SourceImageId{ SourceImageId: publicImageID, @@ -1285,14 +1331,24 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri // It scores every non-ARM64 image and returns the highest-scored one, this is done to handle change in ordering of images from nebius api. func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedImage string) (string, error) { publicImagesParent := c.getPublicImagesParent() + c.logger.Info(ctx, "getWorkingPublicImageID: listing images", + v1.LogField("requestedImage", requestedImage), + v1.LogField("publicImagesParent", publicImagesParent)) + imagesResp, err := c.sdk.Services().Compute().V1().Image().List(ctx, &compute.ListImagesRequest{ ParentId: publicImagesParent, }) if err != nil { + c.logger.Error(ctx, fmt.Errorf("failed to list public images: %w", err), + v1.LogField("publicImagesParent", publicImagesParent)) return "", fmt.Errorf("failed to list public images: %w", err) } - if len(imagesResp.GetItems()) == 0 { + totalCount := len(imagesResp.GetItems()) + c.logger.Info(ctx, "getWorkingPublicImageID: list returned", + v1.LogField("totalImages", totalCount)) + + if totalCount == 0 { return "", fmt.Errorf("no public images available") } @@ -1300,26 +1356,56 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma var bestImage 
*compute.Image bestScore := -1 + consideredCount, arm64Skipped, nilMetadataSkipped := 0, 0, 0 for _, image := range imagesResp.GetItems() { if image.Metadata == nil { + nilMetadataSkipped++ continue } if image.Spec != nil && image.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { + arm64Skipped++ continue } + consideredCount++ score := scoreImage(image, requestedLower) + family := "" + if image.Spec != nil { + family = image.Spec.GetImageFamily() + } + c.logger.Info(ctx, "getWorkingPublicImageID: scored", + v1.LogField("id", image.Metadata.Id), + v1.LogField("name", image.Metadata.Name), + v1.LogField("family", family), + v1.LogField("score", score)) + if score > bestScore { bestScore = score bestImage = image } } + c.logger.Info(ctx, "getWorkingPublicImageID: scoring summary", + v1.LogField("consideredCount", consideredCount), + v1.LogField("arm64Skipped", arm64Skipped), + v1.LogField("nilMetadataSkipped", nilMetadataSkipped), + v1.LogField("bestScore", bestScore)) + if bestImage == nil { return "", fmt.Errorf("no suitable public image found") } + winnerFamily := "" + if bestImage.Spec != nil { + winnerFamily = bestImage.Spec.GetImageFamily() + } + c.logger.Info(ctx, "getWorkingPublicImageID: winner", + v1.LogField("id", bestImage.Metadata.Id), + v1.LogField("name", bestImage.Metadata.Name), + v1.LogField("family", winnerFamily), + v1.LogField("score", bestScore)) + return bestImage.Metadata.Id, nil } @@ -1625,6 +1711,10 @@ func (c *NebiusClient) parseInstanceType(ctx context.Context, instanceTypeID str // //nolint:gocyclo,unparam // Complex image family resolution with fallback logic func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) (string, error) { + c.logger.Info(ctx, "resolveImageFamily: start", + v1.LogField("imageID", imageID), + v1.LogField("imageIDLen", len(imageID))) + // Common Nebius image families - if ImageID matches one of these, use it directly commonFamilies := []string{ "ubuntu24.04-cuda13.0", @@ -1641,6 +1731,8 
@@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( // Check if ImageID is already a known family name for _, family := range commonFamilies { if imageID == family { + c.logger.Info(ctx, "resolveImageFamily: matched commonFamilies", + v1.LogField("family", family)) return family, nil } } @@ -1648,7 +1740,8 @@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( // If ImageID looks like a family name pattern (contains dots, dashes, no UUIDs) // and doesn't look like a UUID, assume it's a family name if !strings.Contains(imageID, "-") || len(imageID) < 32 { - // Likely a family name, use it directly + c.logger.Info(ctx, "resolveImageFamily: treating as family (short/no-dash)", + v1.LogField("returnValue", imageID)) return imageID, nil } @@ -1657,17 +1750,22 @@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( Id: imageID, }) if err != nil { - // If we can't get the image, try using the ID as a family name anyway - // This allows for custom family names that don't match our patterns + c.logger.Info(ctx, "resolveImageFamily: Get failed, returning imageID as family", + v1.LogField("imageID", imageID), + v1.LogField("err", fmt.Sprintf("%v", err))) return imageID, nil } // Extract family from image metadata/labels if available if image.Metadata != nil && image.Metadata.Labels != nil { if family, exists := image.Metadata.Labels["family"]; exists && family != "" { + c.logger.Info(ctx, "resolveImageFamily: resolved via labels[family]", + v1.LogField("family", family)) return family, nil } if family, exists := image.Metadata.Labels["image-family"]; exists && family != "" { + c.logger.Info(ctx, "resolveImageFamily: resolved via labels[image-family]", + v1.LogField("family", family)) return family, nil } } @@ -1677,15 +1775,21 @@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( // Try to extract a reasonable family name from the image name name := 
strings.ToLower(image.Metadata.Name) if strings.Contains(name, "ubuntu22") || strings.Contains(name, "ubuntu-22") { + c.logger.Info(ctx, "resolveImageFamily: inferred ubuntu22 from name", + v1.LogField("name", image.Metadata.Name)) return "ubuntu22.04", nil } if strings.Contains(name, "ubuntu20") || strings.Contains(name, "ubuntu-20") { + c.logger.Info(ctx, "resolveImageFamily: inferred ubuntu20 from name", + v1.LogField("name", image.Metadata.Name)) return "ubuntu20.04", nil } } // Default fallback - use the original ImageID as family // This handles cases where users provide custom family names + c.logger.Info(ctx, "resolveImageFamily: default fallback, returning imageID as family", + v1.LogField("imageID", imageID)) return imageID, nil } From 4073617bd1a2155b42b961f63a87d41b2bebe7aa Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 20:35:19 +0530 Subject: [PATCH 7/9] fix(nebius): pagination in get images call --- v1/providers/nebius/instance.go | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 9d2c5cf..83e2913 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1335,37 +1335,33 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma v1.LogField("requestedImage", requestedImage), v1.LogField("publicImagesParent", publicImagesParent)) - imagesResp, err := c.sdk.Services().Compute().V1().Image().List(ctx, &compute.ListImagesRequest{ - ParentId: publicImagesParent, - }) - if err != nil { - c.logger.Error(ctx, fmt.Errorf("failed to list public images: %w", err), - v1.LogField("publicImagesParent", publicImagesParent)) - return "", fmt.Errorf("failed to list public images: %w", err) - } - - totalCount := len(imagesResp.GetItems()) - c.logger.Info(ctx, "getWorkingPublicImageID: list returned", - v1.LogField("totalImages", totalCount)) - - if totalCount == 0 { - return 
"", fmt.Errorf("no public images available") - } - requestedLower := strings.ToLower(requestedImage) var bestImage *compute.Image bestScore := -1 - consideredCount, arm64Skipped, nilMetadataSkipped := 0, 0, 0 + totalCount, consideredCount, arm64Skipped, nilMetadataSkipped := 0, 0, 0, 0 + var iterErr error + + // Filter auto-paginates via the SDK. Using List directly only returns the first + // page (small default size), which can omit ubuntu24.04-cuda13.0 entirely. + imageIter := c.sdk.Services().Compute().V1().Image().Filter(ctx, &compute.ListImagesRequest{ + ParentId: publicImagesParent, + PageSize: 1000, + }) + imageIter(func(image *compute.Image, err error) bool { + if err != nil { + iterErr = err + return false + } + totalCount++ - for _, image := range imagesResp.GetItems() { if image.Metadata == nil { nilMetadataSkipped++ - continue + return true } if image.Spec != nil && image.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { arm64Skipped++ - continue + return true } consideredCount++ @@ -1384,9 +1380,17 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma bestScore = score bestImage = image } + return true + }) + + if iterErr != nil { + c.logger.Error(ctx, fmt.Errorf("failed to iterate public images: %w", iterErr), + v1.LogField("publicImagesParent", publicImagesParent)) + return "", fmt.Errorf("failed to iterate public images: %w", iterErr) } c.logger.Info(ctx, "getWorkingPublicImageID: scoring summary", + v1.LogField("totalImages", totalCount), v1.LogField("consideredCount", consideredCount), v1.LogField("arm64Skipped", arm64Skipped), v1.LogField("nilMetadataSkipped", nilMetadataSkipped), From 1f66f107ccbbfcc25e56dde6628d48cd7861cb28 Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 20:37:03 +0530 Subject: [PATCH 8/9] fix(nebius): pagination size in get images call to less than limit --- v1/providers/nebius/instance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 83e2913..1a81740 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1346,7 +1346,7 @@ func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedIma // page (small default size), which can omit ubuntu24.04-cuda13.0 entirely. imageIter := c.sdk.Services().Compute().V1().Image().Filter(ctx, &compute.ListImagesRequest{ ParentId: publicImagesParent, - PageSize: 1000, + PageSize: 988, }) imageIter(func(image *compute.Image, err error) bool { if err != nil { From c9b799376f2012ce8ab91bc32174495da508e00e Mon Sep 17 00:00:00 2001 From: harshsharma Date: Fri, 17 Apr 2026 21:24:27 +0530 Subject: [PATCH 9/9] fix(nebius): updating ufw rules --- v1/providers/nebius/instance.go | 167 ++++++++++++++++---------------- 1 file changed, 84 insertions(+), 83 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 1a81740..fa5dacc 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1227,84 +1227,8 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri } // First, try to resolve and use image family - imageFamily, resolveErr := c.resolveImageFamily(ctx, attrs.ImageID) - c.logger.Info(ctx, "buildDiskCreateRequest: resolveImageFamily result", - v1.LogField("attrs.ImageID", attrs.ImageID), - v1.LogField("resolvedFamily", imageFamily), - v1.LogField("err", fmt.Sprintf("%v", resolveErr))) - - if resolveErr == nil { - publicImagesParent := c.getPublicImagesParent() - - // Skip validation for known-good common families to speed up instance start - knownFamilies := []string{"ubuntu24.04-cuda13.0", "ubuntu24.04-cuda12", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} - isKnownFamily := false - for _, known := range knownFamilies { - if imageFamily == known { - isKnownFamily = true - break - } - } - 
c.logger.Info(ctx, "buildDiskCreateRequest: known-family check", - v1.LogField("imageFamily", imageFamily), - v1.LogField("isKnownFamily", isKnownFamily), - v1.LogField("publicImagesParent", publicImagesParent)) - - if isKnownFamily { - c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=known-family (skipping validation)", - v1.LogField("imageFamily", imageFamily)) - // Use known family without validation - baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ - SourceImageFamily: &compute.SourceImageFamily{ - ImageFamily: imageFamily, - ParentId: publicImagesParent, - }, - } - baseReq.Metadata.Labels["image-family"] = imageFamily - return baseReq, nil - } - - // For unknown families, validate first and check architecture - latestImage, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ - ParentId: publicImagesParent, - ImageFamily: imageFamily, - }) - latestName, latestID, latestArch := "", "", "" - if latestImage != nil { - if latestImage.Metadata != nil { - latestName = latestImage.Metadata.Name - latestID = latestImage.Metadata.Id - } - if latestImage.Spec != nil { - latestArch = latestImage.Spec.GetCpuArchitecture().String() - } - } - c.logger.Info(ctx, "buildDiskCreateRequest: GetLatestByFamily result", - v1.LogField("imageFamily", imageFamily), - v1.LogField("err", fmt.Sprintf("%v", err)), - v1.LogField("latestImageID", latestID), - v1.LogField("latestImageName", latestName), - v1.LogField("latestImageArch", latestArch)) - - if err == nil { - isARM64 := latestImage.Spec != nil && latestImage.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 - if !isARM64 { - c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=validated-family (non-ARM64)", - v1.LogField("imageFamily", imageFamily), - v1.LogField("latestImageID", latestID)) - baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ - SourceImageFamily: &compute.SourceImageFamily{ - ImageFamily: imageFamily, - ParentId: publicImagesParent, - 
}, - } - baseReq.Metadata.Labels["image-family"] = imageFamily - return baseReq, nil - } - c.logger.Info(ctx, "buildDiskCreateRequest: validated-family is ARM64, falling through to scoring", - v1.LogField("imageFamily", imageFamily)) - // ARM64 family — fall through to getWorkingPublicImageID which filters by architecture - } + if c.tryApplyImageFamilySource(ctx, baseReq, attrs.ImageID) { + return baseReq, nil } // Family approach failed, try to use a known working public image ID @@ -1327,6 +1251,87 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri return nil, fmt.Errorf("could not resolve image %s to either a working family or image ID: %w", attrs.ImageID, err) } +// tryApplyImageFamilySource attempts to set baseReq's disk source via image-family lookup. +// Returns true if a family-based source was applied (caller should return baseReq). +// Returns false if the caller should fall back to scoring (getWorkingPublicImageID). +func (c *NebiusClient) tryApplyImageFamilySource(ctx context.Context, baseReq *compute.CreateDiskRequest, imageID string) bool { + imageFamily, resolveErr := c.resolveImageFamily(ctx, imageID) + c.logger.Info(ctx, "buildDiskCreateRequest: resolveImageFamily result", + v1.LogField("attrs.ImageID", imageID), + v1.LogField("resolvedFamily", imageFamily), + v1.LogField("err", fmt.Sprintf("%v", resolveErr))) + if resolveErr != nil { + return false + } + + publicImagesParent := c.getPublicImagesParent() + knownFamilies := []string{"ubuntu24.04-cuda13.0", "ubuntu24.04-cuda12", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} + isKnownFamily := false + for _, known := range knownFamilies { + if imageFamily == known { + isKnownFamily = true + break + } + } + c.logger.Info(ctx, "buildDiskCreateRequest: known-family check", + v1.LogField("imageFamily", imageFamily), + v1.LogField("isKnownFamily", isKnownFamily), + v1.LogField("publicImagesParent", 
publicImagesParent)) + + if isKnownFamily { + c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=known-family (skipping validation)", + v1.LogField("imageFamily", imageFamily)) + applyImageFamilySource(baseReq, imageFamily, publicImagesParent) + return true + } + + latestImage, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ + ParentId: publicImagesParent, + ImageFamily: imageFamily, + }) + latestName, latestID, latestArch := "", "", "" + if latestImage != nil { + if latestImage.Metadata != nil { + latestName = latestImage.Metadata.Name + latestID = latestImage.Metadata.Id + } + if latestImage.Spec != nil { + latestArch = latestImage.Spec.GetCpuArchitecture().String() + } + } + c.logger.Info(ctx, "buildDiskCreateRequest: GetLatestByFamily result", + v1.LogField("imageFamily", imageFamily), + v1.LogField("err", fmt.Sprintf("%v", err)), + v1.LogField("latestImageID", latestID), + v1.LogField("latestImageName", latestName), + v1.LogField("latestImageArch", latestArch)) + if err != nil { + return false + } + + if latestImage.Spec != nil && latestImage.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { + c.logger.Info(ctx, "buildDiskCreateRequest: validated-family is ARM64, falling through to scoring", + v1.LogField("imageFamily", imageFamily)) + return false + } + + c.logger.Info(ctx, "buildDiskCreateRequest: BRANCH=validated-family (non-ARM64)", + v1.LogField("imageFamily", imageFamily), + v1.LogField("latestImageID", latestID)) + applyImageFamilySource(baseReq, imageFamily, publicImagesParent) + return true +} + +func applyImageFamilySource(baseReq *compute.CreateDiskRequest, imageFamily, publicImagesParent string) { + baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ + SourceImageFamily: &compute.SourceImageFamily{ + ImageFamily: imageFamily, + ParentId: publicImagesParent, + }, + } + baseReq.Metadata.Labels["image-family"] = imageFamily +} + // getWorkingPublicImageID gets a working public 
image ID based on the requested image type. // It scores every non-ARM64 image and returns the highest-scored one, this is done to handle change in ordering of images from nebius api. func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedImage string) (string, error) { @@ -1945,11 +1950,7 @@ packages: // DEBIAN_FRONTEND=noninteractive writes empty rules.v4/v6, and the service // flushes the UFW + DOCKER-USER rules we just applied (Launchpad #1949643). // With autosave=true, postinst snapshots the currently-applied iptables state. - commands = append(commands, - `echo iptables-persistent iptables-persistent/autosave_v4 boolean true | sudo debconf-set-selections`, - `echo iptables-persistent iptables-persistent/autosave_v6 boolean true | sudo debconf-set-selections`, - "sudo DEBIAN_FRONTEND=noninteractive apt-get install -y iptables-persistent", - ) + // iptables-persistent is no longer installed here. // Save the complete iptables state (UFW chains + DOCKER-USER rules) so it // survives instance stop/start cycles. Cloud-init runcmd only executes on