From 408c88eb414c82e9c9761f93941ac5efe5593844 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 2 Jul 2026 15:23:29 -0700 Subject: [PATCH] feat(gpu): recognize aks-gpu-cuda in LoadConfig alongside aks-gpu-cuda-lts #8811 moved the managed CUDA driver from aks-gpu-cuda to aks-gpu-cuda-lts, reusing the NvidiaCudaDriverVersion / AKSGPUCudaVersionSuffix globals for the LTS image. This restores a first-class `case "aks-gpu-cuda"` in LoadConfig so the pre-LTS image's version is loaded and available if a SKU is ever routed to the "cuda" image in CSE again -- without disturbing today's render. - components.json: add an aks-gpu-cuda entry pinned to the R580 line (580.126.09), NOT the R595 line that drops Volta/V100. - gpu_components.go: aks-gpu-cuda reclaims NvidiaCudaDriverVersion / AKSGPUCudaVersionSuffix (its pre-#8811 names); aks-gpu-cuda-lts moves to NvidiaCudaLTSDriverVersion / AKSGPUCudaLTSVersionSuffix. Mirrors the existing base-vs-variant naming (NvidiaGridDriverVersion vs NvidiaGridV20DriverVersion) and avoids clobbering a shared global. - baker.go: GetGPUDriverVersion / GetAKSGPUImageSHA render the LTS globals for modern CUDA SKUs, so rendered output is byte-identical (verified: zero testdata drift). aks-gpu-cuda is loaded but not the default render target. - renovate.json: constrain aks-gpu-cuda to /^580\./ so it never bumps to R595. Still not baked into the VHD (install-dependencies.sh only pre-pulls aks-gpu-cuda-lts). Old-VHD / skewed nodes that target aks-gpu-cuda resolve it at boot via the hardened pull (#8821), served by required-MCR or the wildcard network-isolated ACR cache. 
Signed-off-by: Ganeshkumar Ashokavardhanan --- .github/renovate.json | 11 +++++++++++ parts/common/components.json | 7 +++++++ pkg/agent/baker.go | 4 ++-- pkg/agent/baker_test.go | 8 ++++---- pkg/agent/datamodel/gpu_components.go | 8 ++++++++ pkg/agent/datamodel/gpu_components_test.go | 10 ++++++++++ 6 files changed, 42 insertions(+), 6 deletions(-) diff --git a/.github/renovate.json b/.github/renovate.json index a877abf4655..d01bf358c8d 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -580,6 +580,17 @@ "enabled": true, "ignoreUnstable": false }, + { + "matchPackageNames": [ + "aks/aks-gpu-cuda" + ], + "groupName": "nvidia-gpu-cuda-legacy", + "versioning": "regex:^(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)-(?<prerelease>\\d{14})$", + "allowedVersions": "/^580\\./", + "automerge": false, + "enabled": true, + "ignoreUnstable": false + }, { "matchPackageNames": [ "aks/aks-gpu-grid" diff --git a/parts/common/components.json b/parts/common/components.json index e6cd79b0fb5..cd4fd26741b 100644 --- a/parts/common/components.json +++ b/parts/common/components.json @@ -742,6 +742,13 @@ "latestVersion": "580.159.04-20260629214430" } }, + { + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", + "gpuVersion": { + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", + "latestVersion": "580.126.09-20260430040408" + } + }, { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid:*", "gpuVersion": { diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 2e8b4d6e101..b7566d7edbb 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1497,7 +1497,7 @@ func GetGPUDriverVersion(size string) string { if isStandardNCv1(size) { return datamodel.Nvidia470CudaDriverVersion } - return datamodel.NvidiaCudaDriverVersion + return datamodel.NvidiaCudaLTSDriverVersion } func isStandardNCv1(size string) bool { @@ -1522,7 +1522,7 @@ func GetAKSGPUImageSHA(size string) string { if useGridDrivers(size) { return datamodel.AKSGPUGridVersionSuffix } - return 
datamodel.AKSGPUCudaVersionSuffix + return datamodel.AKSGPUCudaLTSVersionSuffix } // GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver. diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 5478db71632..779c22c10e7 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -943,7 +943,7 @@ var _ = Describe("GetGPUDriverVersion", func() { Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion)) }) It("should use cuda with nc v3", func() { - Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaLTSDriverVersion)) }) It("should use grid with nv v5", func() { Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.NvidiaGridDriverVersion)) @@ -958,7 +958,7 @@ var _ = Describe("GetGPUDriverVersion", func() { }) // NV V1 SKUs were retired in September 2023, leaving this test just for safety It("should use cuda with nv v1", func() { - Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaLTSDriverVersion)) }) }) @@ -995,8 +995,8 @@ var _ = Describe("GetAKSGPUImageSHA", func() { Expect(GetAKSGPUImageSHA("standard_nc128ds_xl_rtxpro6000bse_v6")).To(Equal(datamodel.AKSGPUGridV20VersionSuffix)) Expect(GetAKSGPUImageSHA("standard_nc128lds_xl_rtxpro6000bse_v6")).To(Equal(datamodel.AKSGPUGridV20VersionSuffix)) }) - It("should use newest AKSGPUCudaVersionSuffix with non grid SKU", func() { - Expect(GetAKSGPUImageSHA("standard_nc6_v3")).To(Equal(datamodel.AKSGPUCudaVersionSuffix)) + It("should use newest AKSGPUCudaLTSVersionSuffix with non grid SKU", func() { + Expect(GetAKSGPUImageSHA("standard_nc6_v3")).To(Equal(datamodel.AKSGPUCudaLTSVersionSuffix)) }) }) diff --git a/pkg/agent/datamodel/gpu_components.go 
b/pkg/agent/datamodel/gpu_components.go index e2c8b33090d..2708f077296 100644 --- a/pkg/agent/datamodel/gpu_components.go +++ b/pkg/agent/datamodel/gpu_components.go @@ -13,9 +13,11 @@ const Nvidia470CudaDriverVersion = "cuda-470.82.01" //nolint:gochecknoglobals var ( NvidiaCudaDriverVersion string + NvidiaCudaLTSDriverVersion string NvidiaGridDriverVersion string NvidiaGridV20DriverVersion string AKSGPUCudaVersionSuffix string + AKSGPUCudaLTSVersionSuffix string AKSGPUGridVersionSuffix string AKSGPUGridV20VersionSuffix string ) @@ -63,6 +65,12 @@ func LoadConfig() error { // confused by substring matching. switch gpuImageRepo(image.DownloadURL) { case "aks-gpu-cuda-lts": + NvidiaCudaLTSDriverVersion = version + AKSGPUCudaLTSVersionSuffix = suffix + case "aks-gpu-cuda": + // Pre-LTS CUDA image, pinned to the R580 line (V100-capable). Loaded so its version is + // available if a SKU is ever routed to the "cuda" image in CSE; the default managed CUDA + // image is aks-gpu-cuda-lts (see GetGPUDriverType / GetGPUDriverVersion in baker.go). NvidiaCudaDriverVersion = version AKSGPUCudaVersionSuffix = suffix case "aks-gpu-grid": diff --git a/pkg/agent/datamodel/gpu_components_test.go b/pkg/agent/datamodel/gpu_components_test.go index 86e55937f73..9dc78b327fb 100644 --- a/pkg/agent/datamodel/gpu_components_test.go +++ b/pkg/agent/datamodel/gpu_components_test.go @@ -63,6 +63,16 @@ func TestLoadConfig(t *testing.T) { if !suffixPattern.MatchString(AKSGPUGridV20VersionSuffix) { t.Errorf("AKSGPUGridV20VersionSuffix '%s' does not match expected format", AKSGPUGridV20VersionSuffix) } + + // aks-gpu-cuda-lts drives the render, so its version/suffix must be loaded. aks-gpu-cuda + // (NvidiaCudaDriverVersion / AKSGPUCudaVersionSuffix, checked above) is the recognized pre-LTS + // image, available if a SKU is routed to the "cuda" image in CSE later. 
+ if !versionPattern.MatchString(NvidiaCudaLTSDriverVersion) { + t.Errorf("NvidiaCudaLTSDriverVersion '%s' does not match expected format", NvidiaCudaLTSDriverVersion) + } + if !suffixPattern.MatchString(AKSGPUCudaLTSVersionSuffix) { + t.Errorf("AKSGPUCudaLTSVersionSuffix '%s' does not match expected format", AKSGPUCudaLTSVersionSuffix) + } } // TestGPUImageRepo verifies that the bare repo name is extracted via exact final