diff --git a/.github/renovate.json b/.github/renovate.json index a877abf4655..d01bf358c8d 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -580,6 +580,17 @@ "enabled": true, "ignoreUnstable": false }, + { + "matchPackageNames": [ + "aks/aks-gpu-cuda" + ], + "groupName": "nvidia-gpu-cuda-legacy", + "versioning": "regex:^(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)-(?<prerelease>\\d{14})$", + "allowedVersions": "/^580\\./", + "automerge": false, + "enabled": true, + "ignoreUnstable": false + }, { "matchPackageNames": [ "aks/aks-gpu-grid" diff --git a/parts/common/components.json b/parts/common/components.json index e6cd79b0fb5..cd4fd26741b 100644 --- a/parts/common/components.json +++ b/parts/common/components.json @@ -742,6 +742,13 @@ "latestVersion": "580.159.04-20260629214430" } }, + { + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", + "gpuVersion": { + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", + "latestVersion": "580.126.09-20260430040408" + } + }, { "downloadURL": "mcr.microsoft.com/aks/aks-gpu-grid:*", "gpuVersion": { diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 2e8b4d6e101..b7566d7edbb 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1497,7 +1497,7 @@ func GetGPUDriverVersion(size string) string { if isStandardNCv1(size) { return datamodel.Nvidia470CudaDriverVersion } - return datamodel.NvidiaCudaDriverVersion + return datamodel.NvidiaCudaLTSDriverVersion } func isStandardNCv1(size string) bool { @@ -1522,7 +1522,7 @@ func GetAKSGPUImageSHA(size string) string { if useGridDrivers(size) { return datamodel.AKSGPUGridVersionSuffix } - return datamodel.AKSGPUCudaVersionSuffix + return datamodel.AKSGPUCudaLTSVersionSuffix } // GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver. 
diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 5478db71632..779c22c10e7 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -943,7 +943,7 @@ var _ = Describe("GetGPUDriverVersion", func() { Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion)) }) It("should use cuda with nc v3", func() { - Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaLTSDriverVersion)) }) It("should use grid with nv v5", func() { Expect(GetGPUDriverVersion("standard_nv6ads_a10_v5")).To(Equal(datamodel.NvidiaGridDriverVersion)) @@ -958,7 +958,7 @@ var _ = Describe("GetGPUDriverVersion", func() { }) // NV V1 SKUs were retired in September 2023, leaving this test just for safety It("should use cuda with nv v1", func() { - Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaDriverVersion)) + Expect(GetGPUDriverVersion("standard_nv6")).To(Equal(datamodel.NvidiaCudaLTSDriverVersion)) }) }) @@ -995,8 +995,8 @@ var _ = Describe("GetAKSGPUImageSHA", func() { Expect(GetAKSGPUImageSHA("standard_nc128ds_xl_rtxpro6000bse_v6")).To(Equal(datamodel.AKSGPUGridV20VersionSuffix)) Expect(GetAKSGPUImageSHA("standard_nc128lds_xl_rtxpro6000bse_v6")).To(Equal(datamodel.AKSGPUGridV20VersionSuffix)) }) - It("should use newest AKSGPUCudaVersionSuffix with non grid SKU", func() { - Expect(GetAKSGPUImageSHA("standard_nc6_v3")).To(Equal(datamodel.AKSGPUCudaVersionSuffix)) + It("should use newest AKSGPUCudaLTSVersionSuffix with non grid SKU", func() { + Expect(GetAKSGPUImageSHA("standard_nc6_v3")).To(Equal(datamodel.AKSGPUCudaLTSVersionSuffix)) }) }) diff --git a/pkg/agent/datamodel/gpu_components.go b/pkg/agent/datamodel/gpu_components.go index e2c8b33090d..2708f077296 100644 --- a/pkg/agent/datamodel/gpu_components.go +++ b/pkg/agent/datamodel/gpu_components.go @@ -13,9 +13,11 @@ const 
Nvidia470CudaDriverVersion = "cuda-470.82.01" //nolint:gochecknoglobals var ( NvidiaCudaDriverVersion string + NvidiaCudaLTSDriverVersion string NvidiaGridDriverVersion string NvidiaGridV20DriverVersion string AKSGPUCudaVersionSuffix string + AKSGPUCudaLTSVersionSuffix string AKSGPUGridVersionSuffix string AKSGPUGridV20VersionSuffix string ) @@ -63,6 +65,12 @@ func LoadConfig() error { // confused by substring matching. switch gpuImageRepo(image.DownloadURL) { case "aks-gpu-cuda-lts": + NvidiaCudaLTSDriverVersion = version + AKSGPUCudaLTSVersionSuffix = suffix + case "aks-gpu-cuda": + // Pre-LTS CUDA image, pinned to the R580 line (V100-capable). Loaded so its version is + // available if a SKU is ever routed to the "cuda" image in CSE; the default managed CUDA + // image is aks-gpu-cuda-lts (see GetGPUDriverType / GetGPUDriverVersion in baker.go). NvidiaCudaDriverVersion = version AKSGPUCudaVersionSuffix = suffix case "aks-gpu-grid": diff --git a/pkg/agent/datamodel/gpu_components_test.go b/pkg/agent/datamodel/gpu_components_test.go index 86e55937f73..9dc78b327fb 100644 --- a/pkg/agent/datamodel/gpu_components_test.go +++ b/pkg/agent/datamodel/gpu_components_test.go @@ -63,6 +63,16 @@ func TestLoadConfig(t *testing.T) { if !suffixPattern.MatchString(AKSGPUGridV20VersionSuffix) { t.Errorf("AKSGPUGridV20VersionSuffix '%s' does not match expected format", AKSGPUGridV20VersionSuffix) } + + // aks-gpu-cuda-lts drives the render, so its version/suffix must be loaded. aks-gpu-cuda + // (NvidiaCudaDriverVersion / AKSGPUCudaVersionSuffix, checked above) is the recognized pre-LTS + // image, available if a SKU is routed to the "cuda" image in CSE later. 
+ if !versionPattern.MatchString(NvidiaCudaLTSDriverVersion) { + t.Errorf("NvidiaCudaLTSDriverVersion '%s' does not match expected format", NvidiaCudaLTSDriverVersion) + } + if !suffixPattern.MatchString(AKSGPUCudaLTSVersionSuffix) { + t.Errorf("AKSGPUCudaLTSVersionSuffix '%s' does not match expected format", AKSGPUCudaLTSVersionSuffix) + } } // TestGPUImageRepo verifies that the bare repo name is extracted via exact final