Azure · ganeshkumarashok · Jun 5, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 29, 2026
@@ -1211,20 +1211,35 @@ pullGPUDriverImage() {
 }
 
 installGPUDriverImage() {
-    retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install"
+    local gpuInstallAction="${1:-install}"  # action handed to the image's /entrypoint.sh; "install" when $1 is unset or empty
+    retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh ${gpuInstallAction}"
 }
 
 configGPUDrivers() {
     if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
         waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
         mkdir -p /opt/{actions,gpu}
+        # When the kernel module was pre-built into the VHD (build-only at image-bake time),
+        # a marker is present. Ask aks-gpu to skip the ~100s DKMS recompile and run only the
+        # device-dependent steps -- but ONLY when the marker's driver_kind matches THIS node's
+        # driver (NVIDIA_GPU_DRIVER_TYPE). A CUDA-prebaked marker on a GRID node (or vice-versa)
+        # must request a full "install": the other driver image may not even support
+        # install-skip-build and would fail to stage its userspace files (e.g. /opt/gpu/config.sh).
+        # aks-gpu still independently re-validates the marker (kernel + driver_version + driver_kind) and
+        # falls back to a full build on mismatch. An empty NVIDIA_GPU_DRIVER_TYPE never selects skip-build.
+        local GPU_INSTALL_ACTION="install"
+        local GPU_DKMS_MARKER="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
+        if [ -f "$GPU_DKMS_MARKER" ] && [ -n "$NVIDIA_GPU_DRIVER_TYPE" ] && \
+           [ "$(sed -n 's/^driver_kind=//p' "$GPU_DKMS_MARKER" | head -n1)" = "$NVIDIA_GPU_DRIVER_TYPE" ]; then
+            GPU_INSTALL_ACTION="install-skip-build"
+        fi
         # The driver image is normally pre-pulled into the VHD; only hit the registry when it is
         # actually missing so provisioning doesn't pay a redundant manifest/layer round trip.
         # Use containerd's native exact-name filter rather than text-matching `images ls` output.
         if [ -z "$(ctr -n k8s.io images ls -q "name==${NVIDIA_DRIVER_IMAGE}:${NVIDIA_DRIVER_IMAGE_TAG}")" ]; then
             logs_to_events "AKS.CSE.configGPUDrivers.pullGPUDriverImage" pullGPUDriverImage
         fi
-        logs_to_events "AKS.CSE.configGPUDrivers.installGPUDriverImage" installGPUDriverImage
+        logs_to_events "AKS.CSE.configGPUDrivers.installGPUDriverImage" installGPUDriverImage "$GPU_INSTALL_ACTION"
         ret=$?
         if [ "$ret" -ne 0 ]; then
             echo "Failed to install GPU driver, exiting..."

@@ -223,12 +223,58 @@ removeNvidiaRepos() {
     fi
 }
 
+# cleanUpPrebakedGPUDriver tears down a CUDA driver that was pre-baked into a shared Ubuntu VHD
+# when this node is NOT a GPU node. On a non-GPU node the installed driver is dead weight: it
+# wastes disk and, because it stays DKMS-registered, forces an nvidia.ko rebuild on every kernel
+# patch. The nvidia module is never loaded on a non-GPU node, so deregistration cannot hit
+# "module in use". It is a no-op unless the aks-gpu prebake marker is present, so today's
+# non-prebaked VHDs are completely unaffected. Idempotent and safe to re-run.
+# NOTE: this runs synchronously on the (non-GPU) provisioning path; the teardown is light
+# (deregister + rm + ldconfig, no initramfs rebuild). To move it fully off the critical path it
+# can be launched via a transient systemd unit (systemd-run --no-block) instead -- see the PR
+# description for that variant and its trade-offs. Always returns 0 (removals are best-effort).
+cleanUpPrebakedGPUDriver() {
+    local nvidiaBin marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
+    if [ ! -f "${marker}" ]; then
+        return 0
+    fi
+    echo "Removing pre-baked NVIDIA driver inherited from shared VHD on non-GPU node"
+
+    # Deregister the nvidia DKMS module so future kernel upgrades stop rebuilding it, WITHOUT the
+    # slow `dkms remove --all` (it dominated CSE duration on the non-GPU provisioning path, ~35s).
+    # Removing the DKMS source tree deregisters it (dkms autoinstall iterates /var/lib/dkms/*), and
+    # removing the built module reclaims disk. The module is never loaded on a non-GPU node, so no
+    # depmod/initramfs refresh is required.
+    rm -rf /var/lib/dkms/nvidia || true
+    rm -f /lib/modules/*/updates/dkms/nvidia*.ko* 2>/dev/null || true
+    # aks-gpu stages the driver userspace libs under its container's GPU_DEST/lib64. NOTE: that is
+    # the aks-gpu *container's* GPU_DEST=/usr/bin (aks-gpu config.sh), NOT this CSE script's
+    # GPU_DEST=/usr/local/nvidia -- the prebake writes to /usr/bin, so the teardown clears /usr/bin.
+    rm -rf /usr/bin/lib64 || true
+    # nvidia-installer likewise drops the driver userspace BINARIES under that same /usr/bin.
+    # Remove them too so a non-GPU node looks genuinely driver-free: otherwise e.g. `nvidia-smi`
+    # remains on PATH and, with its libs (lib64) gone, errors instead of being "command not found".
+    for nvidiaBin in nvidia-smi nvidia-debugdump nvidia-persistenced nvidia-cuda-mps-control \
+                     nvidia-cuda-mps-server nvidia-modprobe nvidia-bug-report.sh nvidia-powerd \
+                     nvidia-ngx-updater nvidia-sleep.sh; do
+        rm -f "/usr/bin/${nvidiaBin}" || true
+    done
+    rm -f /etc/ld.so.conf.d/nvidia.conf || true
+    ldconfig || true
+    rm -f "${marker}" || true
+}
+
 cleanUpGPUDrivers() {
     rm -Rf $GPU_DEST /opt/gpu
 
     for packageName in $(managedGPUPackageList); do
         rm -rf "/opt/${packageName}"
     done
+
+    # A CUDA driver pre-baked into a shared Ubuntu VHD is dead weight on a non-GPU node, and while
+    # DKMS-registered it forces an nvidia.ko rebuild on every kernel patch. Tear it down here.
+    # Returns immediately when the prebake marker (GPU_DKMS_MARKER_FILE, else /opt/azure/aks-gpu/dkms-marker) is absent.
+    cleanUpPrebakedGPUDriver
 }
 
 installCriCtlPackage() {

@@ -1888,6 +1888,57 @@ SETUP_EOF
         End
     End
 
+    Describe 'configGPUDrivers'
+        # Mock everything the Ubuntu path touches so the test exercises only the
+        # marker -> aks-gpu action selection (install vs install-skip-build), including the
+        # driver_kind guard (a CUDA-baked marker on a GRID node must NOT skip the build).
+        # logs_to_events is mocked to faithfully dispatch the wrapped command (dropping the
+        # event-name arg, keeping argument boundaries intact) so the real installGPUDriverImage runs.
+        logs_to_events() { shift; "$@"; }
+        waitForContainerdReady() { return 0; }
+        mkdir() { :; }
+        ctr() { echo "ctr $*"; }
+        nvidia-modprobe() { return 0; }
+        nvidia-smi() { return 0; }
+        ldconfig() { return 0; }
+        isMarinerOrAzureLinux() { return 1; }
+        isAzureLinuxOSGuard() { return 1; }
+        isACL() { return 1; }
+        systemctlEnableAndStart() { return 0; }
+        systemctl() { return 0; }
+        # Capture the action passed to the install container.
+        retrycmd_if_failure() { shift 3; echo "INSTALL_CMD: $*"; return 0; }
+
+        BeforeEach 'OS="$UBUNTU_OS_NAME"; NVIDIA_GPU_DRIVER_TYPE="cuda"; NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-cuda"; NVIDIA_DRIVER_IMAGE_TAG="580.0.0"; CTR_GPU_INSTALL_CMD="ctr-run"; GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "$GPU_DKMS_MARKER_FILE"'
+
+        It 'uses the full install action when no prebake marker is present'
+            When call configGPUDrivers
+            The output should include "/entrypoint.sh install"
+            The output should not include "install-skip-build"
+        End
+
+        It 'uses install-skip-build when the prebake marker matches the node driver kind'
+            marker="$(mktemp)"
+            printf 'driver_kind=cuda\n' > "$marker"
+            GPU_DKMS_MARKER_FILE="$marker"
+            When call configGPUDrivers
+            The output should include "/entrypoint.sh install-skip-build"
+            rm -f "$marker"
+        End
+
+        It 'falls back to full install when the marker driver_kind does not match the node (CUDA marker on GRID node)'
+            marker="$(mktemp)"
+            printf 'driver_kind=cuda\n' > "$marker"
+            GPU_DKMS_MARKER_FILE="$marker"
+            NVIDIA_GPU_DRIVER_TYPE="grid"
+            NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-grid"
+            When call configGPUDrivers
+            The output should include "/entrypoint.sh install"
+            The output should not include "install-skip-build"
+            rm -f "$marker"
+        End
+    End
+
     Describe 'configureManagedGPUExperience'
         # Mock the helper functions
         logs_to_events() {

@@ -0,0 +1,38 @@
+#!/bin/bash
+
+Describe 'cse_install_ubuntu.sh'
+    Include "./parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh"
+
+    Describe 'cleanUpPrebakedGPUDriver'
+        It 'is a no-op when the prebake marker is absent'
+            GPU_DKMS_MARKER_FILE="/tmp/aks-gpu-marker-absent-$$"
+            When call cleanUpPrebakedGPUDriver
+            The status should be success
+            The output should equal ""
+        End
+
+        It 'deregisters the nvidia DKMS module and removes baked artifacts (libs, binaries, marker) when present'
+            marker="$(mktemp)"
+            GPU_DKMS_MARKER_FILE="${marker}"
+            rm() { echo "mock rm $*"; }
+            ldconfig() { echo "mock ldconfig"; }
+            When call cleanUpPrebakedGPUDriver
+            The status should be success
+            The output should include "Removing pre-baked NVIDIA driver"
+            # deregisters via the DKMS source tree + built module removal (no slow dkms remove)
+            The output should include "mock rm -rf /var/lib/dkms/nvidia"
+            The output should include "mock rm -f /lib/modules"
+            # relocated userspace libs
+            The output should include "mock rm -rf /usr/bin/lib64"
+            # driver userspace binaries so nvidia-smi becomes "command not found" on non-GPU nodes
+            The output should include "mock rm -f /usr/bin/nvidia-smi"
+            The output should include "mock ldconfig"
+            # the marker itself must be removed so a re-run takes the no-op path
+            The output should include "mock rm -f ${marker}"
+            # the slow per-version dkms remove --all must NOT be on the critical path anymore
+            The output should not include "dkms remove"
+            # rm is mocked above, so bypass the mock to delete the temp marker for real
+            command rm -f "${marker}"
+        End
+    End
+End
@@ -737,6 +737,33 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then  # No ARM64 SKU wit
     cat << EOF >> ${VHD_LOGS_FILEPATH}
   - nvidia-cuda-driver=${NVIDIA_DRIVER_IMAGE_TAG}
 EOF
+
+  # Opt-in: pre-build the NVIDIA kernel module into the VHD so node provisioning skips the
+  # ~100s in-CSE DKMS compile. The aks-gpu container is run in "build-only" mode: it compiles
+  # and DKMS-registers the kernel module + stages userspace libs against THIS VHD's kernel,
+  # performs NO device access (safe on the GPU-less Packer builder), and writes the marker
+  # /opt/azure/aks-gpu/dkms-marker. At node boot, configGPUDrivers passes "install-skip-build"
+  # when that marker matches, running only the device-dependent steps.
+  # The driver image is intentionally LEFT in the VHD: boot-time device init still sources the
+  # container toolkit debs, fabric manager, containerd runtime config and udev rules from it.
+  # Dropping the image is a separate, deferred size optimization.
+  if grep -q "NVIDIA_CUDA_PREBAKE" <<< "$FEATURE_FLAGS"; then
+    echo "Pre-building NVIDIA CUDA kernel module into the VHD (build-only) for kernel $(uname -r)"
+    # nvidia-installer compiles the kernel module and needs the libc development headers (libc6-dev),
+    # which the standard (non-GPU) VHD builder image does not ship by default (gcc/make are present).
+    # Install them before the bake; the boot-time fallback already gets these via installDeps.
+    apt_get_install 10 2 300 gcc make libc6-dev || exit 1
+    # NOTE(review): assumes the bind-mount sources /opt/gpu and /opt/actions already exist -- confirm.
+    CTR_GPU_PREBUILD_CMD="ctr -n k8s.io run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind"
+    retrycmd_if_failure 3 10 600 bash -c "$CTR_GPU_PREBUILD_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuprebuild /entrypoint.sh build-only" || exit 1
+    if [ ! -f /opt/azure/aks-gpu/dkms-marker ]; then  # same default path configGPUDrivers checks at node boot
+      echo "Error: NVIDIA CUDA prebake did not produce /opt/azure/aks-gpu/dkms-marker"
+      exit 1
+    fi
+    cat << EOF >> ${VHD_LOGS_FILEPATH}
+  - nvidia-cuda-driver-prebaked=${NVIDIA_DRIVER_IMAGE_TAG} (kernel $(uname -r))
+EOF
+  fi
 fi
 
 if grep -q "NVIDIA_GB" <<< "$FEATURE_FLAGS"; then