From d88499bd7edf07d676936249a35f0c634a1e55cf Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 19 May 2026 11:47:15 -0400 Subject: [PATCH] Migrate pynvml to cuda.core.system --- CLAUDE.md | 2 +- dependencies.yaml | 3 +- docs/source/api/debug.rst | 2 +- docs/source/plugin_development.rst | 7 +- docs/source/troubleshooting.rst | 2 +- pyproject.toml | 3 +- rapids_cli/hardware.py | 32 ++++----- rapids_cli/tests/test_hardware.py | 111 +++++++++++++++-------------- 8 files changed, 83 insertions(+), 79 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d73a716..fec93ed 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -111,7 +111,7 @@ The doctor command discovers and runs checks via Python entry points defined in ### Key Dependencies - `rich` and `rich-click` for terminal output and CLI interface -- `pynvml` (nvidia-ml-py) for GPU information +- `cuda.core` (`cuda.core.system`) for GPU information - `cuda-pathfinder` for locating CUDA installations - `psutil` for system memory checks diff --git a/dependencies.yaml b/dependencies.yaml index 3afec7e..41640ed 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -61,12 +61,11 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cuda-core >=0.6.0 + - cuda-core>=1.0.0 # NVML APIs we use via cuda.core.system landed in cuda-bindings # 12.9.6 (CUDA 12) and 13.2.0 (CUDA 13). The 13.0/13.1 # wheels pre-date the 13.x landing and are excluded. - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* - - nvidia-ml-py>=12.0 - cuda-pathfinder >=1.2.3 - packaging - psutil diff --git a/docs/source/api/debug.rst b/docs/source/api/debug.rst index aa4e84b..b54b27b 100644 --- a/docs/source/api/debug.rst +++ b/docs/source/api/debug.rst @@ -10,7 +10,7 @@ for troubleshooting RAPIDS installations. :func:`~rapids_cli.debug.debug.run_debug` is the main entry point. 
It collects: - Platform and OS details (from ``platform`` and ``/etc/os-release``) -- NVIDIA driver and CUDA versions (via ``pynvml``) +- NVIDIA driver and CUDA versions (via ``cuda.core``) - CUDA runtime path (via ``cuda-pathfinder``) - System CUDA toolkit locations (globbing ``/usr/local/cuda*``) - Python version and hash info diff --git a/docs/source/plugin_development.rst b/docs/source/plugin_development.rst index d5b5e45..c59d1af 100644 --- a/docs/source/plugin_development.rst +++ b/docs/source/plugin_development.rst @@ -95,14 +95,13 @@ GPU memory requirement check: .. code-block:: python - import pynvml + from cuda.core import system def gpu_memory_check(verbose=False, **kwargs): """Check that GPU has at least 8GB memory.""" - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(0) - mem = pynvml.nvmlDeviceGetMemoryInfo(handle) + device = system.Device(0) + mem = device.memory_info available_gb = mem.total / (1024**3) if available_gb < 8: diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 5da7f2c..54243c1 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -19,7 +19,7 @@ No GPUs Detected .. code-block:: bash - python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())" + python -c "from cuda.core import system; print(system.get_num_devices())" 3. 
If running in a container, ensure GPU passthrough is enabled: diff --git a/pyproject.toml b/pyproject.toml index b2c0d4b..1d076a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,10 +8,9 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*", - "cuda-core >=0.6.0", + "cuda-core>=1.0.0", "cuda-pathfinder >=1.2.3", "importlib-metadata >= 4.13.0; python_version < '3.12'", - "nvidia-ml-py>=12.0", "packaging", "psutil", "pyyaml", diff --git a/rapids_cli/hardware.py b/rapids_cli/hardware.py index 947b986..1e40405 100644 --- a/rapids_cli/hardware.py +++ b/rapids_cli/hardware.py @@ -63,7 +63,7 @@ def cuda_runtime_path(self) -> str | None: class NvmlGpuInfo: - """Real GPU info provider backed by pynvml. + """Real GPU info provider backed by cuda.core.system. Lazily loads all device information on first property access and caches results. """ @@ -80,37 +80,37 @@ def _ensure_loaded(self) -> None: if self._loaded: return - import pynvml + from cuda.core import system try: - pynvml.nvmlInit() - except pynvml.NVMLError as e: + self._device_count = system.get_num_devices() + except system.NvmlError as e: raise HardwareInfoError("Unable to initialize GPU driver (NVML)") from e - self._device_count = pynvml.nvmlDeviceGetCount() - self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion() - self._driver_version = pynvml.nvmlSystemGetDriverVersion() + cuda_driver_version = system.get_user_mode_driver_version() + self._cuda_driver_version = cuda_driver_version[0] * 1000 + cuda_driver_version[1] * 10 + driver_version = system.get_kernel_mode_driver_version() + self._driver_version = ".".join(str(x) for x in driver_version[:2]) self._devices = [] - for i in range(self._device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) - memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + for device in system.Device.get_all_devices(): + major, minor = 
device.cuda_compute_capability + memory_info = device.memory_info nvlink_states: list[bool] = [] - for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS): + for link_id in range(system.NvlinkInfo.max_links): try: - state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id) + state = device.get_nvlink(link_id).state nvlink_states.append(bool(state)) except ( - pynvml.NVMLError_InvalidArgument, - pynvml.NVMLError_NotSupported, + system.InvalidArgumentError, + system.NotSupportedError, ): break self._devices.append( DeviceInfo( - index=i, + index=device.index, compute_capability=(major, minor), memory_total_bytes=memory_info.total, nvlink_states=nvlink_states, diff --git a/rapids_cli/tests/test_hardware.py b/rapids_cli/tests/test_hardware.py index 215d68c..12bdfb9 100644 --- a/rapids_cli/tests/test_hardware.py +++ b/rapids_cli/tests/test_hardware.py @@ -2,7 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import MagicMock, patch -import pynvml +from cuda.bindings import nvml +from cuda.core import system import pytest from rapids_cli.hardware import ( @@ -25,8 +26,8 @@ def test_nvml_gpu_info_init_failure(): with patch( - "pynvml.nvmlInit", - side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED), + "cuda.bindings.nvml.init_v2", + side_effect=nvml.NvmlError(nvml.Return.ERROR_DRIVER_NOT_LOADED), ): gpu_info = NvmlGpuInfo() with pytest.raises(HardwareInfoError, match="Unable to initialize GPU driver"): @@ -34,21 +35,20 @@ def test_nvml_gpu_info_init_failure(): def test_nvml_gpu_info_loads_once(): - mock_handle = MagicMock() + mock_device = MagicMock() + mock_device.cuda_compute_capability = (7, 5) + mock_device.get_all_devices.return_value = [mock_device] mock_memory = MagicMock() + mock_device.memory_info = mock_memory mock_memory.total = 16 * 1024**3 + mock_nvlink_info = MagicMock(max_links=3) with ( - patch("pynvml.nvmlInit") as mock_init, - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - 
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), - patch( - "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported - ), + patch("cuda.core.system.NvlinkInfo", mock_nvlink_info), + patch("cuda.core.system.get_num_devices", return_value=1) as mock_get_num_devices, + patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 5)), + patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(550, 54, 0)), + patch("cuda.core.system.Device", mock_device), ): gpu_info = NvmlGpuInfo() # Access multiple properties to verify caching @@ -56,24 +56,24 @@ def test_nvml_gpu_info_loads_once(): _ = gpu_info.devices _ = gpu_info.cuda_driver_version _ = gpu_info.driver_version - # nvmlInit should be called exactly once - mock_init.assert_called_once() + # get_num_devices should be called exactly once + mock_get_num_devices.assert_called_once() def test_nvml_gpu_info_device_data(): - mock_handle = MagicMock() + mock_device = MagicMock() + mock_device.cuda_compute_capability = (9, 0) + mock_device.get_all_devices.return_value = [mock_device, mock_device] mock_memory = MagicMock() + mock_device.memory_info = mock_memory mock_memory.total = 24 * 1024**3 + mock_device.get_nvlink.side_effect = lambda link_id: MagicMock(state=True) with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=2), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), - patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1), + 
patch("cuda.core.system.get_num_devices", return_value=2), + patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)), + patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)), + patch("cuda.core.system.Device", mock_device), ): gpu_info = NvmlGpuInfo() assert gpu_info.device_count == 2 @@ -85,45 +85,52 @@ def test_nvml_gpu_info_device_data(): def test_nvml_gpu_info_nvlink_states(): - mock_handle = MagicMock() - mock_memory = MagicMock() - mock_memory.total = 16 * 1024**3 - - def nvlink_side_effect(handle, link_id): + def nvlink_side_effect(link_id): if link_id < 2: - return 1 - raise pynvml.NVMLError_NotSupported() + nvlink_info = MagicMock() + nvlink_info.state = True + return nvlink_info + raise system.NotSupportedError(nvml.Return.ERROR_NOT_SUPPORTED) + + mock_device = MagicMock() + mock_device.cuda_compute_capability = (9, 0) + mock_device.get_all_devices.return_value = [mock_device, mock_device] + mock_memory = MagicMock() + mock_device.memory_info = mock_memory + mock_memory.total = 24 * 1024**3 + mock_nvlink_info = MagicMock(max_links=3) + mock_device.get_nvlink.side_effect = nvlink_side_effect with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), - patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect), + patch("cuda.core.system.NvlinkInfo", mock_nvlink_info), + patch("cuda.core.system.get_num_devices", return_value=2), + patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)), + patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)), + patch("cuda.core.system.Device", mock_device), ): gpu_info
= NvmlGpuInfo() assert gpu_info.devices[0].nvlink_states == [True, True] def test_nvml_gpu_info_no_nvlink(): - mock_handle = MagicMock() + def nvlink_side_effect(link_id): + raise system.NotSupportedError(nvml.Return.ERROR_NOT_SUPPORTED) + + mock_device = MagicMock() + mock_device.cuda_compute_capability = (9, 0) + mock_device.get_all_devices.return_value = [mock_device, mock_device] mock_memory = MagicMock() - mock_memory.total = 16 * 1024**3 + mock_device.memory_info = mock_memory + mock_memory.total = 24 * 1024**3 + mock_nvlink_info = MagicMock(max_links=3) + mock_device.get_nvlink.side_effect = nvlink_side_effect with ( - patch("pynvml.nvmlInit"), - patch("pynvml.nvmlDeviceGetCount", return_value=1), - patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050), - patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"), - patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle), - patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)), - patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory), - patch( - "pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported - ), + patch("cuda.core.system.NvlinkInfo", mock_nvlink_info), + patch("cuda.core.system.get_num_devices", return_value=2), + patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)), + patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)), + patch("cuda.core.system.Device", mock_device), ): gpu_info = NvmlGpuInfo() assert gpu_info.devices[0].nvlink_states == []