Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ The doctor command discovers and runs checks via Python entry points defined in
### Key Dependencies

- `rich` and `rich-click` for terminal output and CLI interface
- `pynvml` (nvidia-ml-py) for GPU information
- `cuda.core` (`cuda.core.system`) for GPU information
- `cuda-pathfinder` for locating CUDA installations
- `psutil` for system memory checks

Expand Down
3 changes: 1 addition & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,11 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- cuda-core >=0.6.0
- cuda-core>=1.0.0
# NVML APIs we use via cuda.core.system landed in cuda-bindings
# 12.9.6 (CUDA 12) and 13.2.0 (CUDA 13). The 13.0/13.1
# wheels pre-date the 13.x landing and are excluded.
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
- nvidia-ml-py>=12.0
- cuda-pathfinder >=1.2.3
- packaging
- psutil
Expand Down
2 changes: 1 addition & 1 deletion docs/source/api/debug.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ for troubleshooting RAPIDS installations.
:func:`~rapids_cli.debug.debug.run_debug` is the main entry point. It collects:

- Platform and OS details (from ``platform`` and ``/etc/os-release``)
- NVIDIA driver and CUDA versions (via ``pynvml``)
- NVIDIA driver and CUDA versions (via ``cuda.core``)
- CUDA runtime path (via ``cuda-pathfinder``)
- System CUDA toolkit locations (globbing ``/usr/local/cuda*``)
- Python version and hash info
Expand Down
7 changes: 3 additions & 4 deletions docs/source/plugin_development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,14 +95,13 @@ GPU memory requirement check:

.. code-block:: python
import pynvml
from cuda.core import system
def gpu_memory_check(verbose=False, **kwargs):
"""Check that GPU has at least 8GB memory."""
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
device = system.Device(0)
mem = device.memory_info
available_gb = mem.total / (1024**3)
if available_gb < 8:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/troubleshooting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ No GPUs Detected

.. code-block:: bash

python -c "import pynvml; pynvml.nvmlInit(); print(pynvml.nvmlDeviceGetCount())"
python -c "from cuda.core import system; print(system.get_num_devices())"

3. If running in a container, ensure GPU passthrough is enabled:

Expand Down
3 changes: 1 addition & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*",
"cuda-core >=0.6.0",
"cuda-core>=1.0.0",
"cuda-pathfinder >=1.2.3",
"importlib-metadata >= 4.13.0; python_version < '3.12'",
"nvidia-ml-py>=12.0",
"packaging",
"psutil",
"pyyaml",
Expand Down
32 changes: 16 additions & 16 deletions rapids_cli/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def cuda_runtime_path(self) -> str | None:


class NvmlGpuInfo:
"""Real GPU info provider backed by pynvml.
"""Real GPU info provider backed by cuda.core.system.

Lazily loads all device information on first property access and caches results.
"""
Expand All @@ -80,37 +80,37 @@ def _ensure_loaded(self) -> None:
if self._loaded:
return

import pynvml
from cuda.core import system

try:
pynvml.nvmlInit()
except pynvml.NVMLError as e:
self._device_count = system.get_num_devices()
except system.NvmlError as e:
raise HardwareInfoError("Unable to initialize GPU driver (NVML)") from e

self._device_count = pynvml.nvmlDeviceGetCount()
self._cuda_driver_version = pynvml.nvmlSystemGetCudaDriverVersion()
self._driver_version = pynvml.nvmlSystemGetDriverVersion()
cuda_driver_version = system.get_user_mode_driver_version()
self._cuda_driver_version = cuda_driver_version[0] * 1000 + cuda_driver_version[1] * 10
driver_version = system.get_kernel_mode_driver_version()
self._driver_version = ".".join(str(x) for x in driver_version[:2])

self._devices = []
for i in range(self._device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
for device in system.Device.get_all_devices():
major, minor = device.cuda_compute_capability
memory_info = device.memory_info

nvlink_states: list[bool] = []
for link_id in range(pynvml.NVML_NVLINK_MAX_LINKS):
for link_id in range(system.NvlinkInfo.max_links):
try:
state = pynvml.nvmlDeviceGetNvLinkState(handle, link_id)
state = device.get_nvlink(link_id).state
nvlink_states.append(bool(state))
except (
pynvml.NVMLError_InvalidArgument,
pynvml.NVMLError_NotSupported,
system.InvalidArgumentError,
system.NotSupportedError,
):
break

self._devices.append(
DeviceInfo(
index=i,
index=device.index,
compute_capability=(major, minor),
memory_total_bytes=memory_info.total,
nvlink_states=nvlink_states,
Expand Down
111 changes: 59 additions & 52 deletions rapids_cli/tests/test_hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import MagicMock, patch

import pynvml
from cuda.bindings import nvml
from cuda.core import system
import pytest

from rapids_cli.hardware import (
Expand All @@ -25,55 +26,54 @@

def test_nvml_gpu_info_init_failure():
with patch(
"pynvml.nvmlInit",
side_effect=pynvml.NVMLError(pynvml.NVML_ERROR_DRIVER_NOT_LOADED),
"cuda.bindings.nvml.init_v2",
side_effect=nvml.NvmlError(nvml.Return.ERROR_DRIVER_NOT_LOADED),
):
gpu_info = NvmlGpuInfo()
with pytest.raises(HardwareInfoError, match="Unable to initialize GPU driver"):
_ = gpu_info.device_count


def test_nvml_gpu_info_loads_once():
mock_handle = MagicMock()
mock_device = MagicMock()
mock_device.cuda_compute_capability = (7, 5)
mock_memory = MagicMock()
mock_device.memory_info = mock_memory
mock_memory.total = 16 * 1024**3
mock_nvlink_info = MagicMock()
mock_nvlink_info.max_links = 3

with (
patch("pynvml.nvmlInit") as mock_init,
patch("pynvml.nvmlDeviceGetCount", return_value=1),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
patch(
"pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
),
patch("cuda.core.system.NvlinkInfo", mock_nvlink_info),
patch("cuda.core.system.get_num_devices", return_value=1),
patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 5)),
patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(550, 54, 0)),
patch("cuda.core.system.Device", return_value=mock_device),
):
gpu_info = NvmlGpuInfo()
# Access multiple properties to verify caching
_ = gpu_info.device_count
_ = gpu_info.devices
_ = gpu_info.cuda_driver_version
_ = gpu_info.driver_version
# nvmlInit should be called exactly once
mock_init.assert_called_once()


def test_nvml_gpu_info_device_data():
mock_handle = MagicMock()
mock_device = MagicMock()
mock_device.cuda_compute_capability = (9, 0)
mock_device.get_all_devices.return_value = [mock_device, mock_device]
mock_memory = MagicMock()
mock_device.memory_info = mock_memory
mock_memory.total = 24 * 1024**3
mock_nvlink_info = MagicMock()
mock_device.get_nvlink.side_effect = lambda link_id: MagicMock(state=True)
mock_nvlink_info.max_links = 3

with (
patch("pynvml.nvmlInit"),
patch("pynvml.nvmlDeviceGetCount", return_value=2),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12060),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="560.10"),
patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(9, 0)),
patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
patch("pynvml.nvmlDeviceGetNvLinkState", return_value=1),
patch("cuda.core.system.get_num_devices", return_value=2),
patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)),
patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)),
patch("cuda.core.system.Device", mock_device),
):
gpu_info = NvmlGpuInfo()
assert gpu_info.device_count == 2
Expand All @@ -85,45 +85,52 @@ def test_nvml_gpu_info_device_data():


def test_nvml_gpu_info_nvlink_states():
mock_handle = MagicMock()
mock_memory = MagicMock()
mock_memory.total = 16 * 1024**3

def nvlink_side_effect(handle, link_id):
def nvlink_side_effect(link_id):
if link_id < 2:
return 1
raise pynvml.NVMLError_NotSupported()
nvlink_info = MagicMock()
nvlink_info.state = True
return nvlink_info
raise system.NotSupportedError(nvml.Return.ERROR_NOT_SUPPORTED)

mock_device = MagicMock()
mock_device.cuda_compute_capability = (9, 0)
mock_device.get_all_devices.return_value = [mock_device, mock_device]
mock_memory = MagicMock()
mock_device.memory_info = mock_memory
mock_memory.total = 24 * 1024**3
mock_nvlink_info = MagicMock()
mock_device.get_nvlink.side_effect = nvlink_side_effect
mock_nvlink_info.max_links = 3

with (
patch("pynvml.nvmlInit"),
patch("pynvml.nvmlDeviceGetCount", return_value=1),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
patch("pynvml.nvmlDeviceGetNvLinkState", side_effect=nvlink_side_effect),
patch("cuda.core.system.get_num_devices", return_value=2),
patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)),
patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)),
patch("cuda.core.system.Device", mock_device),
):
gpu_info = NvmlGpuInfo()
assert gpu_info.devices[0].nvlink_states == [True, True]


def test_nvml_gpu_info_no_nvlink():
mock_handle = MagicMock()
def nvlink_side_effect(link_id):
raise system.NotSupportedError(nvml.Return.ERROR_NOT_SUPPORTED)

mock_device = MagicMock()
mock_device.cuda_compute_capability = (9, 0)
mock_device.get_all_devices.return_value = [mock_device, mock_device]
mock_memory = MagicMock()
mock_memory.total = 16 * 1024**3
mock_device.memory_info = mock_memory
mock_memory.total = 24 * 1024**3
mock_nvlink_info = MagicMock()
mock_device.get_nvlink.side_effect = nvlink_side_effect
mock_nvlink_info.max_links = 3

with (
patch("pynvml.nvmlInit"),
patch("pynvml.nvmlDeviceGetCount", return_value=1),
patch("pynvml.nvmlSystemGetCudaDriverVersion", return_value=12050),
patch("pynvml.nvmlSystemGetDriverVersion", return_value="550.54"),
patch("pynvml.nvmlDeviceGetHandleByIndex", return_value=mock_handle),
patch("pynvml.nvmlDeviceGetCudaComputeCapability", return_value=(7, 5)),
patch("pynvml.nvmlDeviceGetMemoryInfo", return_value=mock_memory),
patch(
"pynvml.nvmlDeviceGetNvLinkState", side_effect=pynvml.NVMLError_NotSupported
),
patch("cuda.core.system.get_num_devices", return_value=2),
patch("cuda.core.system.get_user_mode_driver_version", return_value=(12, 6)),
patch("cuda.core.system.get_kernel_mode_driver_version", return_value=(560, 10, 0)),
patch("cuda.core.system.Device", mock_device),
):
gpu_info = NvmlGpuInfo()
assert gpu_info.devices[0].nvlink_states == []
Expand Down