From fe43444a012e54ae870579c48aade9d8ffbd09f0 Mon Sep 17 00:00:00 2001 From: romerojosh Date: Wed, 6 May 2026 06:56:30 +0000 Subject: [PATCH] Allow test ranks to share GPUs under MPS Signed-off-by: romerojosh --- tests/cc/halo_test.cc | 8 +++++++- tests/cc/transpose_test.cc | 8 +++++++- tests/fortran/halo_test.f90 | 9 +++++++-- tests/fortran/transpose_test.f90 | 9 +++++++-- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/tests/cc/halo_test.cc b/tests/cc/halo_test.cc index 64a8f8d..79ab320 100644 --- a/tests/cc/halo_test.cc +++ b/tests/cc/halo_test.cc @@ -510,7 +510,13 @@ int main(int argc, char** argv) { MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm); int local_rank; MPI_Comm_rank(local_comm, &local_rank); - CHECK_CUDA_EXIT(cudaSetDevice(local_rank)); + int num_devices = 0; + CHECK_CUDA_EXIT(cudaGetDeviceCount(&num_devices)); + if (num_devices <= 0) { + fprintf(stderr, "No CUDA devices available.\n"); + exit(EXIT_FAILURE); + } + CHECK_CUDA_EXIT(cudaSetDevice(local_rank % num_devices)); // Check if test file was provided std::string testfile; diff --git a/tests/cc/transpose_test.cc b/tests/cc/transpose_test.cc index 5b0a414..0c6f585 100644 --- a/tests/cc/transpose_test.cc +++ b/tests/cc/transpose_test.cc @@ -574,7 +574,13 @@ int main(int argc, char** argv) { MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm); int local_rank; MPI_Comm_rank(local_comm, &local_rank); - CHECK_CUDA_EXIT(cudaSetDevice(local_rank)); + int num_devices = 0; + CHECK_CUDA_EXIT(cudaGetDeviceCount(&num_devices)); + if (num_devices <= 0) { + fprintf(stderr, "No CUDA devices available.\n"); + exit(EXIT_FAILURE); + } + CHECK_CUDA_EXIT(cudaSetDevice(local_rank % num_devices)); // Check if test file was provided std::string testfile; diff --git a/tests/fortran/halo_test.f90 b/tests/fortran/halo_test.f90 index ce29191..23a9f6d 100644 --- a/tests/fortran/halo_test.f90 +++ b/tests/fortran/halo_test.f90 @@ -529,7 +529,7 @@ program main integer :: i, idx real(real64) :: t0 - integer :: local_rank, ierr + integer :: local_rank, ierr, num_devices integer :: local_comm integer :: res, retcode logical :: using_testfile @@ -546,7 +546,12 @@ program main call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr) call MPI_Comm_rank(local_comm, local_rank, ierr) - CHECK_CUDA_EXIT(cudaSetDevice(local_rank)) + CHECK_CUDA_EXIT(cudaGetDeviceCount(num_devices)) + if (num_devices <= 0) then + print*, 'No CUDA devices available.' + call exit(1) + endif + CHECK_CUDA_EXIT(cudaSetDevice(mod(local_rank, num_devices))) using_testfile = .false. do i = 1, command_argument_count() diff --git a/tests/fortran/transpose_test.f90 b/tests/fortran/transpose_test.f90 index ac65d51..c627c51 100644 --- a/tests/fortran/transpose_test.f90 +++ b/tests/fortran/transpose_test.f90 @@ -554,7 +554,7 @@ program main integer :: i, idx real(real64) :: t0 - integer :: local_rank, ierr + integer :: local_rank, ierr, num_devices integer :: local_comm integer :: res, retcode logical :: using_testfile @@ -571,7 +571,12 @@ program main call MPI_Comm_split_Type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, local_comm, ierr) call MPI_Comm_rank(local_comm, local_rank, ierr) - CHECK_CUDA_EXIT(cudaSetDevice(local_rank)) + CHECK_CUDA_EXIT(cudaGetDeviceCount(num_devices)) + if (num_devices <= 0) then + print*, 'No CUDA devices available.' + call exit(1) + endif + CHECK_CUDA_EXIT(cudaSetDevice(mod(local_rank, num_devices))) using_testfile = .false. do i = 1, command_argument_count()