From 0f1f13d1ef590d2c6aa42d29f15bca3940188546 Mon Sep 17 00:00:00 2001 From: romerojosh Date: Wed, 10 Jun 2026 20:25:59 +0000 Subject: [PATCH 1/2] Use native NCCL Alltoall API when able. Signed-off-by: romerojosh --- include/internal/comm_routines.h | 41 ++++++++++++++++++++++++++++++++ tests/ctest/transpose_tests.cc | 8 +++++++ 2 files changed, 49 insertions(+) diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h index b132066..cc372db 100644 --- a/include/internal/comm_routines.h +++ b/include/internal/comm_routines.h @@ -48,6 +48,15 @@ static inline MPI_Datatype getMpiDataType(cuda::std::complex) { return MP static inline MPI_Datatype getMpiDataType(cuda::std::complex) { return MPI_C_DOUBLE_COMPLEX; } template static inline MPI_Datatype getMpiDataType() { return getMpiDataType(T(0)); } +static inline bool ncclAlltoAllRuntimeAvailable() { +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 3) + int version = 0; + return ncclGetVersion(&version) == ncclSuccess && version >= NCCL_VERSION(2, 28, 3); +#else + return false; +#endif +} + static inline bool canUseMpiAlltoall(const std::vector& send_counts, const std::vector& send_offsets, const std::vector& recv_counts, @@ -73,6 +82,30 @@ static inline bool canUseMpiAlltoall(const std::vector& send_count return true; } +static inline bool canUseNcclAlltoAll(const cudecompHandle_t& handle, const cudecompGridDesc_t& grid_desc, + const std::vector& send_counts, + const std::vector& send_offsets, + const std::vector& recv_counts, + const std::vector& recv_offsets, cudecompCommAxis comm_axis) { + if (!canUseMpiAlltoall(send_counts, send_offsets, recv_counts, recv_offsets)) { return false; } + if (send_counts[0] != recv_counts[0]) { return false; } + + const auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info; + int nccl_comm_size = handle->nranks; + if (comm_info.ngroups == 1) { + nccl_comm_size = (handle->mpi_clique_comm != MPI_COMM_NULL) ? handle->clique_nranks : handle->local_nranks; + } + if (nccl_comm_size != static_cast(send_counts.size())) { return false; } + + for (int i = 0; i < static_cast(send_counts.size()); ++i) { + int peer_rank = getGlobalRank(handle, grid_desc, comm_axis, i); + if (comm_info.ngroups == 1) { peer_rank = handle->rank_to_clique_rank[peer_rank]; } + if (peer_rank != i) { return false; } + } + + return true; +} + static inline void checkMpiInt32Limit(int64_t val, cudecompTransposeCommBackend_t backend) { if (val > std::numeric_limits::max()) { std::ostringstream os; @@ -281,6 +314,14 @@ static void cudecompAlltoall(const cudecompHandle_t& handle, const cudecompGridD // For fully intra-group alltoall, use distinct NCCL local comm instead of global comm as it is faster. auto comm = (comm_info.ngroups == 1) ? *grid_desc->nccl_local_comm : *grid_desc->nccl_comm; +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 28, 3) + if (ncclAlltoAllRuntimeAvailable() && + canUseNcclAlltoAll(handle, grid_desc, send_counts, send_offsets, recv_counts, recv_offsets, comm_axis)) { + CHECK_NCCL(ncclAlltoAll(send_buff, recv_buff, send_counts[0] * sizeof(T), ncclChar, comm, stream)); + break; + } +#endif + CHECK_NCCL(ncclGroupStart()); for (int i = 0; i < send_counts.size(); ++i) { int peer_rank_global = getGlobalRank(handle, grid_desc, comm_axis, i); diff --git a/tests/ctest/transpose_tests.cc b/tests/ctest/transpose_tests.cc index e97382e..b234d54 100644 --- a/tests/ctest/transpose_tests.cc +++ b/tests/ctest/transpose_tests.cc @@ -187,6 +187,13 @@ void appendBaselineCases(std::vector& cases, const cudecomp_test: } } +void appendNcclNativeAlltoAllCases(std::vector& cases, const cudecomp_test::TransposeBackend& backend) { + if (backend.backend != CUDECOMP_TRANSPOSE_COMM_NCCL) return; + + cases.push_back( + makeCase(backend, "NativeAlltoAllFastPath", TransposeOperation::YToZ, {8, 8, 8}, {1, 4}, CUDECOMP_FLOAT, true)); +} + void appendCoverageCases(std::vector& cases, const cudecomp_test::TransposeBackend& backend) { // Coverage cases select explicit memory orders, nonzero halo/padding, dtypes, rank order, and rank counts to reach // transpose paths not guaranteed by the baseline sweep. These are not inherently MPI-only, but running them in the @@ -271,6 +278,7 @@ std::vector transposeCasesForLabel(const char* label) { if (std::string(backend.label) != label) continue; appendBaselineCases(cases, backend); + if (std::string(label) == "nccl") { appendNcclNativeAlltoAllCases(cases, backend); } if (std::string(label) == "mpi") { appendCoverageCases(cases, backend); } } return cases; From 51dedcc34c05bd1e68cbeebfce7b0825a1466cf5 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 11 Jun 2026 09:31:29 -0700 Subject: [PATCH 2/2] Rename test. Signed-off-by: Josh Romero --- tests/ctest/transpose_tests.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ctest/transpose_tests.cc b/tests/ctest/transpose_tests.cc index b234d54..4582e2c 100644 --- a/tests/ctest/transpose_tests.cc +++ b/tests/ctest/transpose_tests.cc @@ -191,7 +191,7 @@ void appendNcclNativeAlltoAllCases(std::vector& cases, const cude if (backend.backend != CUDECOMP_TRANSPOSE_COMM_NCCL) return; cases.push_back( - makeCase(backend, "NativeAlltoAllFastPath", TransposeOperation::YToZ, {8, 8, 8}, {1, 4}, CUDECOMP_FLOAT, true)); + makeCase(backend, "NativeAlltoAllPath", TransposeOperation::YToZ, {8, 8, 8}, {1, 4}, CUDECOMP_FLOAT, true)); } void appendCoverageCases(std::vector& cases, const cudecomp_test::TransposeBackend& backend) {