From 81c87e7bb7d39169226817f465cf317e660b1ef1 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Wed, 29 Apr 2026 15:17:54 -0700 Subject: [PATCH 1/4] Fix NCCL user buffer registration implementation. Signed-off-by: Josh Romero --- src/cudecomp.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 61bbf7e..5afb58b 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -1268,11 +1268,11 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid if (handle->nccl_enable_ubr) { void* nccl_ubr_handle; if (grid_desc->nccl_comm) { - CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, buffer, buffer_size_bytes, &nccl_ubr_handle)); + CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_comm, nccl_ubr_handle)); } if (grid_desc->nccl_local_comm) { - CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, buffer, buffer_size_bytes, &nccl_ubr_handle)); + CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_local_comm, nccl_ubr_handle)); } } From 536d4e3da835a2e2183108f950c63722517d8cec Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 30 Apr 2026 08:56:04 -0700 Subject: [PATCH 2/4] Deregister buffers based on presence in ubr_handle map. 
Signed-off-by: Josh Romero --- src/cudecomp.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 5afb58b..ca2ebf0 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -1195,6 +1195,8 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid try { checkHandle(handle); checkGridDesc(grid_desc); + if (!buffer) { THROW_INVALID_USAGE("buffer argument cannot be null"); } + if (buffer_size_bytes == 0) { THROW_INVALID_USAGE("buffer size cannot be zero"); } if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) { @@ -1294,15 +1296,11 @@ cudecompResult_t cudecompFree(cudecompHandle_t handle, cudecompGridDesc_t grid_d checkGridDesc(grid_desc); #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0) - if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || - haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { - - if (handle->nccl_ubr_handles.count(buffer) != 0) { - for (const auto& entry : handle->nccl_ubr_handles[buffer]) { - CHECK_NCCL(ncclCommDeregister(entry.first, entry.second)); - } - handle->nccl_ubr_handles.erase(buffer); + if (handle->nccl_ubr_handles.count(buffer) != 0) { + for (const auto& entry : handle->nccl_ubr_handles[buffer]) { + CHECK_NCCL(ncclCommDeregister(entry.first, entry.second)); } + handle->nccl_ubr_handles.erase(buffer); } #endif From c3c55bb47fe192a1847fea1d6fc2651c1b7de845 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 30 Apr 2026 09:32:12 -0700 Subject: [PATCH 3/4] More robust handling of NCCL registration failures in cudecompMalloc. 
Signed-off-by: Josh Romero --- src/cudecomp.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/cudecomp.cc b/src/cudecomp.cc index ca2ebf0..746c04e 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -1268,14 +1268,20 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { if (handle->nccl_enable_ubr) { - void* nccl_ubr_handle; - if (grid_desc->nccl_comm) { - CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); - handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_comm, nccl_ubr_handle)); - } - if (grid_desc->nccl_local_comm) { - CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); - handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_local_comm, nccl_ubr_handle)); + try { + void* nccl_ubr_handle; + if (grid_desc->nccl_comm) { + CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); + handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_comm, nccl_ubr_handle)); + } + if (grid_desc->nccl_local_comm) { + CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); + handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_local_comm, nccl_ubr_handle)); + } + } catch (...) { + cudecompFree(handle, grid_desc, *buffer); + *buffer = nullptr; + throw; } } } From 0f3ca65541191c69dcb9f6e22d2f0e3a3f6784b5 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Thu, 30 Apr 2026 09:55:21 -0700 Subject: [PATCH 4/4] Improve communicator lifetime handling for NCCL buffer deregistration. 
Signed-off-by: Josh Romero --- include/internal/common.h | 2 +- src/cudecomp.cc | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/internal/common.h b/include/internal/common.h index 694ad01..b4551d5 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -69,7 +69,7 @@ struct cudecompHandle { cudecomp::ncclComm nccl_comm; // NCCL communicator (global) cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems) bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage - std::unordered_map>> + std::unordered_map>> nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s) std::vector streams; // internal streams for concurrent scheduling diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 746c04e..edfdfad 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -579,6 +579,15 @@ cudecompResult_t cudecompFinalize(cudecompHandle_t handle) { try { checkHandle(handle); +#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0) + for (auto& entry : handle->nccl_ubr_handles) { + for (const auto& ubr_handle : entry.second) { + CHECK_NCCL(ncclCommDeregister(*ubr_handle.first, ubr_handle.second)); + } + } + handle->nccl_ubr_handles.clear(); +#endif + handle->nccl_comm.reset(); handle->nccl_local_comm.reset(); @@ -1272,11 +1281,11 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid void* nccl_ubr_handle; if (grid_desc->nccl_comm) { CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); - handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_comm, nccl_ubr_handle)); + handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(grid_desc->nccl_comm, nccl_ubr_handle)); } if (grid_desc->nccl_local_comm) { CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle)); - 
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_local_comm, nccl_ubr_handle)); + handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(grid_desc->nccl_local_comm, nccl_ubr_handle)); } } catch (...) { cudecompFree(handle, grid_desc, *buffer); @@ -1304,7 +1313,7 @@ cudecompResult_t cudecompFree(cudecompHandle_t handle, cudecompGridDesc_t grid_d #if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0) if (handle->nccl_ubr_handles.count(buffer) != 0) { for (const auto& entry : handle->nccl_ubr_handles[buffer]) { - CHECK_NCCL(ncclCommDeregister(entry.first, entry.second)); + CHECK_NCCL(ncclCommDeregister(*entry.first, entry.second)); } handle->nccl_ubr_handles.erase(buffer); }