Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ struct cudecompHandle {
cudecomp::ncclComm nccl_comm; // NCCL communicator (global)
cudecomp::ncclComm nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems)
bool nccl_enable_ubr = false; // Flag to control NCCL user buffer registration usage
std::unordered_map<void*, std::vector<std::pair<ncclComm_t, void*>>>
std::unordered_map<void*, std::vector<std::pair<cudecomp::ncclComm, void*>>>
nccl_ubr_handles; // map of allocated buffer address to NCCL registration handle(s)

std::vector<cudaStream_t> streams; // internal streams for concurrent scheduling
Expand Down
45 changes: 29 additions & 16 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,15 @@ cudecompResult_t cudecompFinalize(cudecompHandle_t handle) {
try {
checkHandle(handle);

#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0)
for (auto& entry : handle->nccl_ubr_handles) {
for (const auto& ubr_handle : entry.second) {
CHECK_NCCL(ncclCommDeregister(*ubr_handle.first, ubr_handle.second));
}
}
handle->nccl_ubr_handles.clear();
#endif

handle->nccl_comm.reset();
handle->nccl_local_comm.reset();

Expand Down Expand Up @@ -1195,6 +1204,8 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
try {
checkHandle(handle);
checkGridDesc(grid_desc);
if (!buffer) { THROW_INVALID_USAGE("buffer argument cannot be null"); }
if (buffer_size_bytes == 0) { THROW_INVALID_USAGE("buffer size cannot be zero"); }

if (transposeBackendRequiresNvshmem(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
Expand Down Expand Up @@ -1266,14 +1277,20 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_enable_ubr) {
void* nccl_ubr_handle;
if (grid_desc->nccl_comm) {
CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_comm, nccl_ubr_handle));
}
if (grid_desc->nccl_local_comm) {
CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(*grid_desc->nccl_local_comm, nccl_ubr_handle));
try {
void* nccl_ubr_handle;
if (grid_desc->nccl_comm) {
CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(grid_desc->nccl_comm, nccl_ubr_handle));
}
if (grid_desc->nccl_local_comm) {
CHECK_NCCL(ncclCommRegister(*grid_desc->nccl_local_comm, *buffer, buffer_size_bytes, &nccl_ubr_handle));
handle->nccl_ubr_handles[*buffer].push_back(std::make_pair(grid_desc->nccl_local_comm, nccl_ubr_handle));
}
} catch (...) {
cudecompFree(handle, grid_desc, *buffer);
*buffer = nullptr;
throw;
}
}
}
Expand All @@ -1294,15 +1311,11 @@ cudecompResult_t cudecompFree(cudecompHandle_t handle, cudecompGridDesc_t grid_d
checkGridDesc(grid_desc);

#if NCCL_VERSION_CODE >= NCCL_VERSION(2, 19, 0)
if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

if (handle->nccl_ubr_handles.count(buffer) != 0) {
for (const auto& entry : handle->nccl_ubr_handles[buffer]) {
CHECK_NCCL(ncclCommDeregister(entry.first, entry.second));
}
handle->nccl_ubr_handles.erase(buffer);
if (handle->nccl_ubr_handles.count(buffer) != 0) {
for (const auto& entry : handle->nccl_ubr_handles[buffer]) {
CHECK_NCCL(ncclCommDeregister(*entry.first, entry.second));
}
handle->nccl_ubr_handles.erase(buffer);
}
#endif

Expand Down
Loading