From f09c75ba1f32b7bdfeffeae55c5ecef7169b0ef2 Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Mon, 4 May 2026 16:05:44 -0700 Subject: [PATCH 1/3] Free temporary resources if failure occurs during autotuning. Signed-off-by: Josh Romero --- src/autotune.cc | 137 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 22 deletions(-) diff --git a/src/autotune.cc b/src/autotune.cc index e2649f3..2f3d7f8 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,42 @@ namespace cudecomp { namespace { +struct cudaBufferGuardDeleter { + void operator()(void* ptr) const noexcept { + if (ptr) { cudaFree(ptr); } + } +}; + +struct transposeWorkspaceGuardDeleter { + cudecompHandle_t handle = nullptr; + cudecompGridDesc_t grid_desc = nullptr; + cudecompTransposeCommBackend_t backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P; + + void operator()(void* ptr) const noexcept { + if (!ptr) return; + + grid_desc->config.transpose_comm_backend = backend; + cudecompFree(handle, grid_desc, ptr); + } +}; + +struct haloWorkspaceGuardDeleter { + cudecompHandle_t handle = nullptr; + cudecompGridDesc_t grid_desc = nullptr; + cudecompHaloCommBackend_t backend = CUDECOMP_HALO_COMM_MPI; + + void operator()(void* ptr) const noexcept { + if (!ptr) return; + + grid_desc->config.halo_comm_backend = backend; + cudecompFree(handle, grid_desc, ptr); + } +}; + +using cudaBufferGuard = std::unique_ptr; +using transposeWorkspaceGuard = std::unique_ptr; +using haloWorkspaceGuard = std::unique_ptr; + static std::vector getFactors(int N) { std::vector factors; for (int i = 1; i <= std::sqrt(N); ++i) { @@ -154,6 +191,11 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d void* work = nullptr; void* work_nvshmem = nullptr; + cudaBufferGuard data_guard; + cudaBufferGuard data2_guard; + transposeWorkspaceGuard work_guard; + transposeWorkspaceGuard work_nvshmem_guard; + int64_t data_sz = 0; int64_t work_sz = 0; @@ -213,11 +255,19 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d int64_t work_sz_new = num_elements_work * dtype_size; if (data_sz_new > data_sz) { data_sz = data_sz_new; - if (data) CHECK_CUDA(cudaFree(data)); + if (data) { + data_guard.release(); + CHECK_CUDA(cudaFree(data)); + } CHECK_CUDA(cudaMalloc(&data, data_sz)); + data_guard.reset(data); if (need_data2) { - if (data2) CHECK_CUDA(cudaFree(data2)); + if (data2) { + data2_guard.release(); + CHECK_CUDA(cudaFree(data2)); + } CHECK_CUDA(cudaMalloc(&data2, data_sz)); + data2_guard.reset(data2); } } @@ -232,15 +282,21 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d auto tmp = grid_desc->config.transpose_comm_backend; grid_desc->config.transpose_comm_backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.transpose_comm_backend = tmp; } // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free auto tmp = grid_desc->config.transpose_comm_backend; grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM; - if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); + if (work_nvshmem) { + work_nvshmem_guard.release(); + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); + } CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_nvshmem), work_sz)); grid_desc->config.transpose_comm_backend = tmp; + work_nvshmem_guard = + transposeWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM}); // Check if there is enough memory for separate non-NVSHMEM allocated work buffer auto ret = cudaMalloc(&work, work_sz); @@ -254,22 +310,25 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d cudaGetLastError(); // Reset CUDA error state } else { CHECK_CUDA(cudaFree(work)); - auto tmp = grid_desc->config.transpose_comm_backend; - grid_desc->config.transpose_comm_backend = - (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; - cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz); + auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; + tmp = grid_desc->config.transpose_comm_backend; + grid_desc->config.transpose_comm_backend = backend; + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz)); grid_desc->config.transpose_comm_backend = tmp; - // Check after restoring the temporary backend used to select the allocation path. - CHECK_CUDECOMP(ret); + work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend}); } #endif } else { + auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; auto tmp = grid_desc->config.transpose_comm_backend; - grid_desc->config.transpose_comm_backend = - (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; - if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); } + grid_desc->config.transpose_comm_backend = backend; + if (work) { + work_guard.release(); + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); + } CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz)); grid_desc->config.transpose_comm_backend = tmp; + work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend}); } } @@ -490,6 +549,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d auto tmp = grid_desc->config.transpose_comm_backend; grid_desc->config.transpose_comm_backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.transpose_comm_backend = tmp; } @@ -497,6 +557,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free auto tmp = grid_desc->config.transpose_comm_backend; grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM; + work_nvshmem_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); grid_desc->config.transpose_comm_backend = tmp; #endif @@ -504,12 +565,17 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d auto tmp = grid_desc->config.transpose_comm_backend; grid_desc->config.transpose_comm_backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.transpose_comm_backend = tmp; } + data_guard.release(); CHECK_CUDA(cudaFree(data)); - if (need_data2) { CHECK_CUDA(cudaFree(data2)); } + if (need_data2) { + data2_guard.release(); + CHECK_CUDA(cudaFree(data2)); + } // Set handle to best option (broadcast from rank 0 for consistency) CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm)); @@ -535,12 +601,17 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d // registration handles on larger test data buffers. These stale registration handles can cause to delays in freeing // the test buffer GPU memory. char *tmp1, *tmp2; + cudaBufferGuard tmp1_guard, tmp2_guard; size_t per_rank_size = (1024 * 1024 + handle->nranks - 1) / handle->nranks; CHECK_CUDA(cudaMalloc(&tmp1, per_rank_size * handle->nranks * sizeof(*tmp1))); + tmp1_guard.reset(tmp1); CHECK_CUDA(cudaMalloc(&tmp2, per_rank_size * handle->nranks * sizeof(*tmp2))); + tmp2_guard.reset(tmp2); CHECK_MPI(MPI_Alltoall(tmp1, per_rank_size, MPI_CHAR, tmp2, per_rank_size, MPI_CHAR, handle->mpi_comm)); CHECK_MPI(MPI_Barrier(handle->mpi_comm)); + tmp1_guard.release(); CHECK_CUDA(cudaFree(tmp1)); + tmp2_guard.release(); CHECK_CUDA(cudaFree(tmp2)); // Reset performance samples after autotuning @@ -601,6 +672,10 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* work = nullptr; void* work_nvshmem = nullptr; + cudaBufferGuard data_guard; + haloWorkspaceGuard work_guard; + haloWorkspaceGuard work_nvshmem_guard; + int64_t data_sz = 0; int64_t work_sz = 0; @@ -644,8 +719,12 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, int64_t work_sz_new = num_elements_work * dtype_size; if (data_sz_new > data_sz) { data_sz = data_sz_new; - if (data) CHECK_CUDA(cudaFree(data)); + if (data) { + data_guard.release(); + CHECK_CUDA(cudaFree(data)); + } CHECK_CUDA(cudaMalloc(&data, data_sz)); + data_guard.reset(data); } // For nvshmem, buffers must be the same size. Find global maximums. @@ -658,15 +737,20 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, if (work && work != work_nvshmem) { auto tmp = grid_desc->config.halo_comm_backend; grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.halo_comm_backend = tmp; } // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free auto tmp = grid_desc->config.halo_comm_backend; grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM; - if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); + if (work_nvshmem) { + work_nvshmem_guard.release(); + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); + } CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work_nvshmem), work_sz)); grid_desc->config.halo_comm_backend = tmp; + work_nvshmem_guard = haloWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM}); // Check if there is enough memory for separate non-NVSHMEM allocated work buffer auto ret = cudaMalloc(&work, work_sz); @@ -680,20 +764,25 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, cudaGetLastError(); // Reset CUDA error state } else { CHECK_CUDA(cudaFree(work)); - auto tmp = grid_desc->config.halo_comm_backend; - grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; - cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz); + auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; + tmp = grid_desc->config.halo_comm_backend; + grid_desc->config.halo_comm_backend = backend; + CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz)); grid_desc->config.halo_comm_backend = tmp; - // Check after restoring the temporary backend used to select the allocation path. - CHECK_CUDECOMP(ret); + work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend}); } #endif } else { + auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; auto tmp = grid_desc->config.halo_comm_backend; - grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; - if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); } + grid_desc->config.halo_comm_backend = backend; + if (work) { + work_guard.release(); + CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); + } CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast(&work), work_sz)); grid_desc->config.halo_comm_backend = tmp; + work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend}); } } @@ -827,6 +916,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, if (work != work_nvshmem) { auto tmp = grid_desc->config.halo_comm_backend; grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.halo_comm_backend = tmp; } @@ -834,16 +924,19 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free auto tmp = grid_desc->config.halo_comm_backend; grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM; + work_nvshmem_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem)); grid_desc->config.halo_comm_backend = tmp; #endif } else { auto tmp = grid_desc->config.halo_comm_backend; grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI; + work_guard.release(); CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); grid_desc->config.halo_comm_backend = tmp; } + data_guard.release(); CHECK_CUDA(cudaFree(data)); // Set handle to best option (broadcast from rank 0 for consistency) From d2b9894a928b4620d70ac08701d0bb93b4610b1a Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 5 May 2026 11:34:33 -0700 Subject: [PATCH 2/3] Avoid exceptions in graph cache destructor. Signed-off-by: Josh Romero --- include/internal/graph.h | 4 +++- src/graph.cc | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/internal/graph.h b/include/internal/graph.h index 578b420..171a7b0 100644 --- a/include/internal/graph.h +++ b/include/internal/graph.h @@ -35,7 +35,7 @@ class graphCache { using key_type = std::tuple; public: - ~graphCache(); + ~graphCache() noexcept; void replay(const key_type& key, cudaStream_t stream) const; cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const; void endCapture(const key_type& key); @@ -43,6 +43,8 @@ class graphCache { void clear(); private: + void clearNoThrow() noexcept; + std::unordered_map graph_cache_; cudaStream graph_stream_; }; diff --git a/src/graph.cc b/src/graph.cc index 2d6dc86..43fcde2 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -27,7 +27,7 @@ namespace cudecomp { -graphCache::~graphCache() { this->clear(); } +graphCache::~graphCache() noexcept { clearNoThrow(); } void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const { CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream)); @@ -58,4 +58,12 @@ void graphCache::clear() { graph_cache_.clear(); } +void graphCache::clearNoThrow() noexcept { + for (auto& entry : graph_cache_) { + cudaGraphExecDestroy(entry.second); + } + + graph_cache_.clear(); +} + } // namespace cudecomp From d57378231f0a8833f5ce3a964ca7565078fbd7ba Mon Sep 17 00:00:00 2001 From: Josh Romero Date: Tue, 5 May 2026 11:52:19 -0700 Subject: [PATCH 3/3] Simplify workspace guard design. Signed-off-by: Josh Romero --- src/autotune.cc | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/src/autotune.cc b/src/autotune.cc index 2f3d7f8..d797bd5 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -50,35 +51,26 @@ struct cudaBufferGuardDeleter { } }; -struct transposeWorkspaceGuardDeleter { - cudecompHandle_t handle = nullptr; - cudecompGridDesc_t grid_desc = nullptr; - cudecompTransposeCommBackend_t backend = CUDECOMP_TRANSPOSE_COMM_MPI_P2P; +template struct workspaceGuardDeleter { + cudecompHandle_t handle; + cudecompGridDesc_t grid_desc; + Backend backend; void operator()(void* ptr) const noexcept { if (!ptr) return; - grid_desc->config.transpose_comm_backend = backend; - cudecompFree(handle, grid_desc, ptr); - } -}; - -struct haloWorkspaceGuardDeleter { - cudecompHandle_t handle = nullptr; - cudecompGridDesc_t grid_desc = nullptr; - cudecompHaloCommBackend_t backend = CUDECOMP_HALO_COMM_MPI; - - void operator()(void* ptr) const noexcept { - if (!ptr) return; - - grid_desc->config.halo_comm_backend = backend; + if constexpr (std::is_same_v) { + grid_desc->config.transpose_comm_backend = backend; + } else { + grid_desc->config.halo_comm_backend = backend; + } cudecompFree(handle, grid_desc, ptr); } }; using cudaBufferGuard = std::unique_ptr; -using transposeWorkspaceGuard = std::unique_ptr; -using haloWorkspaceGuard = std::unique_ptr; +using transposeWorkspaceGuard = std::unique_ptr>; +using haloWorkspaceGuard = std::unique_ptr>; static std::vector getFactors(int N) { std::vector factors; @@ -193,8 +185,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d cudaBufferGuard data_guard; cudaBufferGuard data2_guard; - transposeWorkspaceGuard work_guard; - transposeWorkspaceGuard work_nvshmem_guard; + transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P}); + transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM}); int64_t data_sz = 0; int64_t work_sz = 0; @@ -673,8 +665,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, void* work_nvshmem = nullptr; cudaBufferGuard data_guard; - haloWorkspaceGuard work_guard; - haloWorkspaceGuard work_nvshmem_guard; + haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI}); + haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM}); int64_t data_sz = 0; int64_t work_sz = 0;