NVIDIA · romerojosh · May 5, 2026 · May 4, 2026 · May 5, 2026 · May 5, 2026
diff --git a/include/internal/graph.h b/include/internal/graph.h
@@ -35,14 +35,16 @@ class graphCache {
   using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t, cudecompDataType_t>;
 
 public:
-  ~graphCache();
+  ~graphCache() noexcept;
   void replay(const key_type& key, cudaStream_t stream) const;
   cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const;
   void endCapture(const key_type& key);
   bool cached(const key_type& key) const;
   void clear();
 
 private:
+  void clearNoThrow() noexcept;
+
   std::unordered_map<key_type, cudaGraphExec_t> graph_cache_;
   cudaStream graph_stream_;
 };

diff --git a/src/autotune.cc b/src/autotune.cc
@@ -21,8 +21,10 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <memory>
 #include <numeric>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include <cuda_runtime.h>
@@ -43,6 +45,33 @@
 namespace cudecomp {
 namespace {
 
+struct cudaBufferGuardDeleter {
+  void operator()(void* ptr) const noexcept {
+    if (ptr) { cudaFree(ptr); }
+  }
+};
+
+template <typename Backend> struct workspaceGuardDeleter {
+  cudecompHandle_t handle;
+  cudecompGridDesc_t grid_desc;
+  Backend backend;
+
+  void operator()(void* ptr) const noexcept {
+    if (!ptr) return;
+
+    if constexpr (std::is_same_v<Backend, cudecompTransposeCommBackend_t>) {
+      grid_desc->config.transpose_comm_backend = backend;
+    } else {
+      grid_desc->config.halo_comm_backend = backend;
+    }
+    cudecompFree(handle, grid_desc, ptr);
+  }
+};
+
+using cudaBufferGuard = std::unique_ptr<void, cudaBufferGuardDeleter>;
+using transposeWorkspaceGuard = std::unique_ptr<void, workspaceGuardDeleter<cudecompTransposeCommBackend_t>>;
+using haloWorkspaceGuard = std::unique_ptr<void, workspaceGuardDeleter<cudecompHaloCommBackend_t>>;
+
 static std::vector<int> getFactors(int N) {
   std::vector<int> factors;
   for (int i = 1; i <= std::sqrt(N); ++i) {
@@ -154,6 +183,11 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   void* work = nullptr;
   void* work_nvshmem = nullptr;
 
+  cudaBufferGuard data_guard;
+  cudaBufferGuard data2_guard;
+  transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P});
+  transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});
+
   int64_t data_sz = 0;
   int64_t work_sz = 0;
 
@@ -213,11 +247,19 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     int64_t work_sz_new = num_elements_work * dtype_size;
     if (data_sz_new > data_sz) {
       data_sz = data_sz_new;
-      if (data) CHECK_CUDA(cudaFree(data));
+      if (data) {
+        data_guard.release();
+        CHECK_CUDA(cudaFree(data));
+      }
       CHECK_CUDA(cudaMalloc(&data, data_sz));
+      data_guard.reset(data);
       if (need_data2) {
-        if (data2) CHECK_CUDA(cudaFree(data2));
+        if (data2) {
+          data2_guard.release();
+          CHECK_CUDA(cudaFree(data2));
+        }
         CHECK_CUDA(cudaMalloc(&data2, data_sz));
+        data2_guard.reset(data2);
       }
     }
 
@@ -232,15 +274,21 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
           auto tmp = grid_desc->config.transpose_comm_backend;
           grid_desc->config.transpose_comm_backend =
               (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
+          work_guard.release();
           CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
           grid_desc->config.transpose_comm_backend = tmp;
         }
         // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
         auto tmp = grid_desc->config.transpose_comm_backend;
         grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
-        if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
+        if (work_nvshmem) {
+          work_nvshmem_guard.release();
+          CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
+        }
         CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_nvshmem), work_sz));
         grid_desc->config.transpose_comm_backend = tmp;
+        work_nvshmem_guard =
+            transposeWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});
 
         // Check if there is enough memory for separate non-NVSHMEM allocated work buffer
         auto ret = cudaMalloc(&work, work_sz);
@@ -254,22 +302,25 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
           cudaGetLastError(); // Reset CUDA error state
         } else {
           CHECK_CUDA(cudaFree(work));
-          auto tmp = grid_desc->config.transpose_comm_backend;
-          grid_desc->config.transpose_comm_backend =
-              (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
-          cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz);
+          auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
+          tmp = grid_desc->config.transpose_comm_backend;
+          grid_desc->config.transpose_comm_backend = backend;
+          CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
           grid_desc->config.transpose_comm_backend = tmp;
-          // Check after restoring the temporary backend used to select the allocation path.
-          CHECK_CUDECOMP(ret);
+          work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend});
         }
 #endif
       } else {
+        auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
         auto tmp = grid_desc->config.transpose_comm_backend;
-        grid_desc->config.transpose_comm_backend =
-            (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
-        if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); }
+        grid_desc->config.transpose_comm_backend = backend;
+        if (work) {
+          work_guard.release();
+          CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
+        }
         CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
         grid_desc->config.transpose_comm_backend = tmp;
+        work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend});
       }
     }
 
@@ -490,26 +541,33 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       auto tmp = grid_desc->config.transpose_comm_backend;
       grid_desc->config.transpose_comm_backend =
           (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
+      work_guard.release();
       CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
       grid_desc->config.transpose_comm_backend = tmp;
     }
 #ifdef ENABLE_NVSHMEM
     // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
     auto tmp = grid_desc->config.transpose_comm_backend;
     grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
+    work_nvshmem_guard.release();
     CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
     grid_desc->config.transpose_comm_backend = tmp;
 #endif
   } else {
     auto tmp = grid_desc->config.transpose_comm_backend;
     grid_desc->config.transpose_comm_backend =
         (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
+    work_guard.release();
     CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
     grid_desc->config.transpose_comm_backend = tmp;
   }
 
+  data_guard.release();
   CHECK_CUDA(cudaFree(data));
-  if (need_data2) { CHECK_CUDA(cudaFree(data2)); }
+  if (need_data2) {
+    data2_guard.release();
+    CHECK_CUDA(cudaFree(data2));
+  }
 
   // Set handle to best option (broadcast from rank 0 for consistency)
   CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
@@ -535,12 +593,17 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   // registration handles on larger test data buffers. These stale registration handles can cause to delays in freeing
   // the test buffer GPU memory.
   char *tmp1, *tmp2;
+  cudaBufferGuard tmp1_guard, tmp2_guard;
   size_t per_rank_size = (1024 * 1024 + handle->nranks - 1) / handle->nranks;
   CHECK_CUDA(cudaMalloc(&tmp1, per_rank_size * handle->nranks * sizeof(*tmp1)));
+  tmp1_guard.reset(tmp1);
   CHECK_CUDA(cudaMalloc(&tmp2, per_rank_size * handle->nranks * sizeof(*tmp2)));
+  tmp2_guard.reset(tmp2);
   CHECK_MPI(MPI_Alltoall(tmp1, per_rank_size, MPI_CHAR, tmp2, per_rank_size, MPI_CHAR, handle->mpi_comm));
   CHECK_MPI(MPI_Barrier(handle->mpi_comm));
+  tmp1_guard.release();
   CHECK_CUDA(cudaFree(tmp1));
+  tmp2_guard.release();
   CHECK_CUDA(cudaFree(tmp2));
 
   // Reset performance samples after autotuning
@@ -601,6 +664,10 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   void* work = nullptr;
   void* work_nvshmem = nullptr;
 
+  cudaBufferGuard data_guard;
+  haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI});
+  haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});
+
   int64_t data_sz = 0;
   int64_t work_sz = 0;
 
@@ -644,8 +711,12 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     int64_t work_sz_new = num_elements_work * dtype_size;
     if (data_sz_new > data_sz) {
       data_sz = data_sz_new;
-      if (data) CHECK_CUDA(cudaFree(data));
+      if (data) {
+        data_guard.release();
+        CHECK_CUDA(cudaFree(data));
+      }
       CHECK_CUDA(cudaMalloc(&data, data_sz));
+      data_guard.reset(data);
     }
 
     // For nvshmem, buffers must be the same size. Find global maximums.
@@ -658,15 +729,20 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
         if (work && work != work_nvshmem) {
           auto tmp = grid_desc->config.halo_comm_backend;
           grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
+          work_guard.release();
           CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
           grid_desc->config.halo_comm_backend = tmp;
         }
         // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
         auto tmp = grid_desc->config.halo_comm_backend;
         grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
-        if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
+        if (work_nvshmem) {
+          work_nvshmem_guard.release();
+          CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
+        }
         CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_nvshmem), work_sz));
         grid_desc->config.halo_comm_backend = tmp;
+        work_nvshmem_guard = haloWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});
 
         // Check if there is enough memory for separate non-NVSHMEM allocated work buffer
         auto ret = cudaMalloc(&work, work_sz);
@@ -680,20 +756,25 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
           cudaGetLastError(); // Reset CUDA error state
         } else {
           CHECK_CUDA(cudaFree(work));
-          auto tmp = grid_desc->config.halo_comm_backend;
-          grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
-          cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz);
+          auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
+          tmp = grid_desc->config.halo_comm_backend;
+          grid_desc->config.halo_comm_backend = backend;
+          CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
           grid_desc->config.halo_comm_backend = tmp;
-          // Check after restoring the temporary backend used to select the allocation path.
-          CHECK_CUDECOMP(ret);
+          work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend});
         }
 #endif
       } else {
+        auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
         auto tmp = grid_desc->config.halo_comm_backend;
-        grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
-        if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); }
+        grid_desc->config.halo_comm_backend = backend;
+        if (work) {
+          work_guard.release();
+          CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
+        }
         CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
         grid_desc->config.halo_comm_backend = tmp;
+        work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend});
       }
     }
 
@@ -827,23 +908,27 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     if (work != work_nvshmem) {
       auto tmp = grid_desc->config.halo_comm_backend;
       grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
+      work_guard.release();
       CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
       grid_desc->config.halo_comm_backend = tmp;
     }
 #ifdef ENABLE_NVSHMEM
     // Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
     auto tmp = grid_desc->config.halo_comm_backend;
     grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
+    work_nvshmem_guard.release();
     CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
     grid_desc->config.halo_comm_backend = tmp;
 #endif
   } else {
     auto tmp = grid_desc->config.halo_comm_backend;
     grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
+    work_guard.release();
     CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
     grid_desc->config.halo_comm_backend = tmp;
   }
 
+  data_guard.release();
   CHECK_CUDA(cudaFree(data));
 
   // Set handle to best option (broadcast from rank 0 for consistency)

diff --git a/src/graph.cc b/src/graph.cc
@@ -27,7 +27,7 @@
 
 namespace cudecomp {
 
-graphCache::~graphCache() { this->clear(); }
+graphCache::~graphCache() noexcept { clearNoThrow(); }
 
 void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
   CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream));
@@ -58,4 +58,12 @@ void graphCache::clear() {
   graph_cache_.clear();
 }
 
+void graphCache::clearNoThrow() noexcept {
+  for (auto& entry : graph_cache_) {
+    cudaGraphExecDestroy(entry.second);
+  }
+
+  graph_cache_.clear();
+}
+
 } // namespace cudecomp