NVIDIA · romerojosh · May 5, 2026 · May 4, 2026 · May 4, 2026
diff --git a/include/internal/comm_routines.h b/include/internal/comm_routines.h
@@ -511,7 +511,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
 #endif
   }
   case CUDECOMP_TRANSPOSE_COMM_NCCL_PL: {
-    auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
+    const auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
     // For fully intra-group alltoall, use distinct NCCL local comm instead of global comm as it is faster.
     auto comm = (comm_info.ngroups == 1) ? *grid_desc->nccl_local_comm : *grid_desc->nccl_comm;
     auto pl_stream = handle->streams[0].get();

diff --git a/include/internal/common.h b/include/internal/common.h
@@ -74,6 +74,13 @@ typedef std::shared_ptr<nvshmemRuntimeState> nvshmemRuntime;
 
 // cuDecomp handle containing general information
 struct cudecompHandle {
+  cudecompHandle() = default;
+  ~cudecompHandle() noexcept;
+
+  cudecompHandle(const cudecompHandle&) = delete;
+  cudecompHandle& operator=(const cudecompHandle&) = delete;
+  cudecompHandle(cudecompHandle&&) = delete;
+  cudecompHandle& operator=(cudecompHandle&&) = delete;
 
   MPI_Comm mpi_comm = MPI_COMM_NULL; // MPI communicator
   int32_t rank;                      // MPI rank
@@ -96,10 +103,12 @@ struct cudecompHandle {
 
   std::vector<cudecomp::cudaStream> streams; // internal streams for concurrent scheduling
 
-  cutensorHandle_t cutensor_handle; // cuTENSOR handle;
 #if CUTENSOR_MAJOR >= 2
-  cutensorPlanPreference_t cutensor_plan_pref;  // cuTENSOR plan preference;
-  bool cutensor_needs_permute_chunking = false; // Flag to enable large tensor workaround
+  cutensorHandle_t cutensor_handle = nullptr;            // cuTENSOR handle;
+  cutensorPlanPreference_t cutensor_plan_pref = nullptr; // cuTENSOR plan preference;
+  bool cutensor_needs_permute_chunking = false;          // Flag to enable large tensor workaround
+#else
+  cutensorHandle_t cutensor_handle; // cuTENSOR handle;
 #endif
 
   std::vector<std::array<char, MPI_MAX_PROCESSOR_NAME>> hostnames; // list of hostnames by rank
@@ -134,6 +143,7 @@ struct cudecompHandle {
       ""; // directory to write CSV performance reports, empty means no file writing
 
   // Miscellaneous
+  bool nvml_initialized = false;                        // Flag to track NVML initialization
   int32_t device_p2p_ce_count = 0;                      // number of P2P CEs available
   int32_t device_num_sms = 0;                           // number of SMs on the device
   int32_t device_max_threads_per_sm = 0;                // maximum threads per SM
@@ -142,9 +152,40 @@ struct cudecompHandle {
 
 // Structure with information about row/column communicator
 struct cudecompCommInfo {
+  cudecompCommInfo() = default;
+  ~cudecompCommInfo() noexcept { reset(); }
+
+  cudecompCommInfo(const cudecompCommInfo&) = delete;
+  cudecompCommInfo& operator=(const cudecompCommInfo&) = delete;
+  cudecompCommInfo(cudecompCommInfo&&) = delete;
+  cudecompCommInfo& operator=(cudecompCommInfo&&) = delete;
+
+  void reset() noexcept {
+    if (mpi_comm != MPI_COMM_NULL) {
+      MPI_Comm comm = mpi_comm;
+      mpi_comm = MPI_COMM_NULL;
+      MPI_Comm_free(&comm);
+    }
+#ifdef ENABLE_NVSHMEM
+    if (nvshmem_team != NVSHMEM_TEAM_INVALID) {
+      nvshmem_team_destroy(nvshmem_team);
+      nvshmem_team = NVSHMEM_TEAM_INVALID;
+    }
+    if (nvshmem_signals) {
+      nvshmem_free(nvshmem_signals);
+      nvshmem_signals = nullptr;
+    }
+#endif
+    rank = 0;
+    nranks = 0;
+    ngroups = 0;
+    npergroup = 0;
+    mnnvl_active = false;
+  }
+
   MPI_Comm mpi_comm = MPI_COMM_NULL;
-  int32_t rank;
-  int32_t nranks;
+  int32_t rank = 0;
+  int32_t nranks = 0;
 
   int32_t ngroups = 0;   // number of p2p groups (i.e. grouping of ranks with fast interconnect) in communicator
   int32_t npergroup = 0; // number of ranks per p2p group
@@ -194,6 +235,14 @@ struct cudecompHaloPerformanceSampleCollection {
 
 // cuDecomp grid descriptor containing grid-specific information
 struct cudecompGridDesc {
+  ~cudecompGridDesc() noexcept {
+    row_comm_info.reset();
+    col_comm_info.reset();
+#ifdef ENABLE_NVSHMEM
+    if (nvshmem_block_counters) { cudaFree(nvshmem_block_counters); }
+#endif
+  }
+
   cudecompGridDescConfig_t config;      // configuration struct
   bool gdims_dist_set = false;          // flag to record if gdims_dist was set to non-default values
   bool transpose_mem_order_set = false; // flag to record if transpose_mem_order was set to non-default values
@@ -355,6 +404,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
                         cudecompCommAxis comm_axis) {
   auto& info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
 
+  info.reset();
   info.mpi_comm = mpi_comm;
   CHECK_MPI(MPI_Comm_rank(info.mpi_comm, &info.rank));
   CHECK_MPI(MPI_Comm_size(info.mpi_comm, &info.nranks));
@@ -420,6 +470,43 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
   info.ngroups = info.nranks / info.npergroup;
 }
 
+static void createCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc, bool need_nvshmem = false) {
+  grid_desc->row_comm_info.reset();
+  grid_desc->col_comm_info.reset();
+
+  setProcessGridIndex(handle, grid_desc);
+
+  MPI_Comm row_comm;
+  CHECK_MPI(MPI_Comm_split(handle->mpi_comm, grid_desc->pidx[0], handle->rank, &row_comm));
+  setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);
+
+  MPI_Comm col_comm;
+  CHECK_MPI(MPI_Comm_split(handle->mpi_comm, grid_desc->pidx[1], handle->rank, &col_comm));
+  setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);
+
+#ifdef ENABLE_NVSHMEM
+  if (need_nvshmem) {
+    nvshmem_team_config_t tmp;
+    nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
+                          &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
+
+    grid_desc->row_comm_info.nvshmem_signals =
+        (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
+    if (!grid_desc->row_comm_info.nvshmem_signals) { THROW_NVSHMEM_ERROR("nvshmem_malloc failed"); }
+    CHECK_CUDA(
+        cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
+
+    grid_desc->col_comm_info.nvshmem_signals =
+        (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
+    if (!grid_desc->col_comm_info.nvshmem_signals) { THROW_NVSHMEM_ERROR("nvshmem_malloc failed"); }
+    CHECK_CUDA(
+        cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
+  }
+#else
+  if (need_nvshmem) { THROW_NOT_SUPPORTED("build does not support NVSHMEM communication backends."); }
+#endif
+}
+
 static inline void getAlltoallPeerRanks(cudecompGridDesc_t grid_desc, cudecompCommAxis comm_axis, int iter,
                                         int& src_rank, int& dst_rank) {
 

diff --git a/src/autotune.cc b/src/autotune.cc
@@ -274,28 +274,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     }
 
     // Create test row and column communicators
-    int color_row = grid_desc->pidx[0];
-    MPI_Comm row_comm;
-    CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
-    setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);
-
-    int color_col = grid_desc->pidx[1];
-    MPI_Comm col_comm;
-    CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_col, handle->rank, &col_comm));
-    setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);
+    createCommInfo(handle, grid_desc, need_nvshmem);
     if (need_nvshmem) {
 #ifdef ENABLE_NVSHMEM
-      nvshmem_team_config_t tmp;
-      nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
-                            &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
-      grid_desc->row_comm_info.nvshmem_signals =
-          (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
-      CHECK_CUDA(
-          cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
-      grid_desc->col_comm_info.nvshmem_signals =
-          (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
-      CHECK_CUDA(
-          cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
       CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
       CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
 #endif
@@ -490,18 +471,15 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
       }
     }
 
-    // Destroy test communicators
-    CHECK_MPI(MPI_Comm_free(&grid_desc->row_comm_info.mpi_comm));
-    CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
+    // Destroy test communicator resources
+    grid_desc->row_comm_info.reset();
+    grid_desc->col_comm_info.reset();
     if (need_nvshmem) {
 #ifdef ENABLE_NVSHMEM
-      nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
-      nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
-      nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
-      nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
-      CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
-      grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
-      grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
+      if (grid_desc->nvshmem_block_counters) {
+        CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
+        grid_desc->nvshmem_block_counters = nullptr;
+      }
 #endif
     }
   }
@@ -720,28 +698,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     }
 
     // Create test row and column communicators
-    int color_row = grid_desc->pidx[0];
-    MPI_Comm row_comm;
-    CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
-    setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);
-
-    int color_col = grid_desc->pidx[1];
-    MPI_Comm col_comm;
-    CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_col, handle->rank, &col_comm));
-    setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);
+    createCommInfo(handle, grid_desc, need_nvshmem);
     if (need_nvshmem) {
 #ifdef ENABLE_NVSHMEM
-      nvshmem_team_config_t tmp;
-      nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
-                            &grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
-      grid_desc->row_comm_info.nvshmem_signals =
-          (uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
-      CHECK_CUDA(
-          cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
-      grid_desc->col_comm_info.nvshmem_signals =
-          (uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
-      CHECK_CUDA(
-          cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
       CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
       CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
 #endif
@@ -850,18 +809,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
       }
     }
 
-    // Destroy test communicators
-    CHECK_MPI(MPI_Comm_free(&grid_desc->row_comm_info.mpi_comm));
-    CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
+    // Destroy test communicator resources
+    grid_desc->row_comm_info.reset();
+    grid_desc->col_comm_info.reset();
     if (need_nvshmem) {
 #ifdef ENABLE_NVSHMEM
-      nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
-      nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
-      nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
-      nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
-      CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
-      grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
-      grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
+      if (grid_desc->nvshmem_block_counters) {
+        CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
+        grid_desc->nvshmem_block_counters = nullptr;
+      }
 #endif
     }
   }