NVIDIA · romerojosh · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/include/internal/common.h b/include/internal/common.h
@@ -45,6 +45,7 @@
 
 #include "cudecomp.h"
 #include "internal/checks.h"
+#include "internal/cuda_event.h"
 #include "internal/graph.h"
 
 namespace cudecomp {
@@ -158,10 +159,10 @@ struct cudecompCommInfo {
 
 // Structure to contain data for transpose performance sample
 struct cudecompTransposePerformanceSample {
-  cudaEvent_t transpose_start_event;
-  cudaEvent_t transpose_end_event;
-  std::vector<cudaEvent_t> alltoall_start_events;
-  std::vector<cudaEvent_t> alltoall_end_events;
+  cudecomp::cudaEventTimed transpose_start_event;
+  cudecomp::cudaEventTimed transpose_end_event;
+  std::vector<cudecomp::cudaEventTimed> alltoall_start_events;
+  std::vector<cudecomp::cudaEventTimed> alltoall_end_events;
   int32_t alltoall_timing_count = 0;
   size_t alltoall_bytes = 0;
   bool valid = false;
@@ -176,10 +177,10 @@ struct cudecompTransposePerformanceSampleCollection {
 
 // Structure to contain data for halo performance sample
 struct cudecompHaloPerformanceSample {
-  cudaEvent_t halo_start_event;
-  cudaEvent_t halo_end_event;
-  cudaEvent_t sendrecv_start_event;
-  cudaEvent_t sendrecv_end_event;
+  cudecomp::cudaEventTimed halo_start_event;
+  cudecomp::cudaEventTimed halo_end_event;
+  cudecomp::cudaEventTimed sendrecv_start_event;
+  cudecomp::cudaEventTimed sendrecv_end_event;
   size_t sendrecv_bytes = 0;
   bool valid = false;
 };
@@ -202,8 +203,8 @@ struct cudecompGridDesc {
   cudecompCommInfo row_comm_info; // row communicator information
   cudecompCommInfo col_comm_info; // column communicator information
 
-  std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
-  cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization
+  std::vector<cudecomp::cudaEvent> events; // CUDA events used for scheduling
+  cudecomp::cudaEvent nvshmem_sync_event;  // NVSHMEM event used for synchronization
 
 #ifdef ENABLE_NVSHMEM
   int* nvshmem_block_counters = nullptr;    // device memory counters for SM alltoallv last-block detection
@@ -217,11 +218,11 @@ struct cudecompGridDesc {
       nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
 
   // Performance reporting related entries
-  std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
-  std::vector<cudaEvent_t> alltoall_end_events;   // events for alltoall timing
+  std::vector<cudecomp::cudaEventTimed> alltoall_start_events; // events for alltoall timing
+  std::vector<cudecomp::cudaEventTimed> alltoall_end_events;   // events for alltoall timing
   int32_t alltoall_timing_count = 0;              // count of alltoall timing events pairs (for pipelined alltoall)
-  cudaEvent_t transpose_start_event;              // event for transpose timing
-  cudaEvent_t transpose_end_event;                // event for transpose timing
+  cudecomp::cudaEventTimed transpose_start_event; // event for transpose timing
+  cudecomp::cudaEventTimed transpose_end_event;   // event for transpose timing
 
   std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
                                 std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,

diff --git a/include/internal/cuda_event.h b/include/internal/cuda_event.h
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDECOMP_CUDA_EVENT_H
+#define CUDECOMP_CUDA_EVENT_H
+
+#include <utility>
+
+#include <cuda_runtime.h>
+
+#include "internal/checks.h"
+
+namespace cudecomp {
+
+template <unsigned int flags> class cudaEventBase {
+public:
+  cudaEventBase() { CHECK_CUDA(cudaEventCreateWithFlags(&event_, flags)); }
+  ~cudaEventBase() noexcept { resetNoThrow(); }
+
+  cudaEventBase(const cudaEventBase&) = delete;
+  cudaEventBase& operator=(const cudaEventBase&) = delete;
+
+  cudaEventBase(cudaEventBase&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {}
+
+  cudaEventBase& operator=(cudaEventBase&& other) noexcept {
+    if (this != &other) {
+      resetNoThrow();
+      event_ = std::exchange(other.event_, nullptr);
+    }
+    return *this;
+  }
+
+  cudaEvent_t get() const noexcept { return event_; }
+  operator cudaEvent_t() const noexcept { return event_; }
+
+private:
+  void resetNoThrow() noexcept {
+    if (event_) {
+      cudaEventDestroy(event_);
+      event_ = nullptr;
+    }
+  }
+
+  cudaEvent_t event_ = nullptr;
+};
+
+using cudaEvent = cudaEventBase<cudaEventDisableTiming>;
+using cudaEventTimed = cudaEventBase<cudaEventDefault>;
+
+} // namespace cudecomp
+
+#endif // CUDECOMP_CUDA_EVENT_H
diff --git a/src/autotune.cc b/src/autotune.cc
@@ -101,10 +101,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   double t_start = MPI_Wtime();
 
   // Create cuda_events for intermediate timings (5 events per trial: start + 4 op boundaries)
-  std::vector<cudaEvent_t> events(5 * options->n_trials);
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventCreate(&event));
-  }
+  std::vector<cudaEventTimed> events(5 * options->n_trials);
 
   bool autotune_comm = options->autotune_transpose_backend;
   bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
@@ -536,11 +533,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   CHECK_CUDA(cudaFree(data));
   if (need_data2) { CHECK_CUDA(cudaFree(data2)); }
 
-  // Delete cuda events
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventDestroy(event));
-  }
-
   // Set handle to best option (broadcast from rank 0 for consistency)
   CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
   CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));
@@ -587,10 +579,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   double t_start = MPI_Wtime();
 
   // Create cuda events for timing (one per trial boundary: n_trials + 1 total)
-  std::vector<cudaEvent_t> events(options->n_trials + 1);
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventCreate(&event));
-  }
+  std::vector<cudaEventTimed> events(options->n_trials + 1);
 
   bool autotune_comm = options->autotune_halo_backend;
   bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
@@ -901,11 +890,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
 
   CHECK_CUDA(cudaFree(data));
 
-  // Delete cuda events
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventDestroy(event));
-  }
-
   // Set handle to best option (broadcast from rank 0 for consistency)
   CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompHaloCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
   CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));

diff --git a/src/cudecomp.cc b/src/cudecomp.cc
@@ -804,12 +804,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
 
     // Create CUDA events for scheduling
     grid_desc->events.resize(handle->nranks);
-    for (auto& event : grid_desc->events) {
-      CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
-    }
-#ifdef ENABLE_NVSHMEM
-    CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming));
-#endif
 
     // Run autotuning if requested
     if (options) {
@@ -922,44 +916,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
       CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
     }
 
-    for (auto e : grid_desc->events) {
-      if (e) { CHECK_CUDA(cudaEventDestroy(e)); }
-    }
-
-#ifdef ENABLE_NVSHMEM
-    if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); }
-#endif
-
-    if (handle->performance_report_enable) {
-      // Print performance report before destroying events
-      printPerformanceReport(handle, grid_desc);
-
-      // Destroy all transpose performance sample events in the map
-      for (auto& entry : grid_desc->transpose_perf_samples_map) {
-        auto& collection = entry.second;
-        for (auto& sample : collection.samples) {
-          CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event));
-          for (auto& event : sample.alltoall_start_events) {
-            CHECK_CUDA(cudaEventDestroy(event));
-          }
-          for (auto& event : sample.alltoall_end_events) {
-            CHECK_CUDA(cudaEventDestroy(event));
-          }
-        }
-      }
-
-      // Destroy all halo performance sample events in the map
-      for (auto& entry : grid_desc->halo_perf_samples_map) {
-        auto& collection = entry.second;
-        for (auto& sample : collection.samples) {
-          CHECK_CUDA(cudaEventDestroy(sample.halo_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.halo_end_event));
-          CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event));
-        }
-      }
-    }
+    // Print performance report if enabled
+    if (handle->performance_report_enable) { printPerformanceReport(handle, grid_desc); }
 
     if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
         haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

diff --git a/src/performance.cc b/src/performance.cc
@@ -207,21 +207,11 @@ getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGr
     // Create new sample collection for this configuration
     cudecompTransposePerformanceSampleCollection collection;
     collection.samples.resize(handle->performance_report_samples);
-    collection.sample_idx = 0;
 
-    // Create events for each sample
+    // Create per-rank alltoall events for each sample
     for (auto& sample : collection.samples) {
-      CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event));
       sample.alltoall_start_events.resize(handle->nranks);
       sample.alltoall_end_events.resize(handle->nranks);
-      for (auto& event : sample.alltoall_start_events) {
-        CHECK_CUDA(cudaEventCreate(&event));
-      }
-      for (auto& event : sample.alltoall_end_events) {
-        CHECK_CUDA(cudaEventCreate(&event));
-      }
-      sample.valid = false;
     }
 
     samples_map[config] = std::move(collection);
@@ -240,16 +230,6 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const
     // Create new sample collection for this configuration
     cudecompHaloPerformanceSampleCollection collection;
     collection.samples.resize(handle->performance_report_samples);
-    collection.sample_idx = 0;
-
-    // Create events for each sample
-    for (auto& sample : collection.samples) {
-      CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
-      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
-      sample.valid = false;
-    }
 
     samples_map[config] = std::move(collection);
   }