From 658d9831d9aa14e8313a882a045aba236d12c79c Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 4 May 2026 10:10:22 -0700
Subject: [PATCH 1/3] Use RAII cudaEvent wrapper to simplify event
 creation/cleanup.

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 include/internal/common.h     | 29 ++++++------
 include/internal/cuda_event.h | 84 +++++++++++++++++++++++++++++++++++
 src/autotune.cc               | 18 ++------
 src/cudecomp.cc               | 40 ++---------------
 src/performance.cc            | 16 +++----
 5 files changed, 114 insertions(+), 73 deletions(-)
 create mode 100644 include/internal/cuda_event.h

diff --git a/include/internal/common.h b/include/internal/common.h
index d763cdd..a083b7b 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -45,6 +45,7 @@
 
 #include "cudecomp.h"
 #include "internal/checks.h"
+#include "internal/cuda_event.h"
 #include "internal/graph.h"
 
 namespace cudecomp {
@@ -158,10 +159,10 @@ struct cudecompCommInfo {
 
 // Structure to contain data for transpose performance sample
 struct cudecompTransposePerformanceSample {
-  cudaEvent_t transpose_start_event;
-  cudaEvent_t transpose_end_event;
-  std::vector<cudaEvent_t> alltoall_start_events;
-  std::vector<cudaEvent_t> alltoall_end_events;
+  cudecomp::cudaEvent transpose_start_event;
+  cudecomp::cudaEvent transpose_end_event;
+  std::vector<cudecomp::cudaEvent> alltoall_start_events;
+  std::vector<cudecomp::cudaEvent> alltoall_end_events;
   int32_t alltoall_timing_count = 0;
   size_t alltoall_bytes = 0;
   bool valid = false;
@@ -176,10 +177,10 @@ struct cudecompTransposePerformanceSampleCollection {
 
 // Structure to contain data for halo performance sample
 struct cudecompHaloPerformanceSample {
-  cudaEvent_t halo_start_event;
-  cudaEvent_t halo_end_event;
-  cudaEvent_t sendrecv_start_event;
-  cudaEvent_t sendrecv_end_event;
+  cudecomp::cudaEvent halo_start_event;
+  cudecomp::cudaEvent halo_end_event;
+  cudecomp::cudaEvent sendrecv_start_event;
+  cudecomp::cudaEvent sendrecv_end_event;
   size_t sendrecv_bytes = 0;
   bool valid = false;
 };
@@ -202,8 +203,8 @@ struct cudecompGridDesc {
   cudecompCommInfo row_comm_info; // row communicator information
   cudecompCommInfo col_comm_info; // column communicator information
 
-  std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
-  cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization
+  std::vector<cudecomp::cudaEvent> events;        // CUDA events used for scheduling
+  cudecomp::cudaEvent nvshmem_sync_event;         // NVSHMEM event used for synchronization
 
 #ifdef ENABLE_NVSHMEM
   int* nvshmem_block_counters = nullptr;    // device memory counters for SM alltoallv last-block detection
@@ -217,11 +218,11 @@ struct cudecompGridDesc {
       nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
 
   // Performance reporting related entries
-  std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
-  std::vector<cudaEvent_t> alltoall_end_events;   // events for alltoall timing
+  std::vector<cudecomp::cudaEvent> alltoall_start_events; // events for alltoall timing
+  std::vector<cudecomp::cudaEvent> alltoall_end_events;   // events for alltoall timing
   int32_t alltoall_timing_count = 0;              // count of alltoall timing events pairs (for pipelined alltoall)
-  cudaEvent_t transpose_start_event;              // event for transpose timing
-  cudaEvent_t transpose_end_event;                // event for transpose timing
+  cudecomp::cudaEvent transpose_start_event;      // event for transpose timing
+  cudecomp::cudaEvent transpose_end_event;        // event for transpose timing
 
   std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
                                 std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
diff --git a/include/internal/cuda_event.h b/include/internal/cuda_event.h
new file mode 100644
index 0000000..31554a8
--- /dev/null
+++ b/include/internal/cuda_event.h
@@ -0,0 +1,84 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef CUDECOMP_CUDA_EVENT_H
+#define CUDECOMP_CUDA_EVENT_H
+
+#include <utility>
+
+#include <cuda_runtime.h>
+
+#include "internal/checks.h"
+
+namespace cudecomp {
+
+class cudaEvent {
+public:
+  cudaEvent() = default;
+  ~cudaEvent() noexcept { resetNoThrow(); }
+
+  cudaEvent(const cudaEvent&) = delete;
+  cudaEvent& operator=(const cudaEvent&) = delete;
+
+  cudaEvent(cudaEvent&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {}
+
+  cudaEvent& operator=(cudaEvent&& other) noexcept {
+    if (this != &other) {
+      resetNoThrow();
+      event_ = std::exchange(other.event_, nullptr);
+    }
+    return *this;
+  }
+
+  void create() {
+    reset();
+    cudaEvent_t event = nullptr;
+    CHECK_CUDA(cudaEventCreate(&event));
+    event_ = event;
+  }
+
+  void createWithFlags(unsigned int flags) {
+    reset();
+    cudaEvent_t event = nullptr;
+    CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
+    event_ = event;
+  }
+
+  void reset() {
+    if (event_) {
+      CHECK_CUDA(cudaEventDestroy(event_));
+      event_ = nullptr;
+    }
+  }
+
+  void resetNoThrow() noexcept {
+    if (event_) {
+      cudaEventDestroy(event_);
+      event_ = nullptr;
+    }
+  }
+
+  cudaEvent_t get() const noexcept { return event_; }
+  operator cudaEvent_t() const noexcept { return event_; }
+
+private:
+  cudaEvent_t event_ = nullptr;
+};
+
+} // namespace cudecomp
+
+#endif // CUDECOMP_CUDA_EVENT_H
diff --git a/src/autotune.cc b/src/autotune.cc
index a1ce48f..ae12edb 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -101,9 +101,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   double t_start = MPI_Wtime();
 
   // Create cuda_events for intermediate timings (5 events per trial: start + 4 op boundaries)
-  std::vector<cudaEvent_t> events(5 * options->n_trials);
+  std::vector<cudaEvent> events(5 * options->n_trials);
   for (auto& event : events) {
-    CHECK_CUDA(cudaEventCreate(&event));
+    event.create();
   }
 
   bool autotune_comm = options->autotune_transpose_backend;
@@ -536,11 +536,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   CHECK_CUDA(cudaFree(data));
   if (need_data2) { CHECK_CUDA(cudaFree(data2)); }
 
-  // Delete cuda events
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventDestroy(event));
-  }
-
   // Set handle to best option (broadcast from rank 0 for consistency)
   CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
   CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));
@@ -587,9 +582,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   double t_start = MPI_Wtime();
 
   // Create cuda events for timing (one per trial boundary: n_trials + 1 total)
-  std::vector<cudaEvent_t> events(options->n_trials + 1);
+  std::vector<cudaEvent> events(options->n_trials + 1);
   for (auto& event : events) {
-    CHECK_CUDA(cudaEventCreate(&event));
+    event.create();
   }
 
   bool autotune_comm = options->autotune_halo_backend;
@@ -901,11 +896,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
 
   CHECK_CUDA(cudaFree(data));
 
-  // Delete cuda events
-  for (auto& event : events) {
-    CHECK_CUDA(cudaEventDestroy(event));
-  }
-
   // Set handle to best option (broadcast from rank 0 for consistency)
   CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompHaloCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
   CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 53b71b2..8437d28 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -805,10 +805,10 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
     // Create CUDA events for scheduling
     grid_desc->events.resize(handle->nranks);
     for (auto& event : grid_desc->events) {
-      CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+      event.createWithFlags(cudaEventDisableTiming);
     }
 #ifdef ENABLE_NVSHMEM
-    CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming));
+    grid_desc->nvshmem_sync_event.createWithFlags(cudaEventDisableTiming);
 #endif
 
     // Run autotuning if requested
@@ -922,43 +922,9 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
       CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
     }
 
-    for (auto e : grid_desc->events) {
-      if (e) { CHECK_CUDA(cudaEventDestroy(e)); }
-    }
-
-#ifdef ENABLE_NVSHMEM
-    if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); }
-#endif
-
+    // Print performance report if enabled
     if (handle->performance_report_enable) {
-      // Print performance report before destroying events
       printPerformanceReport(handle, grid_desc);
-
-      // Destroy all transpose performance sample events in the map
-      for (auto& entry : grid_desc->transpose_perf_samples_map) {
-        auto& collection = entry.second;
-        for (auto& sample : collection.samples) {
-          CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event));
-          for (auto& event : sample.alltoall_start_events) {
-            CHECK_CUDA(cudaEventDestroy(event));
-          }
-          for (auto& event : sample.alltoall_end_events) {
-            CHECK_CUDA(cudaEventDestroy(event));
-          }
-        }
-      }
-
-      // Destroy all halo performance sample events in the map
-      for (auto& entry : grid_desc->halo_perf_samples_map) {
-        auto& collection = entry.second;
-        for (auto& sample : collection.samples) {
-          CHECK_CUDA(cudaEventDestroy(sample.halo_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.halo_end_event));
-          CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event));
-          CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event));
-        }
-      }
     }
 
     if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
diff --git a/src/performance.cc b/src/performance.cc
index 3a91b76..8256a3b 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -211,15 +211,15 @@ getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGr
 
     // Create events for each sample
     for (auto& sample : collection.samples) {
-      CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event));
+      sample.transpose_start_event.create();
+      sample.transpose_end_event.create();
       sample.alltoall_start_events.resize(handle->nranks);
       sample.alltoall_end_events.resize(handle->nranks);
       for (auto& event : sample.alltoall_start_events) {
-        CHECK_CUDA(cudaEventCreate(&event));
+        event.create();
       }
       for (auto& event : sample.alltoall_end_events) {
-        CHECK_CUDA(cudaEventCreate(&event));
+        event.create();
       }
       sample.valid = false;
     }
@@ -244,10 +244,10 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const
 
     // Create events for each sample
     for (auto& sample : collection.samples) {
-      CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
-      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
-      CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
+      sample.halo_start_event.create();
+      sample.halo_end_event.create();
+      sample.sendrecv_start_event.create();
+      sample.sendrecv_end_event.create();
       sample.valid = false;
     }
 

From 3736b996edd61901a0d68183e8bd9be871530911 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 4 May 2026 10:17:20 -0700
Subject: [PATCH 2/3] Formatting.

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 include/internal/common.h | 10 +++++-----
 src/cudecomp.cc           |  4 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/internal/common.h b/include/internal/common.h
index a083b7b..ca50247 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -203,8 +203,8 @@ struct cudecompGridDesc {
   cudecompCommInfo row_comm_info; // row communicator information
   cudecompCommInfo col_comm_info; // column communicator information
 
-  std::vector<cudecomp::cudaEvent> events;        // CUDA events used for scheduling
-  cudecomp::cudaEvent nvshmem_sync_event;         // NVSHMEM event used for synchronization
+  std::vector<cudecomp::cudaEvent> events; // CUDA events used for scheduling
+  cudecomp::cudaEvent nvshmem_sync_event;  // NVSHMEM event used for synchronization
 
 #ifdef ENABLE_NVSHMEM
   int* nvshmem_block_counters = nullptr;    // device memory counters for SM alltoallv last-block detection
@@ -220,9 +220,9 @@ struct cudecompGridDesc {
   // Performance reporting related entries
   std::vector<cudecomp::cudaEvent> alltoall_start_events; // events for alltoall timing
   std::vector<cudecomp::cudaEvent> alltoall_end_events;   // events for alltoall timing
-  int32_t alltoall_timing_count = 0;              // count of alltoall timing events pairs (for pipelined alltoall)
-  cudecomp::cudaEvent transpose_start_event;      // event for transpose timing
-  cudecomp::cudaEvent transpose_end_event;        // event for transpose timing
+  int32_t alltoall_timing_count = 0;         // count of alltoall timing events pairs (for pipelined alltoall)
+  cudecomp::cudaEvent transpose_start_event; // event for transpose timing
+  cudecomp::cudaEvent transpose_end_event;   // event for transpose timing
 
   std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
                                 std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 8437d28..31624b3 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -923,9 +923,7 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
     }
 
     // Print performance report if enabled
-    if (handle->performance_report_enable) {
-      printPerformanceReport(handle, grid_desc);
-    }
+    if (handle->performance_report_enable) { printPerformanceReport(handle, grid_desc); }
 
     if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
         haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {

From d7a9afdff586df198deb598049a778c1639b3193 Mon Sep 17 00:00:00 2001
From: Josh Romero <joshr@nvidia.com>
Date: Mon, 4 May 2026 10:59:54 -0700
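Subject: [PATCH 3/3] Remove explicit create calls.

Events are now created in the wrapper's constructor. The wrapper becomes
a template on the creation flags: cudaEvent creates events with
cudaEventDisableTiming, and cudaEventTimed (cudaEventDefault) is used
wherever events are used for timing.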

Signed-off-by: Josh Romero <joshr@nvidia.com>
---
 include/internal/common.h     | 26 ++++++++++----------
 include/internal/cuda_event.h | 46 +++++++++++------------------------
 src/autotune.cc               | 10 ++------
 src/cudecomp.cc               |  6 -----
 src/performance.cc            | 22 +----------------
 5 files changed, 30 insertions(+), 80 deletions(-)

diff --git a/include/internal/common.h b/include/internal/common.h
index ca50247..514cf0a 100644
--- a/include/internal/common.h
+++ b/include/internal/common.h
@@ -159,10 +159,10 @@ struct cudecompCommInfo {
 
 // Structure to contain data for transpose performance sample
 struct cudecompTransposePerformanceSample {
-  cudecomp::cudaEvent transpose_start_event;
-  cudecomp::cudaEvent transpose_end_event;
-  std::vector<cudecomp::cudaEvent> alltoall_start_events;
-  std::vector<cudecomp::cudaEvent> alltoall_end_events;
+  cudecomp::cudaEventTimed transpose_start_event;
+  cudecomp::cudaEventTimed transpose_end_event;
+  std::vector<cudecomp::cudaEventTimed> alltoall_start_events;
+  std::vector<cudecomp::cudaEventTimed> alltoall_end_events;
   int32_t alltoall_timing_count = 0;
   size_t alltoall_bytes = 0;
   bool valid = false;
@@ -177,10 +177,10 @@ struct cudecompTransposePerformanceSampleCollection {
 
 // Structure to contain data for halo performance sample
 struct cudecompHaloPerformanceSample {
-  cudecomp::cudaEvent halo_start_event;
-  cudecomp::cudaEvent halo_end_event;
-  cudecomp::cudaEvent sendrecv_start_event;
-  cudecomp::cudaEvent sendrecv_end_event;
+  cudecomp::cudaEventTimed halo_start_event;
+  cudecomp::cudaEventTimed halo_end_event;
+  cudecomp::cudaEventTimed sendrecv_start_event;
+  cudecomp::cudaEventTimed sendrecv_end_event;
   size_t sendrecv_bytes = 0;
   bool valid = false;
 };
@@ -218,11 +218,11 @@ struct cudecompGridDesc {
       nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle
 
   // Performance reporting related entries
-  std::vector<cudecomp::cudaEvent> alltoall_start_events; // events for alltoall timing
-  std::vector<cudecomp::cudaEvent> alltoall_end_events;   // events for alltoall timing
-  int32_t alltoall_timing_count = 0;         // count of alltoall timing events pairs (for pipelined alltoall)
-  cudecomp::cudaEvent transpose_start_event; // event for transpose timing
-  cudecomp::cudaEvent transpose_end_event;   // event for transpose timing
+  std::vector<cudecomp::cudaEventTimed> alltoall_start_events; // events for alltoall timing
+  std::vector<cudecomp::cudaEventTimed> alltoall_end_events;   // events for alltoall timing
+  int32_t alltoall_timing_count = 0;              // count of alltoall timing events pairs (for pipelined alltoall)
+  cudecomp::cudaEventTimed transpose_start_event; // event for transpose timing
+  cudecomp::cudaEventTimed transpose_end_event;   // event for transpose timing
 
   std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
                                 std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
diff --git a/include/internal/cuda_event.h b/include/internal/cuda_event.h
index 31554a8..27e7100 100644
--- a/include/internal/cuda_event.h
+++ b/include/internal/cuda_event.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -26,17 +26,17 @@
 
 namespace cudecomp {
 
-class cudaEvent {
+template <unsigned int flags> class cudaEventBase {
 public:
-  cudaEvent() = default;
-  ~cudaEvent() noexcept { resetNoThrow(); }
+  cudaEventBase() { CHECK_CUDA(cudaEventCreateWithFlags(&event_, flags)); }
+  ~cudaEventBase() noexcept { resetNoThrow(); }
 
-  cudaEvent(const cudaEvent&) = delete;
-  cudaEvent& operator=(const cudaEvent&) = delete;
+  cudaEventBase(const cudaEventBase&) = delete;
+  cudaEventBase& operator=(const cudaEventBase&) = delete;
 
-  cudaEvent(cudaEvent&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {}
+  cudaEventBase(cudaEventBase&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {}
 
-  cudaEvent& operator=(cudaEvent&& other) noexcept {
+  cudaEventBase& operator=(cudaEventBase&& other) noexcept {
     if (this != &other) {
       resetNoThrow();
       event_ = std::exchange(other.event_, nullptr);
@@ -44,27 +44,10 @@ class cudaEvent {
     return *this;
   }
 
-  void create() {
-    reset();
-    cudaEvent_t event = nullptr;
-    CHECK_CUDA(cudaEventCreate(&event));
-    event_ = event;
-  }
-
-  void createWithFlags(unsigned int flags) {
-    reset();
-    cudaEvent_t event = nullptr;
-    CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
-    event_ = event;
-  }
-
-  void reset() {
-    if (event_) {
-      CHECK_CUDA(cudaEventDestroy(event_));
-      event_ = nullptr;
-    }
-  }
+  cudaEvent_t get() const noexcept { return event_; }
+  operator cudaEvent_t() const noexcept { return event_; }
 
+private:
   void resetNoThrow() noexcept {
     if (event_) {
       cudaEventDestroy(event_);
@@ -72,13 +55,12 @@ class cudaEvent {
     }
   }
 
-  cudaEvent_t get() const noexcept { return event_; }
-  operator cudaEvent_t() const noexcept { return event_; }
-
-private:
   cudaEvent_t event_ = nullptr;
 };
 
+using cudaEvent = cudaEventBase<cudaEventDisableTiming>; // no timing data; cannot be used with cudaEventElapsedTime
+using cudaEventTimed = cudaEventBase<cudaEventDefault>;  // default flags (timing enabled); used for timing events
+
 } // namespace cudecomp
 
 #endif // CUDECOMP_CUDA_EVENT_H
diff --git a/src/autotune.cc b/src/autotune.cc
index ae12edb..03b7f70 100644
--- a/src/autotune.cc
+++ b/src/autotune.cc
@@ -101,10 +101,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   double t_start = MPI_Wtime();
 
   // Create cuda_events for intermediate timings (5 events per trial: start + 4 op boundaries)
-  std::vector<cudaEvent> events(5 * options->n_trials);
-  for (auto& event : events) {
-    event.create();
-  }
+  std::vector<cudaEventTimed> events(5 * options->n_trials);
 
   bool autotune_comm = options->autotune_transpose_backend;
   bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
@@ -582,10 +579,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   double t_start = MPI_Wtime();
 
   // Create cuda events for timing (one per trial boundary: n_trials + 1 total)
-  std::vector<cudaEvent> events(options->n_trials + 1);
-  for (auto& event : events) {
-    event.create();
-  }
+  std::vector<cudaEventTimed> events(options->n_trials + 1);
 
   bool autotune_comm = options->autotune_halo_backend;
   bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
diff --git a/src/cudecomp.cc b/src/cudecomp.cc
index 31624b3..a10a32e 100644
--- a/src/cudecomp.cc
+++ b/src/cudecomp.cc
@@ -804,12 +804,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes
 
     // Create CUDA events for scheduling
     grid_desc->events.resize(handle->nranks);
-    for (auto& event : grid_desc->events) {
-      event.createWithFlags(cudaEventDisableTiming);
-    }
-#ifdef ENABLE_NVSHMEM
-    grid_desc->nvshmem_sync_event.createWithFlags(cudaEventDisableTiming);
-#endif
 
     // Run autotuning if requested
     if (options) {
diff --git a/src/performance.cc b/src/performance.cc
index 8256a3b..0b65027 100644
--- a/src/performance.cc
+++ b/src/performance.cc
@@ -207,21 +207,11 @@ getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGr
     // Create new sample collection for this configuration
     cudecompTransposePerformanceSampleCollection collection;
     collection.samples.resize(handle->performance_report_samples);
-    collection.sample_idx = 0;
 
-    // Create events for each sample
+    // Size the per-rank alltoall event vectors; each element's constructor creates its event
     for (auto& sample : collection.samples) {
-      sample.transpose_start_event.create();
-      sample.transpose_end_event.create();
       sample.alltoall_start_events.resize(handle->nranks);
       sample.alltoall_end_events.resize(handle->nranks);
-      for (auto& event : sample.alltoall_start_events) {
-        event.create();
-      }
-      for (auto& event : sample.alltoall_end_events) {
-        event.create();
-      }
-      sample.valid = false;
     }
 
     samples_map[config] = std::move(collection);
@@ -240,16 +230,6 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const
     // Create new sample collection for this configuration
     cudecompHaloPerformanceSampleCollection collection;
     collection.samples.resize(handle->performance_report_samples);
-    collection.sample_idx = 0;
-
-    // Create events for each sample
-    for (auto& sample : collection.samples) {
-      sample.halo_start_event.create();
-      sample.halo_end_event.create();
-      sample.sendrecv_start_event.create();
-      sample.sendrecv_end_event.create();
-      sample.valid = false;
-    }
 
     samples_map[config] = std::move(collection);
   }