diff --git a/include/internal/common.h b/include/internal/common.h index d763cdd..514cf0a 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -45,6 +45,7 @@ #include "cudecomp.h" #include "internal/checks.h" +#include "internal/cuda_event.h" #include "internal/graph.h" namespace cudecomp { @@ -158,10 +159,10 @@ struct cudecompCommInfo { // Structure to contain data for transpose performance sample struct cudecompTransposePerformanceSample { - cudaEvent_t transpose_start_event; - cudaEvent_t transpose_end_event; - std::vector alltoall_start_events; - std::vector alltoall_end_events; + cudecomp::cudaEventTimed transpose_start_event; + cudecomp::cudaEventTimed transpose_end_event; + std::vector alltoall_start_events; + std::vector alltoall_end_events; int32_t alltoall_timing_count = 0; size_t alltoall_bytes = 0; bool valid = false; @@ -176,10 +177,10 @@ struct cudecompTransposePerformanceSampleCollection { // Structure to contain data for halo performance sample struct cudecompHaloPerformanceSample { - cudaEvent_t halo_start_event; - cudaEvent_t halo_end_event; - cudaEvent_t sendrecv_start_event; - cudaEvent_t sendrecv_end_event; + cudecomp::cudaEventTimed halo_start_event; + cudecomp::cudaEventTimed halo_end_event; + cudecomp::cudaEventTimed sendrecv_start_event; + cudecomp::cudaEventTimed sendrecv_end_event; size_t sendrecv_bytes = 0; bool valid = false; }; @@ -202,8 +203,8 @@ struct cudecompGridDesc { cudecompCommInfo row_comm_info; // row communicator information cudecompCommInfo col_comm_info; // column communicator information - std::vector events{nullptr}; // CUDA events used for scheduling - cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization + std::vector events; // CUDA events used for scheduling + cudecomp::cudaEvent nvshmem_sync_event; // NVSHMEM event used for synchronization #ifdef ENABLE_NVSHMEM int* nvshmem_block_counters = nullptr; // device memory counters for SM alltoallv last-block detection @@ -217,11 +218,11 @@ struct cudecompGridDesc { nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle // Performance reporting related entries - std::vector alltoall_start_events; // events for alltoall timing - std::vector alltoall_end_events; // events for alltoall timing + std::vector alltoall_start_events; // events for alltoall timing + std::vector alltoall_end_events; // events for alltoall timing int32_t alltoall_timing_count = 0; // count of alltoall timing events pairs (for pipelined alltoall) - cudaEvent_t transpose_start_event; // event for transpose timing - cudaEvent_t transpose_end_event; // event for transpose timing + cudecomp::cudaEventTimed transpose_start_event; // event for transpose timing + cudecomp::cudaEventTimed transpose_end_event; // event for transpose timing std::unordered_map, std::array, std::array, std::array, bool, bool, cudecompDataType_t>, diff --git a/include/internal/cuda_event.h b/include/internal/cuda_event.h new file mode 100644 index 0000000..27e7100 --- /dev/null +++ b/include/internal/cuda_event.h @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CUDECOMP_CUDA_EVENT_H +#define CUDECOMP_CUDA_EVENT_H + +#include + +#include + +#include "internal/checks.h" + +namespace cudecomp { + +template class cudaEventBase { +public: + cudaEventBase() { CHECK_CUDA(cudaEventCreateWithFlags(&event_, flags)); } + ~cudaEventBase() noexcept { resetNoThrow(); } + + cudaEventBase(const cudaEventBase&) = delete; + cudaEventBase& operator=(const cudaEventBase&) = delete; + + cudaEventBase(cudaEventBase&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {} + + cudaEventBase& operator=(cudaEventBase&& other) noexcept { + if (this != &other) { + resetNoThrow(); + event_ = std::exchange(other.event_, nullptr); + } + return *this; + } + + cudaEvent_t get() const noexcept { return event_; } + operator cudaEvent_t() const noexcept { return event_; } + +private: + void resetNoThrow() noexcept { + if (event_) { + cudaEventDestroy(event_); + event_ = nullptr; + } + } + + cudaEvent_t event_ = nullptr; +}; + +using cudaEvent = cudaEventBase; +using cudaEventTimed = cudaEventBase; + +} // namespace cudecomp + +#endif // CUDECOMP_CUDA_EVENT_H diff --git a/src/autotune.cc b/src/autotune.cc index a1ce48f..03b7f70 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -101,10 +101,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d double t_start = MPI_Wtime(); // Create cuda_events for intermediate timings (5 events per trial: start + 4 op boundaries) - std::vector events(5 * options->n_trials); - for (auto& event : events) { - CHECK_CUDA(cudaEventCreate(&event)); - } + std::vector events(5 * options->n_trials); bool autotune_comm = options->autotune_transpose_backend; bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0); @@ -536,11 +533,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d CHECK_CUDA(cudaFree(data)); if (need_data2) { CHECK_CUDA(cudaFree(data2)); } - // Delete cuda events - for (auto& event : events) { - CHECK_CUDA(cudaEventDestroy(event)); - } - // Set handle to best option (broadcast from rank 0 for consistency) CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm)); CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm)); @@ -587,10 +579,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, double t_start = MPI_Wtime(); // Create cuda events for timing (one per trial boundary: n_trials + 1 total) - std::vector events(options->n_trials + 1); - for (auto& event : events) { - CHECK_CUDA(cudaEventCreate(&event)); - } + std::vector events(options->n_trials + 1); bool autotune_comm = options->autotune_halo_backend; bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0); @@ -901,11 +890,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, CHECK_CUDA(cudaFree(data)); - // Delete cuda events - for (auto& event : events) { - CHECK_CUDA(cudaEventDestroy(event)); - } - // Set handle to best option (broadcast from rank 0 for consistency) CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompHaloCommBackend_t), MPI_CHAR, 0, handle->mpi_comm)); CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm)); diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 53b71b2..a10a32e 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -804,12 +804,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes // Create CUDA events for scheduling grid_desc->events.resize(handle->nranks); - for (auto& event : grid_desc->events) { - CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); - } -#ifdef ENABLE_NVSHMEM - CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming)); -#endif // Run autotuning if requested if (options) { @@ -922,44 +916,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm)); } - for (auto e : grid_desc->events) { - if (e) { CHECK_CUDA(cudaEventDestroy(e)); } - } - -#ifdef ENABLE_NVSHMEM - if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); } -#endif - - if (handle->performance_report_enable) { - // Print performance report before destroying events - printPerformanceReport(handle, grid_desc); - - // Destroy all transpose performance sample events in the map - for (auto& entry : grid_desc->transpose_perf_samples_map) { - auto& collection = entry.second; - for (auto& sample : collection.samples) { - CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event)); - CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event)); - for (auto& event : sample.alltoall_start_events) { - CHECK_CUDA(cudaEventDestroy(event)); - } - for (auto& event : sample.alltoall_end_events) { - CHECK_CUDA(cudaEventDestroy(event)); - } - } - } - - // Destroy all halo performance sample events in the map - for (auto& entry : grid_desc->halo_perf_samples_map) { - auto& collection = entry.second; - for (auto& sample : collection.samples) { - CHECK_CUDA(cudaEventDestroy(sample.halo_start_event)); - CHECK_CUDA(cudaEventDestroy(sample.halo_end_event)); - CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event)); - CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event)); - } - } - } + // Print performance report if enabled + if (handle->performance_report_enable) { printPerformanceReport(handle, grid_desc); } if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) || haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) { diff --git a/src/performance.cc b/src/performance.cc index 3a91b76..0b65027 100644 --- a/src/performance.cc +++ b/src/performance.cc @@ -207,21 +207,11 @@ getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGr // Create new sample collection for this configuration cudecompTransposePerformanceSampleCollection collection; collection.samples.resize(handle->performance_report_samples); - collection.sample_idx = 0; - // Create events for each sample + // Create per-rank alltoall events for each sample for (auto& sample : collection.samples) { - CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event)); - CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event)); sample.alltoall_start_events.resize(handle->nranks); sample.alltoall_end_events.resize(handle->nranks); - for (auto& event : sample.alltoall_start_events) { - CHECK_CUDA(cudaEventCreate(&event)); - } - for (auto& event : sample.alltoall_end_events) { - CHECK_CUDA(cudaEventCreate(&event)); - } - sample.valid = false; } samples_map[config] = std::move(collection); @@ -240,16 +230,6 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const // Create new sample collection for this configuration cudecompHaloPerformanceSampleCollection collection; collection.samples.resize(handle->performance_report_samples); - collection.sample_idx = 0; - - // Create events for each sample - for (auto& sample : collection.samples) { - CHECK_CUDA(cudaEventCreate(&sample.halo_start_event)); - CHECK_CUDA(cudaEventCreate(&sample.halo_end_event)); - CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event)); - CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event)); - sample.valid = false; - } samples_map[config] = std::move(collection); }