Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@

#include "cudecomp.h"
#include "internal/checks.h"
#include "internal/cuda_event.h"
#include "internal/graph.h"

namespace cudecomp {
Expand Down Expand Up @@ -158,10 +159,10 @@ struct cudecompCommInfo {

// Structure to contain data for transpose performance sample
struct cudecompTransposePerformanceSample {
cudaEvent_t transpose_start_event;
cudaEvent_t transpose_end_event;
std::vector<cudaEvent_t> alltoall_start_events;
std::vector<cudaEvent_t> alltoall_end_events;
cudecomp::cudaEventTimed transpose_start_event;
cudecomp::cudaEventTimed transpose_end_event;
std::vector<cudecomp::cudaEventTimed> alltoall_start_events;
std::vector<cudecomp::cudaEventTimed> alltoall_end_events;
int32_t alltoall_timing_count = 0;
size_t alltoall_bytes = 0;
bool valid = false;
Expand All @@ -176,10 +177,10 @@ struct cudecompTransposePerformanceSampleCollection {

// Structure to contain data for halo performance sample
struct cudecompHaloPerformanceSample {
cudaEvent_t halo_start_event;
cudaEvent_t halo_end_event;
cudaEvent_t sendrecv_start_event;
cudaEvent_t sendrecv_end_event;
cudecomp::cudaEventTimed halo_start_event;
cudecomp::cudaEventTimed halo_end_event;
cudecomp::cudaEventTimed sendrecv_start_event;
cudecomp::cudaEventTimed sendrecv_end_event;
size_t sendrecv_bytes = 0;
bool valid = false;
};
Expand All @@ -202,8 +203,8 @@ struct cudecompGridDesc {
cudecompCommInfo row_comm_info; // row communicator information
cudecompCommInfo col_comm_info; // column communicator information

std::vector<cudaEvent_t> events{nullptr}; // CUDA events used for scheduling
cudaEvent_t nvshmem_sync_event = nullptr; // NVSHMEM event used for synchronization
std::vector<cudecomp::cudaEvent> events; // CUDA events used for scheduling
cudecomp::cudaEvent nvshmem_sync_event; // NVSHMEM event used for synchronization

#ifdef ENABLE_NVSHMEM
int* nvshmem_block_counters = nullptr; // device memory counters for SM alltoallv last-block detection
Expand All @@ -217,11 +218,11 @@ struct cudecompGridDesc {
nccl_local_comm; // NCCL communicator (intra-node, or intra-clique on MNNVL systems), shared from handle

// Performance reporting related entries
std::vector<cudaEvent_t> alltoall_start_events; // events for alltoall timing
std::vector<cudaEvent_t> alltoall_end_events; // events for alltoall timing
std::vector<cudecomp::cudaEventTimed> alltoall_start_events; // events for alltoall timing
std::vector<cudecomp::cudaEventTimed> alltoall_end_events; // events for alltoall timing
int32_t alltoall_timing_count = 0; // count of alltoall timing events pairs (for pipelined alltoall)
cudaEvent_t transpose_start_event; // event for transpose timing
cudaEvent_t transpose_end_event; // event for transpose timing
cudecomp::cudaEventTimed transpose_start_event; // event for transpose timing
cudecomp::cudaEventTimed transpose_end_event; // event for transpose timing

std::unordered_map<std::tuple<int32_t, int32_t, std::array<int32_t, 3>, std::array<int32_t, 3>,
std::array<int32_t, 3>, std::array<int32_t, 3>, bool, bool, cudecompDataType_t>,
Expand Down
66 changes: 66 additions & 0 deletions include/internal/cuda_event.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef CUDECOMP_CUDA_EVENT_H
#define CUDECOMP_CUDA_EVENT_H

#include <utility>

#include <cuda_runtime.h>

#include "internal/checks.h"

namespace cudecomp {

template <unsigned int flags> class cudaEventBase {
public:
cudaEventBase() { CHECK_CUDA(cudaEventCreateWithFlags(&event_, flags)); }
~cudaEventBase() noexcept { resetNoThrow(); }

cudaEventBase(const cudaEventBase&) = delete;
cudaEventBase& operator=(const cudaEventBase&) = delete;

cudaEventBase(cudaEventBase&& other) noexcept : event_(std::exchange(other.event_, nullptr)) {}

cudaEventBase& operator=(cudaEventBase&& other) noexcept {
if (this != &other) {
resetNoThrow();
event_ = std::exchange(other.event_, nullptr);
}
return *this;
}

cudaEvent_t get() const noexcept { return event_; }
operator cudaEvent_t() const noexcept { return event_; }

private:
void resetNoThrow() noexcept {
if (event_) {
cudaEventDestroy(event_);
event_ = nullptr;
}
}

cudaEvent_t event_ = nullptr;
};

using cudaEvent = cudaEventBase<cudaEventDisableTiming>;
using cudaEventTimed = cudaEventBase<cudaEventDefault>;

} // namespace cudecomp

#endif // CUDECOMP_CUDA_EVENT_H
20 changes: 2 additions & 18 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
double t_start = MPI_Wtime();

// Create cuda_events for intermediate timings (5 events per trial: start + 4 op boundaries)
std::vector<cudaEvent_t> events(5 * options->n_trials);
for (auto& event : events) {
CHECK_CUDA(cudaEventCreate(&event));
}
std::vector<cudaEventTimed> events(5 * options->n_trials);

bool autotune_comm = options->autotune_transpose_backend;
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
Expand Down Expand Up @@ -536,11 +533,6 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
CHECK_CUDA(cudaFree(data));
if (need_data2) { CHECK_CUDA(cudaFree(data2)); }

// Delete cuda events
for (auto& event : events) {
CHECK_CUDA(cudaEventDestroy(event));
}

// Set handle to best option (broadcast from rank 0 for consistency)
CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));
Expand Down Expand Up @@ -587,10 +579,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
double t_start = MPI_Wtime();

// Create cuda events for timing (one per trial boundary: n_trials + 1 total)
std::vector<cudaEvent_t> events(options->n_trials + 1);
for (auto& event : events) {
CHECK_CUDA(cudaEventCreate(&event));
}
std::vector<cudaEventTimed> events(options->n_trials + 1);

bool autotune_comm = options->autotune_halo_backend;
bool autotune_pdims = (grid_desc->config.pdims[0] == 0 && grid_desc->config.pdims[1] == 0);
Expand Down Expand Up @@ -901,11 +890,6 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,

CHECK_CUDA(cudaFree(data));

// Delete cuda events
for (auto& event : events) {
CHECK_CUDA(cudaEventDestroy(event));
}

// Set handle to best option (broadcast from rank 0 for consistency)
CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompHaloCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
CHECK_MPI(MPI_Bcast(pdims_best, 2 * sizeof(int), MPI_INT, 0, handle->mpi_comm));
Expand Down
46 changes: 2 additions & 44 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -804,12 +804,6 @@ cudecompResult_t cudecompGridDescCreate(cudecompHandle_t handle, cudecompGridDes

// Create CUDA events for scheduling
grid_desc->events.resize(handle->nranks);
for (auto& event : grid_desc->events) {
CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
}
#ifdef ENABLE_NVSHMEM
CHECK_CUDA(cudaEventCreateWithFlags(&grid_desc->nvshmem_sync_event, cudaEventDisableTiming));
#endif

// Run autotuning if requested
if (options) {
Expand Down Expand Up @@ -922,44 +916,8 @@ cudecompResult_t cudecompGridDescDestroy(cudecompHandle_t handle, cudecompGridDe
CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
}

for (auto e : grid_desc->events) {
if (e) { CHECK_CUDA(cudaEventDestroy(e)); }
}

#ifdef ENABLE_NVSHMEM
if (grid_desc->nvshmem_sync_event) { CHECK_CUDA(cudaEventDestroy(grid_desc->nvshmem_sync_event)); }
#endif

if (handle->performance_report_enable) {
// Print performance report before destroying events
printPerformanceReport(handle, grid_desc);

// Destroy all transpose performance sample events in the map
for (auto& entry : grid_desc->transpose_perf_samples_map) {
auto& collection = entry.second;
for (auto& sample : collection.samples) {
CHECK_CUDA(cudaEventDestroy(sample.transpose_start_event));
CHECK_CUDA(cudaEventDestroy(sample.transpose_end_event));
for (auto& event : sample.alltoall_start_events) {
CHECK_CUDA(cudaEventDestroy(event));
}
for (auto& event : sample.alltoall_end_events) {
CHECK_CUDA(cudaEventDestroy(event));
}
}
}

// Destroy all halo performance sample events in the map
for (auto& entry : grid_desc->halo_perf_samples_map) {
auto& collection = entry.second;
for (auto& sample : collection.samples) {
CHECK_CUDA(cudaEventDestroy(sample.halo_start_event));
CHECK_CUDA(cudaEventDestroy(sample.halo_end_event));
CHECK_CUDA(cudaEventDestroy(sample.sendrecv_start_event));
CHECK_CUDA(cudaEventDestroy(sample.sendrecv_end_event));
}
}
}
// Print performance report if enabled
if (handle->performance_report_enable) { printPerformanceReport(handle, grid_desc); }

if (transposeBackendRequiresNccl(grid_desc->config.transpose_comm_backend) ||
haloBackendRequiresNccl(grid_desc->config.halo_comm_backend)) {
Expand Down
22 changes: 1 addition & 21 deletions src/performance.cc
Original file line number Diff line number Diff line change
Expand Up @@ -207,21 +207,11 @@ getOrCreateTransposePerformanceSamples(const cudecompHandle_t handle, cudecompGr
// Create new sample collection for this configuration
cudecompTransposePerformanceSampleCollection collection;
collection.samples.resize(handle->performance_report_samples);
collection.sample_idx = 0;

// Create events for each sample
// Create per-rank alltoall events for each sample
for (auto& sample : collection.samples) {
CHECK_CUDA(cudaEventCreate(&sample.transpose_start_event));
CHECK_CUDA(cudaEventCreate(&sample.transpose_end_event));
sample.alltoall_start_events.resize(handle->nranks);
sample.alltoall_end_events.resize(handle->nranks);
for (auto& event : sample.alltoall_start_events) {
CHECK_CUDA(cudaEventCreate(&event));
}
for (auto& event : sample.alltoall_end_events) {
CHECK_CUDA(cudaEventCreate(&event));
}
sample.valid = false;
}

samples_map[config] = std::move(collection);
Expand All @@ -240,16 +230,6 @@ cudecompHaloPerformanceSampleCollection& getOrCreateHaloPerformanceSamples(const
// Create new sample collection for this configuration
cudecompHaloPerformanceSampleCollection collection;
collection.samples.resize(handle->performance_report_samples);
collection.sample_idx = 0;

// Create events for each sample
for (auto& sample : collection.samples) {
CHECK_CUDA(cudaEventCreate(&sample.halo_start_event));
CHECK_CUDA(cudaEventCreate(&sample.halo_end_event));
CHECK_CUDA(cudaEventCreate(&sample.sendrecv_start_event));
CHECK_CUDA(cudaEventCreate(&sample.sendrecv_end_event));
sample.valid = false;
}

samples_map[config] = std::move(collection);
}
Expand Down
Loading