Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/internal/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,16 @@ class graphCache {
using key_type = std::tuple<void*, void*, int, int, cudecompPencilInfo_t, cudecompPencilInfo_t, cudecompDataType_t>;

public:
~graphCache();
~graphCache() noexcept;
void replay(const key_type& key, cudaStream_t stream) const;
cudaStream_t startCapture(const key_type& key, cudaStream_t stream) const;
void endCapture(const key_type& key);
bool cached(const key_type& key) const;
void clear();

private:
void clearNoThrow() noexcept;

std::unordered_map<key_type, cudaGraphExec_t> graph_cache_;
cudaStream graph_stream_;
};
Expand Down
129 changes: 107 additions & 22 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <numeric>
#include <string>
#include <type_traits>
#include <vector>

#include <cuda_runtime.h>
Expand All @@ -43,6 +45,33 @@
namespace cudecomp {
namespace {

struct cudaBufferGuardDeleter {
void operator()(void* ptr) const noexcept {
if (ptr) { cudaFree(ptr); }
}
};

template <typename Backend> struct workspaceGuardDeleter {
cudecompHandle_t handle;
cudecompGridDesc_t grid_desc;
Backend backend;

void operator()(void* ptr) const noexcept {
if (!ptr) return;

if constexpr (std::is_same_v<Backend, cudecompTransposeCommBackend_t>) {
grid_desc->config.transpose_comm_backend = backend;
} else {
grid_desc->config.halo_comm_backend = backend;
}
cudecompFree(handle, grid_desc, ptr);
}
};

using cudaBufferGuard = std::unique_ptr<void, cudaBufferGuardDeleter>;
using transposeWorkspaceGuard = std::unique_ptr<void, workspaceGuardDeleter<cudecompTransposeCommBackend_t>>;
using haloWorkspaceGuard = std::unique_ptr<void, workspaceGuardDeleter<cudecompHaloCommBackend_t>>;

static std::vector<int> getFactors(int N) {
std::vector<int> factors;
for (int i = 1; i <= std::sqrt(N); ++i) {
Expand Down Expand Up @@ -154,6 +183,11 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
void* work = nullptr;
void* work_nvshmem = nullptr;

cudaBufferGuard data_guard;
cudaBufferGuard data2_guard;
transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P});
transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});

int64_t data_sz = 0;
int64_t work_sz = 0;

Expand Down Expand Up @@ -213,11 +247,19 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
int64_t work_sz_new = num_elements_work * dtype_size;
if (data_sz_new > data_sz) {
data_sz = data_sz_new;
if (data) CHECK_CUDA(cudaFree(data));
if (data) {
data_guard.release();
CHECK_CUDA(cudaFree(data));
}
CHECK_CUDA(cudaMalloc(&data, data_sz));
data_guard.reset(data);
if (need_data2) {
if (data2) CHECK_CUDA(cudaFree(data2));
if (data2) {
data2_guard.release();
CHECK_CUDA(cudaFree(data2));
}
CHECK_CUDA(cudaMalloc(&data2, data_sz));
data2_guard.reset(data2);
}
}

Expand All @@ -232,15 +274,21 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend =
(need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.transpose_comm_backend = tmp;
}
// Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
if (work_nvshmem) {
work_nvshmem_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
}
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_nvshmem), work_sz));
grid_desc->config.transpose_comm_backend = tmp;
work_nvshmem_guard =
transposeWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});

// Check if there is enough memory for separate non-NVSHMEM allocated work buffer
auto ret = cudaMalloc(&work, work_sz);
Expand All @@ -254,22 +302,25 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(cudaFree(work));
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend =
(need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz);
auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend = backend;
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
grid_desc->config.transpose_comm_backend = tmp;
// Check after restoring the temporary backend used to select the allocation path.
CHECK_CUDECOMP(ret);
work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend});
}
#endif
} else {
auto backend = (need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend =
(need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); }
grid_desc->config.transpose_comm_backend = backend;
if (work) {
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
}
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
grid_desc->config.transpose_comm_backend = tmp;
work_guard = transposeWorkspaceGuard(work, {handle, grid_desc, backend});
}
}

Expand Down Expand Up @@ -490,26 +541,33 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend =
(need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.transpose_comm_backend = tmp;
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend = CUDECOMP_TRANSPOSE_COMM_NVSHMEM;
work_nvshmem_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
grid_desc->config.transpose_comm_backend = tmp;
#endif
} else {
auto tmp = grid_desc->config.transpose_comm_backend;
grid_desc->config.transpose_comm_backend =
(need_nccl) ? CUDECOMP_TRANSPOSE_COMM_NCCL : CUDECOMP_TRANSPOSE_COMM_MPI_P2P;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.transpose_comm_backend = tmp;
}

data_guard.release();
CHECK_CUDA(cudaFree(data));
if (need_data2) { CHECK_CUDA(cudaFree(data2)); }
if (need_data2) {
data2_guard.release();
CHECK_CUDA(cudaFree(data2));
}

// Set handle to best option (broadcast from rank 0 for consistency)
CHECK_MPI(MPI_Bcast(&comm_backend_best, sizeof(cudecompTransposeCommBackend_t), MPI_CHAR, 0, handle->mpi_comm));
Expand All @@ -535,12 +593,17 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
// registration handles on larger test data buffers. These stale registration handles can cause to delays in freeing
// the test buffer GPU memory.
char *tmp1, *tmp2;
cudaBufferGuard tmp1_guard, tmp2_guard;
size_t per_rank_size = (1024 * 1024 + handle->nranks - 1) / handle->nranks;
CHECK_CUDA(cudaMalloc(&tmp1, per_rank_size * handle->nranks * sizeof(*tmp1)));
tmp1_guard.reset(tmp1);
CHECK_CUDA(cudaMalloc(&tmp2, per_rank_size * handle->nranks * sizeof(*tmp2)));
tmp2_guard.reset(tmp2);
CHECK_MPI(MPI_Alltoall(tmp1, per_rank_size, MPI_CHAR, tmp2, per_rank_size, MPI_CHAR, handle->mpi_comm));
CHECK_MPI(MPI_Barrier(handle->mpi_comm));
tmp1_guard.release();
CHECK_CUDA(cudaFree(tmp1));
tmp2_guard.release();
CHECK_CUDA(cudaFree(tmp2));

// Reset performance samples after autotuning
Expand Down Expand Up @@ -601,6 +664,10 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
void* work = nullptr;
void* work_nvshmem = nullptr;

cudaBufferGuard data_guard;
haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI});
haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});

int64_t data_sz = 0;
int64_t work_sz = 0;

Expand Down Expand Up @@ -644,8 +711,12 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
int64_t work_sz_new = num_elements_work * dtype_size;
if (data_sz_new > data_sz) {
data_sz = data_sz_new;
if (data) CHECK_CUDA(cudaFree(data));
if (data) {
data_guard.release();
CHECK_CUDA(cudaFree(data));
}
CHECK_CUDA(cudaMalloc(&data, data_sz));
data_guard.reset(data);
}

// For nvshmem, buffers must be the same size. Find global maximums.
Expand All @@ -658,15 +729,20 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
if (work && work != work_nvshmem) {
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.halo_comm_backend = tmp;
}
// Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
if (work_nvshmem) CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
if (work_nvshmem) {
work_nvshmem_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
}
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work_nvshmem), work_sz));
grid_desc->config.halo_comm_backend = tmp;
work_nvshmem_guard = haloWorkspaceGuard(work_nvshmem, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});

// Check if there is enough memory for separate non-NVSHMEM allocated work buffer
auto ret = cudaMalloc(&work, work_sz);
Expand All @@ -680,20 +756,25 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
cudaGetLastError(); // Reset CUDA error state
} else {
CHECK_CUDA(cudaFree(work));
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
cudecompResult_t ret = cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz);
auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = backend;
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
grid_desc->config.halo_comm_backend = tmp;
// Check after restoring the temporary backend used to select the allocation path.
CHECK_CUDECOMP(ret);
work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend});
}
#endif
} else {
auto backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
if (work) { CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work)); }
grid_desc->config.halo_comm_backend = backend;
if (work) {
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
}
CHECK_CUDECOMP(cudecompMalloc(handle, grid_desc, reinterpret_cast<void**>(&work), work_sz));
grid_desc->config.halo_comm_backend = tmp;
work_guard = haloWorkspaceGuard(work, {handle, grid_desc, backend});
}
}

Expand Down Expand Up @@ -827,23 +908,27 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
if (work != work_nvshmem) {
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.halo_comm_backend = tmp;
}
#ifdef ENABLE_NVSHMEM
// Temporarily set backend to force nvshmem_malloc path in cudecompMalloc/Free
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = CUDECOMP_HALO_COMM_NVSHMEM;
work_nvshmem_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work_nvshmem));
grid_desc->config.halo_comm_backend = tmp;
#endif
} else {
auto tmp = grid_desc->config.halo_comm_backend;
grid_desc->config.halo_comm_backend = (need_nccl) ? CUDECOMP_HALO_COMM_NCCL : CUDECOMP_HALO_COMM_MPI;
work_guard.release();
CHECK_CUDECOMP(cudecompFree(handle, grid_desc, work));
grid_desc->config.halo_comm_backend = tmp;
}

data_guard.release();
CHECK_CUDA(cudaFree(data));

// Set handle to best option (broadcast from rank 0 for consistency)
Expand Down
10 changes: 9 additions & 1 deletion src/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

namespace cudecomp {

graphCache::~graphCache() { this->clear(); }
graphCache::~graphCache() noexcept { clearNoThrow(); }

void graphCache::replay(const graphCache::key_type& key, cudaStream_t stream) const {
CHECK_CUDA(cudaGraphLaunch(graph_cache_.at(key), stream));
Expand Down Expand Up @@ -58,4 +58,12 @@ void graphCache::clear() {
graph_cache_.clear();
}

void graphCache::clearNoThrow() noexcept {
for (auto& entry : graph_cache_) {
cudaGraphExecDestroy(entry.second);
}

graph_cache_.clear();
}

} // namespace cudecomp
Loading