Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <utility>
#include <vector>
Expand Down Expand Up @@ -302,6 +303,18 @@ using comm_count_t = int64_t;

enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 };

static inline MPI_Datatype mpiSizeTDatatype() {
if constexpr (std::is_same_v<size_t, unsigned int>) {
return MPI_UNSIGNED;
} else if constexpr (std::is_same_v<size_t, unsigned long>) {
return MPI_UNSIGNED_LONG;
} else if constexpr (std::is_same_v<size_t, unsigned long long>) {
return MPI_UNSIGNED_LONG_LONG;
} else {
THROW_NOT_SUPPORTED("unsupported size_t type for MPI reduction");
}
}

static inline void setProcessGridIndex(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) {
switch (grid_desc->config.rank_order) {
case CUDECOMP_RANK_ORDER_COL_MAJOR:
Expand Down
21 changes: 11 additions & 10 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P});
transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});

int64_t data_sz = 0;
int64_t work_sz = 0;
size_t data_sz = 0;
size_t work_sz = 0;

bool valid = false;
for (auto& pdims : pdim_list) {
Expand Down Expand Up @@ -243,8 +243,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
int64_t size_x = std::max(pinfo_x0.size, pinfo_x3.size);
int64_t size_y = std::max(std::max(std::max(pinfo_y0.size, pinfo_y1.size), pinfo_y2.size), pinfo_y3.size);
int64_t size_z = std::max(pinfo_z1.size, pinfo_z2.size);
int64_t data_sz_new = std::max(std::max(size_x, size_y), size_z) * dtype_size;
int64_t work_sz_new = num_elements_work * dtype_size;
size_t data_sz_new =
static_cast<size_t>(std::max(std::max(size_x, size_y), size_z)) * static_cast<size_t>(dtype_size);
size_t work_sz_new = static_cast<size_t>(num_elements_work) * static_cast<size_t>(dtype_size);
if (data_sz_new > data_sz) {
data_sz = data_sz_new;
if (data) {
Expand All @@ -264,7 +265,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
}

// For nvshmem, buffers must be the same size. Find global maximums.
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));

if (work_sz_new > work_sz) {
work_sz = work_sz_new;
Expand Down Expand Up @@ -668,8 +669,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI});
haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});

int64_t data_sz = 0;
int64_t work_sz = 0;
size_t data_sz = 0;
size_t work_sz = 0;

bool valid = false;
for (auto& pdims : pdim_list) {
Expand Down Expand Up @@ -707,8 +708,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
cudecompGetHaloWorkspaceSize(handle, grid_desc, options->halo_axis, options->halo_extents, &num_elements_work));
int64_t dtype_size;
CHECK_CUDECOMP(cudecompGetDataTypeSize(options->dtype, &dtype_size));
int64_t data_sz_new = pinfo.size * dtype_size;
int64_t work_sz_new = num_elements_work * dtype_size;
size_t data_sz_new = static_cast<size_t>(pinfo.size) * static_cast<size_t>(dtype_size);
size_t work_sz_new = static_cast<size_t>(num_elements_work) * static_cast<size_t>(dtype_size);
if (data_sz_new > data_sz) {
data_sz = data_sz_new;
if (data) {
Expand All @@ -720,7 +721,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
}

// For nvshmem, buffers must be the same size. Find global maximums.
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));

if (work_sz_new > work_sz) {
work_sz = work_sz_new;
Expand Down
4 changes: 2 additions & 2 deletions src/cudecomp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1217,7 +1217,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
#ifdef ENABLE_NVSHMEM
// NVSHMEM requires allocations to be the same size for all ranks. Find maximum.
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));

auto nvshmem_runtime = grid_desc->nvshmem_runtime;
if (!nvshmem_runtime || !nvshmem_runtime->initialized) { THROW_INVALID_USAGE("NVSHMEM runtime is unavailable"); }
Expand All @@ -1228,7 +1228,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
}
if (!nvshmem_runtime->nvshmem_vmm && handle->rank == 0 && buffer_size_bytes > nvshmem_free_size) {
fprintf(stderr,
"CUDECOMP:WARN: Attempting an NVSHMEM allocation of %lld bytes but *approximately* "
"CUDECOMP:WARN: Attempting an NVSHMEM allocation of %zu bytes but *approximately* "
"%zu free bytes of %zu total bytes of symmetric heap space available. If the allocation fails, "
"set NVSHMEM_SYMMETRIC_SIZE >= %zu and try again.\n",
buffer_size_bytes, nvshmem_free_size, nvshmem_runtime->nvshmem_symmetric_size,
Expand Down
Loading