diff --git a/include/internal/common.h b/include/internal/common.h index 0ab7ee5..18c9bad 100644 --- a/include/internal/common.h +++ b/include/internal/common.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -302,6 +303,18 @@ using comm_count_t = int64_t; enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 }; +static inline MPI_Datatype mpiSizeTDatatype() { + if constexpr (std::is_same_v) { + return MPI_UNSIGNED; + } else if constexpr (std::is_same_v) { + return MPI_UNSIGNED_LONG; + } else if constexpr (std::is_same_v) { + return MPI_UNSIGNED_LONG_LONG; + } else { + THROW_NOT_SUPPORTED("unsupported size_t type for MPI reduction"); + } +} + static inline void setProcessGridIndex(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) { switch (grid_desc->config.rank_order) { case CUDECOMP_RANK_ORDER_COL_MAJOR: diff --git a/src/autotune.cc b/src/autotune.cc index d797bd5..858660e 100644 --- a/src/autotune.cc +++ b/src/autotune.cc @@ -188,8 +188,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P}); transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM}); - int64_t data_sz = 0; - int64_t work_sz = 0; + size_t data_sz = 0; + size_t work_sz = 0; bool valid = false; for (auto& pdims : pdim_list) { @@ -243,8 +243,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d int64_t size_x = std::max(pinfo_x0.size, pinfo_x3.size); int64_t size_y = std::max(std::max(std::max(pinfo_y0.size, pinfo_y1.size), pinfo_y2.size), pinfo_y3.size); int64_t size_z = std::max(pinfo_z1.size, pinfo_z2.size); - int64_t data_sz_new = std::max(std::max(size_x, size_y), size_z) * dtype_size; - int64_t work_sz_new = num_elements_work * dtype_size; + size_t data_sz_new = + static_cast(std::max(std::max(size_x, size_y), size_z)) * static_cast(dtype_size); + size_t work_sz_new = static_cast(num_elements_work) * static_cast(dtype_size); if (data_sz_new > data_sz) { data_sz = data_sz_new; if (data) { @@ -264,7 +265,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d } // For nvshmem, buffers must be the same size. Find global maximums. - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm)); if (work_sz_new > work_sz) { work_sz = work_sz_new; @@ -668,8 +669,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI}); haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM}); - int64_t data_sz = 0; - int64_t work_sz = 0; + size_t data_sz = 0; + size_t work_sz = 0; bool valid = false; for (auto& pdims : pdim_list) { @@ -707,8 +708,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, cudecompGetHaloWorkspaceSize(handle, grid_desc, options->halo_axis, options->halo_extents, &num_elements_work)); int64_t dtype_size; CHECK_CUDECOMP(cudecompGetDataTypeSize(options->dtype, &dtype_size)); - int64_t data_sz_new = pinfo.size * dtype_size; - int64_t work_sz_new = num_elements_work * dtype_size; + size_t data_sz_new = static_cast(pinfo.size) * static_cast(dtype_size); + size_t work_sz_new = static_cast(num_elements_work) * static_cast(dtype_size); if (data_sz_new > data_sz) { data_sz = data_sz_new; if (data) { @@ -720,7 +721,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc, } // For nvshmem, buffers must be the same size. Find global maximums. - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm)); if (work_sz_new > work_sz) { work_sz = work_sz_new; diff --git a/src/cudecomp.cc b/src/cudecomp.cc index 1da8bf8..26d39d6 100644 --- a/src/cudecomp.cc +++ b/src/cudecomp.cc @@ -1217,7 +1217,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) { #ifdef ENABLE_NVSHMEM // NVSHMEM requires allocations to be the same size for all ranks. Find maximum. - CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm)); + CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm)); auto nvshmem_runtime = grid_desc->nvshmem_runtime; if (!nvshmem_runtime || !nvshmem_runtime->initialized) { THROW_INVALID_USAGE("NVSHMEM runtime is unavailable"); } @@ -1228,7 +1228,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid } if (!nvshmem_runtime->nvshmem_vmm && handle->rank == 0 && buffer_size_bytes > nvshmem_free_size) { fprintf(stderr, - "CUDECOMP:WARN: Attempting an NVSHMEM allocation of %lld bytes but *approximately* " + "CUDECOMP:WARN: Attempting an NVSHMEM allocation of %zu bytes but *approximately* " "%zu free bytes of %zu total bytes of symmetric heap space available. If the allocation fails, " "set NVSHMEM_SYMMETRIC_SIZE >= %zu and try again.\n", buffer_size_bytes, nvshmem_free_size, nvshmem_runtime->nvshmem_symmetric_size,