NVIDIA · romerojosh · Jun 22, 2026 · Jun 16, 2026 · Jun 17, 2026
diff --git a/include/internal/common.h b/include/internal/common.h
@@ -29,6 +29,7 @@
 #include <sstream>
 #include <string>
 #include <tuple>
+#include <type_traits>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -302,6 +303,18 @@ using comm_count_t = int64_t;
 
 enum cudecompCommAxis { CUDECOMP_COMM_COL = 0, CUDECOMP_COMM_ROW = 1 };
 
+static inline MPI_Datatype mpiSizeTDatatype() {
+  if constexpr (std::is_same_v<size_t, unsigned int>) {
+    return MPI_UNSIGNED;
+  } else if constexpr (std::is_same_v<size_t, unsigned long>) {
+    return MPI_UNSIGNED_LONG;
+  } else if constexpr (std::is_same_v<size_t, unsigned long long>) {
+    return MPI_UNSIGNED_LONG_LONG;
+  } else {
+    THROW_NOT_SUPPORTED("unsupported size_t type for MPI reduction");
+  }
+}
+
 static inline void setProcessGridIndex(const cudecompHandle_t handle, cudecompGridDesc_t grid_desc) {
   switch (grid_desc->config.rank_order) {
   case CUDECOMP_RANK_ORDER_COL_MAJOR:

diff --git a/src/autotune.cc b/src/autotune.cc
@@ -188,8 +188,8 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
   transposeWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_MPI_P2P});
   transposeWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_TRANSPOSE_COMM_NVSHMEM});
 
-  int64_t data_sz = 0;
-  int64_t work_sz = 0;
+  size_t data_sz = 0;
+  size_t work_sz = 0;
 
   bool valid = false;
   for (auto& pdims : pdim_list) {
@@ -243,8 +243,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     int64_t size_x = std::max(pinfo_x0.size, pinfo_x3.size);
     int64_t size_y = std::max(std::max(std::max(pinfo_y0.size, pinfo_y1.size), pinfo_y2.size), pinfo_y3.size);
     int64_t size_z = std::max(pinfo_z1.size, pinfo_z2.size);
-    int64_t data_sz_new = std::max(std::max(size_x, size_y), size_z) * dtype_size;
-    int64_t work_sz_new = num_elements_work * dtype_size;
+    size_t data_sz_new =
+        static_cast<size_t>(std::max(std::max(size_x, size_y), size_z)) * static_cast<size_t>(dtype_size);
+    size_t work_sz_new = static_cast<size_t>(num_elements_work) * static_cast<size_t>(dtype_size);
     if (data_sz_new > data_sz) {
       data_sz = data_sz_new;
       if (data) {
@@ -264,7 +265,7 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
     }
 
     // For nvshmem, buffers must be the same size. Find global maximums.
-    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));
 
     if (work_sz_new > work_sz) {
       work_sz = work_sz_new;
@@ -668,8 +669,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
   haloWorkspaceGuard work_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_MPI});
   haloWorkspaceGuard work_nvshmem_guard(nullptr, {handle, grid_desc, CUDECOMP_HALO_COMM_NVSHMEM});
 
-  int64_t data_sz = 0;
-  int64_t work_sz = 0;
+  size_t data_sz = 0;
+  size_t work_sz = 0;
 
   bool valid = false;
   for (auto& pdims : pdim_list) {
@@ -707,8 +708,8 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
         cudecompGetHaloWorkspaceSize(handle, grid_desc, options->halo_axis, options->halo_extents, &num_elements_work));
     int64_t dtype_size;
     CHECK_CUDECOMP(cudecompGetDataTypeSize(options->dtype, &dtype_size));
-    int64_t data_sz_new = pinfo.size * dtype_size;
-    int64_t work_sz_new = num_elements_work * dtype_size;
+    size_t data_sz_new = static_cast<size_t>(pinfo.size) * static_cast<size_t>(dtype_size);
+    size_t work_sz_new = static_cast<size_t>(num_elements_work) * static_cast<size_t>(dtype_size);
     if (data_sz_new > data_sz) {
       data_sz = data_sz_new;
       if (data) {
@@ -720,7 +721,7 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
     }
 
     // For nvshmem, buffers must be the same size. Find global maximums.
-    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
+    CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &work_sz_new, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));
 
     if (work_sz_new > work_sz) {
       work_sz = work_sz_new;

diff --git a/src/cudecomp.cc b/src/cudecomp.cc
@@ -1217,7 +1217,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
         haloBackendRequiresNvshmem(grid_desc->config.halo_comm_backend)) {
 #ifdef ENABLE_NVSHMEM
       // NVSHMEM requires allocations to be the same size for all ranks. Find maximum.
-      CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, MPI_LONG_LONG_INT, MPI_MAX, handle->mpi_comm));
+      CHECK_MPI(MPI_Allreduce(MPI_IN_PLACE, &buffer_size_bytes, 1, mpiSizeTDatatype(), MPI_MAX, handle->mpi_comm));
 
       auto nvshmem_runtime = grid_desc->nvshmem_runtime;
       if (!nvshmem_runtime || !nvshmem_runtime->initialized) { THROW_INVALID_USAGE("NVSHMEM runtime is unavailable"); }
@@ -1228,7 +1228,7 @@ cudecompResult_t cudecompMalloc(cudecompHandle_t handle, cudecompGridDesc_t grid
       }
       if (!nvshmem_runtime->nvshmem_vmm && handle->rank == 0 && buffer_size_bytes > nvshmem_free_size) {
         fprintf(stderr,
-                "CUDECOMP:WARN: Attempting an NVSHMEM allocation of %lld bytes but *approximately* "
+                "CUDECOMP:WARN: Attempting an NVSHMEM allocation of %zu bytes but *approximately* "
                 "%zu free bytes of %zu total bytes of symmetric heap space available. If the allocation fails, "
                 "set NVSHMEM_SYMMETRIC_SIZE >= %zu and try again.\n",
                 buffer_size_bytes, nvshmem_free_size, nvshmem_runtime->nvshmem_symmetric_size,