Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/internal/comm_routines.h
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ cudecompAlltoallPipelined(const cudecompHandle_t& handle, const cudecompGridDesc
#endif
}
case CUDECOMP_TRANSPOSE_COMM_NCCL_PL: {
auto comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
const auto& comm_info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;
// For fully intra-group alltoall, use distinct NCCL local comm instead of global comm as it is faster.
auto comm = (comm_info.ngroups == 1) ? *grid_desc->nccl_local_comm : *grid_desc->nccl_comm;
auto pl_stream = handle->streams[0].get();
Expand Down
97 changes: 92 additions & 5 deletions include/internal/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ typedef std::shared_ptr<nvshmemRuntimeState> nvshmemRuntime;

// cuDecomp handle containing general information
struct cudecompHandle {
cudecompHandle() = default;
~cudecompHandle() noexcept;

cudecompHandle(const cudecompHandle&) = delete;
cudecompHandle& operator=(const cudecompHandle&) = delete;
cudecompHandle(cudecompHandle&&) = delete;
cudecompHandle& operator=(cudecompHandle&&) = delete;

MPI_Comm mpi_comm = MPI_COMM_NULL; // MPI communicator
int32_t rank; // MPI rank
Expand All @@ -96,10 +103,12 @@ struct cudecompHandle {

std::vector<cudecomp::cudaStream> streams; // internal streams for concurrent scheduling

cutensorHandle_t cutensor_handle; // cuTENSOR handle;
#if CUTENSOR_MAJOR >= 2
cutensorPlanPreference_t cutensor_plan_pref; // cuTENSOR plan preference;
bool cutensor_needs_permute_chunking = false; // Flag to enable large tensor workaround
cutensorHandle_t cutensor_handle = nullptr; // cuTENSOR handle;
cutensorPlanPreference_t cutensor_plan_pref = nullptr; // cuTENSOR plan preference;
bool cutensor_needs_permute_chunking = false; // Flag to enable large tensor workaround
#else
cutensorHandle_t cutensor_handle; // cuTENSOR handle;
#endif

std::vector<std::array<char, MPI_MAX_PROCESSOR_NAME>> hostnames; // list of hostnames by rank
Expand Down Expand Up @@ -134,6 +143,7 @@ struct cudecompHandle {
""; // directory to write CSV performance reports, empty means no file writing

// Miscellaneous
bool nvml_initialized = false; // Flag to track NVML initialization
int32_t device_p2p_ce_count = 0; // number of P2P CEs available
int32_t device_num_sms = 0; // number of SMs on the device
int32_t device_max_threads_per_sm = 0; // maximum threads per SM
Expand All @@ -142,9 +152,40 @@ struct cudecompHandle {

// Structure with information about row/column communicator
struct cudecompCommInfo {
cudecompCommInfo() = default;
~cudecompCommInfo() noexcept { reset(); }

cudecompCommInfo(const cudecompCommInfo&) = delete;
cudecompCommInfo& operator=(const cudecompCommInfo&) = delete;
cudecompCommInfo(cudecompCommInfo&&) = delete;
cudecompCommInfo& operator=(cudecompCommInfo&&) = delete;

void reset() noexcept {
if (mpi_comm != MPI_COMM_NULL) {
MPI_Comm comm = mpi_comm;
mpi_comm = MPI_COMM_NULL;
MPI_Comm_free(&comm);
}
#ifdef ENABLE_NVSHMEM
if (nvshmem_team != NVSHMEM_TEAM_INVALID) {
nvshmem_team_destroy(nvshmem_team);
nvshmem_team = NVSHMEM_TEAM_INVALID;
}
if (nvshmem_signals) {
nvshmem_free(nvshmem_signals);
nvshmem_signals = nullptr;
}
#endif
rank = 0;
nranks = 0;
ngroups = 0;
npergroup = 0;
mnnvl_active = false;
}

MPI_Comm mpi_comm = MPI_COMM_NULL;
int32_t rank;
int32_t nranks;
int32_t rank = 0;
int32_t nranks = 0;

int32_t ngroups = 0; // number of p2p groups (i.e. grouping of ranks with fast interconnect) in communicator
int32_t npergroup = 0; // number of ranks per p2p group
Expand Down Expand Up @@ -194,6 +235,14 @@ struct cudecompHaloPerformanceSampleCollection {

// cuDecomp grid descriptor containing grid-specific information
struct cudecompGridDesc {
~cudecompGridDesc() noexcept {
row_comm_info.reset();
col_comm_info.reset();
#ifdef ENABLE_NVSHMEM
if (nvshmem_block_counters) { cudaFree(nvshmem_block_counters); }
#endif
}

cudecompGridDescConfig_t config; // configuration struct
bool gdims_dist_set = false; // flag to record if gdims_dist was set to non-default values
bool transpose_mem_order_set = false; // flag to record if transpose_mem_order was set to non-default values
Expand Down Expand Up @@ -355,6 +404,7 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
cudecompCommAxis comm_axis) {
auto& info = (comm_axis == CUDECOMP_COMM_ROW) ? grid_desc->row_comm_info : grid_desc->col_comm_info;

info.reset();
info.mpi_comm = mpi_comm;
CHECK_MPI(MPI_Comm_rank(info.mpi_comm, &info.rank));
CHECK_MPI(MPI_Comm_size(info.mpi_comm, &info.nranks));
Expand Down Expand Up @@ -420,6 +470,43 @@ static void setCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc,
info.ngroups = info.nranks / info.npergroup;
}

static void createCommInfo(cudecompHandle_t& handle, cudecompGridDesc_t& grid_desc, bool need_nvshmem = false) {
grid_desc->row_comm_info.reset();
grid_desc->col_comm_info.reset();

setProcessGridIndex(handle, grid_desc);

MPI_Comm row_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, grid_desc->pidx[0], handle->rank, &row_comm));
setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);

MPI_Comm col_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, grid_desc->pidx[1], handle->rank, &col_comm));
setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);

#ifdef ENABLE_NVSHMEM
if (need_nvshmem) {
nvshmem_team_config_t tmp;
nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
&grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);

grid_desc->row_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
if (!grid_desc->row_comm_info.nvshmem_signals) { THROW_NVSHMEM_ERROR("nvshmem_malloc failed"); }
CHECK_CUDA(
cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));

grid_desc->col_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
if (!grid_desc->col_comm_info.nvshmem_signals) { THROW_NVSHMEM_ERROR("nvshmem_malloc failed"); }
CHECK_CUDA(
cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
}
#else
if (need_nvshmem) { THROW_NOT_SUPPORTED("build does not support NVSHMEM communication backends."); }
#endif
}

static inline void getAlltoallPeerRanks(cudecompGridDesc_t grid_desc, cudecompCommAxis comm_axis, int iter,
int& src_rank, int& dst_rank) {

Expand Down
76 changes: 16 additions & 60 deletions src/autotune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -274,28 +274,9 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
}

// Create test row and column communicators
int color_row = grid_desc->pidx[0];
MPI_Comm row_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);

int color_col = grid_desc->pidx[1];
MPI_Comm col_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_col, handle->rank, &col_comm));
setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);
createCommInfo(handle, grid_desc, need_nvshmem);
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
nvshmem_team_config_t tmp;
nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
&grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
grid_desc->row_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
CHECK_CUDA(
cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
grid_desc->col_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
CHECK_CUDA(
cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
#endif
Expand Down Expand Up @@ -490,18 +471,15 @@ void autotuneTransposeBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_d
}
}

// Destroy test communicators
CHECK_MPI(MPI_Comm_free(&grid_desc->row_comm_info.mpi_comm));
CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
// Destroy test communicator resources
grid_desc->row_comm_info.reset();
grid_desc->col_comm_info.reset();
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
if (grid_desc->nvshmem_block_counters) {
CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
grid_desc->nvshmem_block_counters = nullptr;
}
#endif
}
}
Expand Down Expand Up @@ -720,28 +698,9 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
}

// Create test row and column communicators
int color_row = grid_desc->pidx[0];
MPI_Comm row_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_row, handle->rank, &row_comm));
setCommInfo(handle, grid_desc, row_comm, CUDECOMP_COMM_ROW);

int color_col = grid_desc->pidx[1];
MPI_Comm col_comm;
CHECK_MPI(MPI_Comm_split(handle->mpi_comm, color_col, handle->rank, &col_comm));
setCommInfo(handle, grid_desc, col_comm, CUDECOMP_COMM_COL);
createCommInfo(handle, grid_desc, need_nvshmem);
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
nvshmem_team_config_t tmp;
nvshmem_team_split_2d(NVSHMEM_TEAM_WORLD, grid_desc->config.pdims[1], &tmp, 0,
&grid_desc->row_comm_info.nvshmem_team, &tmp, 0, &grid_desc->col_comm_info.nvshmem_team);
grid_desc->row_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->row_comm_info.nranks * sizeof(uint64_t));
CHECK_CUDA(
cudaMemset(grid_desc->row_comm_info.nvshmem_signals, 0, grid_desc->row_comm_info.nranks * sizeof(uint64_t)));
grid_desc->col_comm_info.nvshmem_signals =
(uint64_t*)nvshmem_malloc(grid_desc->col_comm_info.nranks * sizeof(uint64_t));
CHECK_CUDA(
cudaMemset(grid_desc->col_comm_info.nvshmem_signals, 0, grid_desc->col_comm_info.nranks * sizeof(uint64_t)));
CHECK_CUDA(cudaMalloc(&grid_desc->nvshmem_block_counters, handle->nranks * sizeof(int)));
CHECK_CUDA(cudaMemset(grid_desc->nvshmem_block_counters, 0, handle->nranks * sizeof(int)));
#endif
Expand Down Expand Up @@ -850,18 +809,15 @@ void autotuneHaloBackend(cudecompHandle_t handle, cudecompGridDesc_t grid_desc,
}
}

// Destroy test communicators
CHECK_MPI(MPI_Comm_free(&grid_desc->row_comm_info.mpi_comm));
CHECK_MPI(MPI_Comm_free(&grid_desc->col_comm_info.mpi_comm));
// Destroy test communicator resources
grid_desc->row_comm_info.reset();
grid_desc->col_comm_info.reset();
if (need_nvshmem) {
#ifdef ENABLE_NVSHMEM
nvshmem_team_destroy(grid_desc->row_comm_info.nvshmem_team);
nvshmem_team_destroy(grid_desc->col_comm_info.nvshmem_team);
nvshmem_free(grid_desc->row_comm_info.nvshmem_signals);
nvshmem_free(grid_desc->col_comm_info.nvshmem_signals);
CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
grid_desc->row_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
grid_desc->col_comm_info.nvshmem_team = NVSHMEM_TEAM_INVALID;
if (grid_desc->nvshmem_block_counters) {
CHECK_CUDA(cudaFree(grid_desc->nvshmem_block_counters));
grid_desc->nvshmem_block_counters = nullptr;
}
#endif
}
}
Expand Down
Loading
Loading