From 8bd27fe34db8afa1f1022eba837187a076d4c775 Mon Sep 17 00:00:00 2001 From: Minh Vu Date: Thu, 25 Jun 2026 00:25:04 +0200 Subject: [PATCH] Fix pipelined self-copy ranks Signed-off-by: Minh Vu --- include/internal/transpose.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/internal/transpose.h b/include/internal/transpose.h index 8b5a93c..d68e149 100644 --- a/include/internal/transpose.h +++ b/include/internal/transpose.h @@ -465,7 +465,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c for (int j = 1; j < splits_a.size() + 1; ++j) { int src_rank, dst_rank; getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); - if (j == splits_a.size()) dst_rank = comm_rank; + if (j == splits_a.size()) { + src_rank = comm_rank; + dst_rank = comm_rank; + } size_t shift = offsets_a[dst_rank]; for (int i = 0; i < 3; ++i) { @@ -541,7 +544,10 @@ static void cudecompTranspose_(int ax, int dir, const cudecompHandle_t handle, c for (int j = 1; j < splits_a.size() + 1; ++j) { int src_rank, dst_rank; getAlltoallPeerRanks(grid_desc, comm_axis, j, src_rank, dst_rank); - if (j == splits_a.size()) dst_rank = comm_rank; + if (j == splits_a.size()) { + src_rank = comm_rank; + dst_rank = comm_rank; + } size_t shift = offsets_a[dst_rank]; for (int i = 0; i < 3; ++i) {