From 19b0ae915cab925d8f8b75865b1b26ab576b0f80 Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Fri, 8 May 2026 20:46:46 +0000 Subject: [PATCH 1/2] revert pinned hostmem for pod comm --- src/header/TransferBench.hpp | 39 +++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 9f4589c..5a3b04e 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -615,6 +615,7 @@ namespace TransferBench #define hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle #define hipMemAccessDesc CUmemAccessDesc #define hipMemFabricHandle_t CUmemFabricHandle + #define hipMemLocation CUmemLocation // Enumerations #define hipDeviceAttributeClockRate cudaDevAttrClockRate @@ -628,6 +629,7 @@ namespace TransferBench #define hipMemcpyHostToDevice cudaMemcpyHostToDevice #define hipSuccess cudaSuccess #define hipMemLocationTypeDevice CU_MEM_LOCATION_TYPE_DEVICE + #define hipMemLocationTypeHostNuma CU_MEM_LOCATION_TYPE_HOST_NUMA #define hipMemAllocationTypePinned CU_MEM_ALLOCATION_TYPE_PINNED #define hipMemHandleTypeFabric CU_MEM_HANDLE_TYPE_FABRIC #define hipMemAllocationGranularityRecommended CU_MEM_ALLOC_GRANULARITY_RECOMMENDED @@ -1411,6 +1413,25 @@ namespace { } #ifdef POD_COMM_ENABLED + static ErrResult GetMemLocation(MemDevice const& memDevice, hipMemLocation& location) + { + if (IsCpuMemType(memDevice.memType)) { + location.type = hipMemLocationTypeHostNuma; + } else if (IsGpuMemType(memDevice.memType) && memDevice.memType != MEM_MANAGED) { + location.type = hipMemLocationTypeDevice; + } else { + return {ERR_FATAL, "Unsupported memory location"}; + } + + // Determine location id + if (memDevice.memType == MEM_CPU_CLOSEST) { + location.id = GetClosestCpuNumaToGpu(memDevice.memIndex); + } else { + location.id = memDevice.memIndex; + } + return ERR_NONE; + } + static ErrResult GetMemAllocationProp(MemDevice const& memDevice, hipMemAllocationProp& prop) { 
@@ -1428,10 +1449,7 @@ namespace { } prop.requestedHandleTypes = hipMemHandleTypeFabric; -// at this point shouldn't have any memtype other than device -// ERR_CHECK(GetMemLocation(memDevice, prop.location)); - prop.location.type = hipMemLocationTypeDevice; - prop.location.id = memDevice.memIndex; + ERR_CHECK(GetMemLocation(memDevice, prop.location)); return ERR_NONE; } #endif @@ -1519,9 +1537,7 @@ namespace { // Specify memory access descriptor to enable local read/write hipMemAccessDesc desc; -// ERR_CHECK(GetMemLocation(memDevice, desc.location)); - desc.location.type = hipMemLocationTypeDevice; - desc.location.id = memDevice.memIndex; + ERR_CHECK(GetMemLocation(memDevice, desc.location)); desc.flags = hipMemAccessFlagsProtReadWrite; // Set access flags for virtual address range @@ -1529,9 +1545,12 @@ namespace { // Clear the memory if (IsCpuMemType(memType)) { + // Note: CheckPages() / move_pages() is intentionally NOT called here. + // For fabric-exportable HOST_NUMA memory the VA is owned by the driver + // (not a normal anonymous mmap VMA), so move_pages() returns + // -EFAULT/-EINVAL and would falsely trip a fatal error. NUMA placement + // should already be enforced by the prop.location passed to hipMemCreate(). 
memset(*memPtr, 0, roundedUpBytes); - // Check that the allocated pages are actually on the correct NUMA node - ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); } else if (IsGpuMemType(memType)) { ERR_CHECK(hipSetDevice(memDevice.memIndex)); ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); @@ -8064,6 +8083,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipMemGenericAllocationHandle_t #undef hipMemAccessDesc #undef hipMemFabricHandle_t +#undef hipMemLocation // Enumerations #undef hipDeviceAttributeClockRate @@ -8077,6 +8097,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) #undef hipMemcpyHostToDevice #undef hipSuccess #undef hipMemLocationTypeDevice +#undef hipMemLocationTypeHostNuma #undef hipMemAllocationTypePinned //#undef hipMemAllocationTypeUncached #undef hipMemHandleTypeFabric From 634e46ea7f23f7d93b31c9d2d8d21386ca4dd0fb Mon Sep 17 00:00:00 2001 From: AtlantaPepsi Date: Thu, 14 May 2026 23:32:40 +0000 Subject: [PATCH 2/2] only call hipSetDevice for GPU memory before exporting fabric handle --- src/header/TransferBench.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 5a3b04e..01e4212 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -4094,7 +4094,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) hipError_t exportErr = hipSuccess; const char* exportStep = "hipSetDevice"; if (memDevice.memRank == GetRank()) { - exportErr = hipSetDevice(memDevice.memIndex); + if (IsGpuMemType(memDevice.memType)) { + exportErr = hipSetDevice(memDevice.memIndex); + } if (exportErr == hipSuccess) { exportStep = "hipMemExportToShareableHandle"; exportErr = hipMemExportToShareableHandle(&fabricHandle, *memHandle, hipMemHandleTypeFabric, 0);