-
Notifications
You must be signed in to change notification settings - Fork 26
revert pinned hostmem for pod comm #292
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: candidate
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -615,6 +615,7 @@ namespace TransferBench | |
| #define hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle | ||
| #define hipMemAccessDesc CUmemAccessDesc | ||
| #define hipMemFabricHandle_t CUmemFabricHandle | ||
| #define hipMemLocation CUmemLocation | ||
|
|
||
| // Enumerations | ||
| #define hipDeviceAttributeClockRate cudaDevAttrClockRate | ||
|
|
@@ -628,6 +629,7 @@ namespace TransferBench | |
| #define hipMemcpyHostToDevice cudaMemcpyHostToDevice | ||
| #define hipSuccess cudaSuccess | ||
| #define hipMemLocationTypeDevice CU_MEM_LOCATION_TYPE_DEVICE | ||
| #define hipMemLocationTypeHostNuma CU_MEM_LOCATION_TYPE_HOST_NUMA | ||
| #define hipMemAllocationTypePinned CU_MEM_ALLOCATION_TYPE_PINNED | ||
| #define hipMemHandleTypeFabric CU_MEM_HANDLE_TYPE_FABRIC | ||
| #define hipMemAllocationGranularityRecommended CU_MEM_ALLOC_GRANULARITY_RECOMMENDED | ||
|
|
@@ -1411,6 +1413,25 @@ namespace { | |
| } | ||
|
|
||
| #ifdef POD_COMM_ENABLED | ||
// GetMemLocation: fills `location` (type and id) from the given MemDevice.
//   memDevice - source descriptor; only memType and memIndex are read here.
//   location  - output; `type` is set first, then `id`.
// Returns ERR_NONE on success, or {ERR_FATAL, "Unsupported memory location"}
// when memType is neither a CPU type nor a non-managed GPU type (in which
// case `location` is left unmodified).
// Type selection: CPU memory types map to hipMemLocationTypeHostNuma; GPU
// memory types other than MEM_MANAGED map to hipMemLocationTypeDevice.
// NOTE(review): presumably memIndex is a NUMA node index for CPU types and a
// device ordinal for GPU types - confirm against callers.
| static ErrResult GetMemLocation(MemDevice const& memDevice, hipMemLocation& location) | ||
| { | ||
| if (IsCpuMemType(memDevice.memType)) { | ||
| location.type = hipMemLocationTypeHostNuma; | ||
| } else if (IsGpuMemType(memDevice.memType) && memDevice.memType != MEM_MANAGED) { | ||
| location.type = hipMemLocationTypeDevice; | ||
| } else { | ||
| return {ERR_FATAL, "Unsupported memory location"}; | ||
| } | ||
|
|
||
| // Determine location id: MEM_CPU_CLOSEST is resolved via GetClosestCpuNumaToGpu(memIndex); every other memory type uses memIndex unchanged | ||
| if (memDevice.memType == MEM_CPU_CLOSEST) { | ||
| location.id = GetClosestCpuNumaToGpu(memDevice.memIndex); | ||
| } else { | ||
| location.id = memDevice.memIndex; | ||
| } | ||
| return ERR_NONE; | ||
| } | ||
|
|
||
| static ErrResult GetMemAllocationProp(MemDevice const& memDevice, hipMemAllocationProp& prop) | ||
| { | ||
|
|
||
|
|
@@ -1428,10 +1449,7 @@ namespace { | |
| } | ||
|
|
||
| prop.requestedHandleTypes = hipMemHandleTypeFabric; | ||
| // at this point shouldn't have any memtype other than device | ||
| // ERR_CHECK(GetMemLocation(memDevice, prop.location)); | ||
| prop.location.type = hipMemLocationTypeDevice; | ||
| prop.location.id = memDevice.memIndex; | ||
| ERR_CHECK(GetMemLocation(memDevice, prop.location)); | ||
| return ERR_NONE; | ||
| } | ||
| #endif | ||
|
|
@@ -1519,19 +1537,20 @@ namespace { | |
|
|
||
| // Specify memory access descriptor to enable local read/write | ||
| hipMemAccessDesc desc; | ||
| // ERR_CHECK(GetMemLocation(memDevice, desc.location)); | ||
| desc.location.type = hipMemLocationTypeDevice; | ||
| desc.location.id = memDevice.memIndex; | ||
| ERR_CHECK(GetMemLocation(memDevice, desc.location)); | ||
| desc.flags = hipMemAccessFlagsProtReadWrite; | ||
|
|
||
| // Set access flags for virtual address range | ||
| ERR_CHECK(hipMemSetAccess((gpu_device_ptr)*memPtr, roundedUpBytes, &desc, 1)); | ||
|
|
||
| // Clear the memory | ||
| if (IsCpuMemType(memType)) { | ||
| // Note: CheckPages() / move_pages() is intentionally NOT called here. | ||
| // For fabric-exportable HOST_NUMA memory the VA is owned by the driver | ||
| // (not a normal anonymous mmap VMA), so move_pages() returns | ||
| // -EFAULT/-EINVAL and would falsely trip a fatal error. NUMA placement | ||
| // should be already enforced by the prop.location passed to hipMemCreate(). | ||
| memset(*memPtr, 0, roundedUpBytes); | ||
| // Check that the allocated pages are actually on the correct NUMA node | ||
| ERR_CHECK(CheckPages((char*)*memPtr, roundedUpBytes, deviceIdx)); | ||
| } else if (IsGpuMemType(memType)) { | ||
| ERR_CHECK(hipSetDevice(memDevice.memIndex)); | ||
| ERR_CHECK(hipMemset(*memPtr, 0, numBytes)); | ||
|
|
@@ -4075,7 +4094,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid) | |
| hipError_t exportErr = hipSuccess; | ||
| const char* exportStep = "hipSetDevice"; | ||
| if (memDevice.memRank == GetRank()) { | ||
| exportErr = hipSetDevice(memDevice.memIndex); | ||
| if (IsGpuMemType(memDevice.memType)) { | ||
| exportErr = hipSetDevice(memDevice.memIndex); | ||
| } | ||
|
Comment on lines
+4097
to
+4099
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this hipSetDevice even necessary? The runtime should already be able to determine where the memory handle resides. It also seems strange that you would not need it for CPU memory types but would need it for GPU memory types. Based on the documentation, cuMemExportToShareableHandle doesn't require a current device context either, and won't ever return CUDA_ERROR_INVALID_CONTEXT. |
||
| if (exportErr == hipSuccess) { | ||
| exportStep = "hipMemExportToShareableHandle"; | ||
| exportErr = hipMemExportToShareableHandle(&fabricHandle, *memHandle, hipMemHandleTypeFabric, 0); | ||
|
|
@@ -8064,6 +8085,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) | |
| #undef hipMemGenericAllocationHandle_t | ||
| #undef hipMemAccessDesc | ||
| #undef hipMemFabricHandle_t | ||
| #undef hipMemLocation | ||
|
|
||
| // Enumerations | ||
| #undef hipDeviceAttributeClockRate | ||
|
|
@@ -8077,6 +8099,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) | |
| #undef hipMemcpyHostToDevice | ||
| #undef hipSuccess | ||
| #undef hipMemLocationTypeDevice | ||
| #undef hipMemLocationTypeHostNuma | ||
| #undef hipMemAllocationTypePinned | ||
| //#undef hipMemAllocationTypeUncached | ||
| #undef hipMemHandleTypeFabric | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.