diff --git a/CMakeLists.txt b/CMakeLists.txt index 32544d8897..9dfa2cb79f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,6 +208,7 @@ option(BUILD_SHARED "Build Charm++ dynamic libraries" OFF) # Other options option(BUILD_CUDA "Build with CUDA support" OFF) +option(BUILD_HIP "Build with HIP support" OFF) option(PXSHM "Build with PXSHM" OFF) # LRTS PMI options @@ -511,7 +512,7 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${NETWORK}/gdir_link) file(STRINGS src/arch/${NETWORK}/gdir_link GDIR) elseif(${NETWORK} MATCHES "gni-") set(GDIR "gni") -elseif(${NETWORK} MATCHES "mpi-cray") +elseif(${NETWORK} MATCHES "`mpi`-cray") set(GDIR "mpi") elseif(${NETWORK} MATCHES "ofi-cray") set(GDIR "ofi") @@ -533,8 +534,12 @@ else() set(CMK_BUILD_CHARMRUN 1) endif() +set(CHARMRUN_ELASTIC_DIR src/arch/common) +set(CHARMRUN_HAPI_DIR src/arch/common) + include(cmake/detect-features.cmake) include(cmake/ci-files.cmake) +add_custom_target(ci-generated DEPENDS ${all-ci-outputs}) if(${TARGET} STREQUAL "all-test") @@ -660,6 +665,8 @@ configure_file(src/arch/common/cc-msvc.sh include/ COPYONLY) configure_file(src/arch/common/conv-mach-craype.sh include/ COPYONLY) configure_file(src/arch/common/conv-mach-cuda.sh include/ COPYONLY) configure_file(src/arch/common/conv-mach-cuda.h include/ COPYONLY) +configure_file(src/arch/common/conv-mach-hip.sh include/ COPYONLY) +configure_file(src/arch/common/conv-mach-hip.h include/ COPYONLY) configure_file(src/arch/common/conv-mach-darwin.sh include/ COPYONLY) configure_file(src/arch/common/conv-mach-flang.h include/ COPYONLY) configure_file(src/arch/common/conv-mach-flang.sh include/ COPYONLY) @@ -688,9 +695,13 @@ configure_file(src/arch/common/conv-mach-tsan.h include/ COPYONLY) configure_file(src/arch/common/conv-mach-tsan.sh include/ COPYONLY) configure_file(src/scripts/conv-config.sh include/ COPYONLY) configure_file(src/arch/${VDIR}/conv-mach.sh include/ COPYONLY) +configure_file(src/util/ckrescale.h include/ COPYONLY) + +add_library(ckrescale 
src/util/ckrescale.C) set(CUDA_DIR "") -if(BUILD_CUDA) +set(HIP_DIR "") +if(BUILD_CUDA OR BUILD_HIP) file(GLOB_RECURSE hybridAPI-h-sources ${CMAKE_SOURCE_DIR}/src/arch/cuda/*.h) file(GLOB_RECURSE hybridAPI-cxx-sources ${CMAKE_SOURCE_DIR}/src/arch/cuda/*.cpp) @@ -698,19 +709,96 @@ if(BUILD_CUDA) configure_file(${file} include/ COPYONLY) endforeach() - if(CMAKE_VERSION VERSION_GREATER 3.17 OR CMAKE_VERSION VERSION_EQUAL 3.17) - find_package(CUDAToolkit REQUIRED) - set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}") - enable_language(CUDA) - set(CUDA_DIR "${CUDAToolkit_TARGET_DIR}") - else() - find_package(CUDA REQUIRED) - set(CUDA_DIR "${CUDA_TOOLKIT_ROOT_DIR}") + if (BUILD_CUDA) + if(CMAKE_VERSION VERSION_GREATER 3.17 OR CMAKE_VERSION VERSION_EQUAL 3.17) + find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}") + enable_language(CUDA) + set(CUDA_DIR "${CUDAToolkit_TARGET_DIR}") + else() + find_package(CUDA REQUIRED) + set(CUDA_DIR "${CUDA_TOOLKIT_ROOT_DIR}") + endif() + + # Find CUPTI library and include directory + find_library(CUPTI_LIBRARY cupti + HINTS "${CUDA_DIR}/extras/CUPTI/lib64" + "${CUDA_DIR}/lib64" + ) + set(CUPTI_INCLUDE_DIR "${CUDA_DIR}/extras/CUPTI/include") + if(NOT CUPTI_LIBRARY) + message(WARNING "CUPTI library not found. 
GPU load balancing will not be available.") + else() + message(STATUS "Found CUPTI: ${CUPTI_LIBRARY}") + endif() + + add_library(hybridapi ${hybridAPI-cxx-sources} $) + add_dependencies(hybridapi ci-generated) + + if(CUPTI_LIBRARY) + target_include_directories(hybridapi PRIVATE "${CUPTI_INCLUDE_DIR}") + target_link_libraries(hybridapi ${CUPTI_LIBRARY}) + endif() + + if(TRACING) + target_compile_definitions(hybridapi PRIVATE HAPI_TRACE) + endif() endif() - add_library(cudahybridapi ${hybridAPI-cxx-sources}) - if(TRACING) - target_compile_definitions(cudahybridapi PRIVATE HAPI_TRACE) + + if (BUILD_HIP) + add_compile_definitions(__HIP_PLATFORM_AMD__) + # Modern ROCm/HIP detection + if(NOT DEFINED ROCM_PATH) + if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation") + else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation") + endif() + endif() + + # Find hipcc wrapper for reference + find_program(HIP_HIPCC_EXECUTABLE + NAMES hipcc + PATHS "${ROCM_PATH}/bin" "${ROCM_PATH}/hip/bin" + NO_DEFAULT_PATH + ) + + if(NOT HIP_HIPCC_EXECUTABLE) + message(FATAL_ERROR "Could not find hipcc. 
Please set ROCM_PATH to your ROCm installation directory.") + endif() + + # Find the actual clang compiler used by ROCm (required by CMake) + find_program(CMAKE_HIP_COMPILER + NAMES clang++ + PATHS "${ROCM_PATH}/llvm/bin" "${ROCM_PATH}/bin" + NO_DEFAULT_PATH + ) + + if(NOT CMAKE_HIP_COMPILER) + message(FATAL_ERROR "Could not find ROCm clang++ compiler in ${ROCM_PATH}") + endif() + + set(HIP_DIR "${ROCM_PATH}") + set(CMAKE_HIP_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" CACHE STRING "HIP architectures") + + # Enable HIP language support + enable_language(HIP) + + add_library(hybridapi ${hybridAPI-cxx-sources}) + add_dependencies(hybridapi ci-generated) + target_include_directories(hybridapi PRIVATE "${ROCM_PATH}/include") + + if(TRACING) + target_compile_definitions(hybridapi PRIVATE HAPI_TRACE) + endif() endif() + + # hapi_memory_daemon - standalone executable for shrink/expand GPU memory management + if(BUILD_CUDA) + add_executable(hapi_memory_daemon src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp) + add_dependencies(hapi_memory_daemon hybridapi ck converse ckqt moduleNDMeshStreamer ckmain modulecompletion conv-static) + endif() + endif() if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-cxi.sh) @@ -911,6 +999,12 @@ if(${CMK_BUILD_CHARMRUN}) add_dependencies(charmrun create_symlinks) else() configure_file(${CHARMRUN_DIR}/charmrun ${CMAKE_BINARY_DIR}/bin COPYONLY) + if(EXISTS ${CMAKE_SOURCE_DIR}/${CHARMRUN_ELASTIC_DIR}/charmrun_elastic) + configure_file(${CHARMRUN_ELASTIC_DIR}/charmrun_elastic ${CMAKE_BINARY_DIR}/bin COPYONLY) + endif() + if(EXISTS ${CMAKE_SOURCE_DIR}/${CHARMRUN_HAPI_DIR}/charmrun_hapi) + configure_file(${CHARMRUN_HAPI_DIR}/charmrun_hapi ${CMAKE_BINARY_DIR}/bin COPYONLY) + endif() endif() configure_file(src/scripts/testrun bin/ COPYONLY) @@ -1000,7 +1094,11 @@ if(${TARGET} STREQUAL "charm4py") endif() if (${BUILD_CUDA}) - target_link_libraries(charm cudart cudahybridapi) + target_link_libraries(charm cudart hybridapi) + 
endif() + + if (${BUILD_HIP}) + target_link_libraries(charm hiprtc hybridapi) endif() if(${TRACING}) @@ -1019,9 +1117,12 @@ else() if(RECONVERSE) target_link_libraries(ckhello PRIVATE reconverse) endif() - add_dependencies(ckhello ck ckqt conv-static + add_dependencies(ckhello ck ckqt ckrescale conv-static converse ckmain moduleNDMeshStreamer modulecompletion) + if(BUILD_CUDA OR BUILD_HIP) + add_dependencies(ckhello hybridapi) + endif() endif() # Create conv-mach-opt.sh @@ -1067,7 +1168,7 @@ foreach(l BUILDOPTS CMK_AMPI_WITH_ROMIO CMK_BUILD_PYTHON CMK_CAN_LINK_FORTRAN CXX_NO_AS_NEEDED LDXX_WHOLE_ARCHIVE_PRE LDXX_WHOLE_ARCHIVE_POST CMK_MACOSX CMK_POST_EXE CMK_SHARED_SUF CMK_USER_SUFFIX OPTS_LD CMK_COMPILER_KNOWS_FVISIBILITY CMK_LINKER_KNOWS_UNDEFINED - CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR CMK_USER_DISABLED_TLS CMK_CXI) + CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR HIP_DIR CMK_USER_DISABLED_TLS CMK_CXI) file(APPEND ${optfile_sh} "${l}=\"${${l}}\"\n" ) endforeach(l) @@ -1104,7 +1205,7 @@ endif() set(optfile_mak ${CMAKE_BINARY_DIR}/include/conv-mach-opt.mak) file(WRITE ${optfile_mak} "# Build-time options header for Makefiles, automatically generated by cmake.\n") -foreach(l CUDA_DIR BUILD_CUDA CMK_AMPI_WITH_ROMIO CMK_MACOSX CMK_BUILD_PYTHON +foreach(l CUDA_DIR HIP_DIR BUILD_CUDA BUILD_HIP CMK_AMPI_WITH_ROMIO CMK_MACOSX CMK_BUILD_PYTHON CMK_CHARMDEBUG CMK_COMPILER CMK_GDIR CMK_HAS_MALLOC_HOOK CMK_HAS_MMAP CMK_LIBJPEG CMK_LUSTREAPI CMK_MULTICORE CMK_NO_BUILD_SHARED CMK_NO_PARTITIONS CMK_SHARED_SUF CMK_SMP CMK_SUPPORTS_FSGLOBALS CMK_SUPPORTS_PIPGLOBALS CMK_SUPPORTS_PIEGLOBALS @@ -1117,7 +1218,8 @@ endforeach(l) # Add options set(CUDA ${BUILD_CUDA}) # need CUDA to match conv-mach file name -foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA PAPI CXI) +set(HIP ${BUILD_HIP}) # need HIP to match conv-mach file name +foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA HIP PAPI CXI) if(${opt}) string(TOLOWER ${opt} optl) file(APPEND ${optfile_sh} ". 
${CMAKE_BINARY_DIR}/include/conv-mach-${optl}.sh\n") diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..1fe1b02e00 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +FROM mpioperator/openmpi + +RUN apt update && apt install -y build-essential zlib1g-dev ca-certificates cmake git + +RUN apt update \ + && apt install -y --no-install-recommends \ + g++ \ + gfortran \ + libopenmpi-dev \ + && rm -rf /var/lib/apt/lists/* + +#RUN git clone https://github.com/charmplusplus/charm.git +RUN mkdir /home/mpiuser/charm +COPY . /home/mpiuser/charm +RUN cd charm && git checkout shrinkexpand-mpi && ./build charm++ mpi-linux-x86_64 --enable-shrinkexpand -j8 --force --with-production + +RUN cd charm/examples/charm++/shrink_expand && make clean && make +RUN cd charm/examples/charm++/shrink_expand/jacobi2d-iter && make clean && make +RUN cd charm/examples/charm++/shrink_expand/startup && make clean && make +RUN mkdir /app +RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/charmrun /app/ +RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/charmrun_elastic /app/ +RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d /app/ +RUN cp charm/examples/charm++/shrink_expand/startup/startup /app/ +RUN chmod 777 /app diff --git a/benchmarks/charm++/cuda/gpudirect/latency/latency.C b/benchmarks/charm++/cuda/gpudirect/latency/latency.C index 512e2c7edf..6945280745 100644 --- a/benchmarks/charm++/cuda/gpudirect/latency/latency.C +++ b/benchmarks/charm++/cuda/gpudirect/latency/latency.C @@ -125,6 +125,7 @@ public: int peer; double start_time; + double allocTime; double* times; char* h_local_data; @@ -211,6 +212,13 @@ public: cudaStreamSynchronize(stream); thisProxy[peer].receiveReg(size, h_local_data); } else { + double allocStart = CkWallTimer(); + char* d_local_data_new; + hapiCheck(cudaMalloc(&d_local_data_new, max_size)); + hapiCheck(cudaFree(d_local_data)); + d_local_data = d_local_data_new; + send_buffer = CkDeviceBuffer(d_local_data_new); + 
allocTime = CkWallTimer() - allocStart; thisProxy[peer].receiveZC(size, send_buffer); } } @@ -230,7 +238,7 @@ public: // Inform the runtime where the incoming data should be stored // and which CUDA stream should be used for the transfer data = d_remote_data; - devicePost[0].cuda_stream = stream; // Not used with UCX + devicePost[0].hapi_stream = stream; // Not used with UCX } // Second receive (regular entry method), invoked once the data transfers complete @@ -247,7 +255,7 @@ public: } else { // PE 0: received pong if (iter > warmup_iters) { - times[iter-warmup_iters-1] = (CkWallTimer() - start_time) / 2.0; + times[iter-warmup_iters-1] = (CkWallTimer() - start_time) / 2.0 - allocTime; } // Start next iteration or end test for current size diff --git a/buildcmake b/buildcmake index 777c295172..170b9a5e2e 100755 --- a/buildcmake +++ b/buildcmake @@ -103,6 +103,7 @@ opt_ccs=0 opt_charmdebug=0 opt_controlpoint=0 opt_cuda=0 +opt_hip=0 opt_destination="" opt_disabletls=0 opt_install_prefix="" @@ -181,6 +182,9 @@ function parse_platform_compilers() { cuda) opt_cuda=1 ;; + hip) + opt_hip=1 + ;; cxi) opt_cxi=1 ;; @@ -681,6 +685,7 @@ CC=$opt_CC CXX=$opt_CXX FC=$opt_FC cmake "$my_srcdir" \ -DCHARMDEBUG="$opt_charmdebug" \ -DCONTROLPOINT="$opt_controlpoint" \ -DBUILD_CUDA="$opt_cuda" \ + -DBUILD_HIP="$opt_hip" \ -DDISABLE_TLS="$opt_disabletls" \ -DDRONE_MODE="$opt_drone_mode" \ -DENABLE_FORTRAN=$opt_enable_fortran \ diff --git a/cmake/converse.cmake b/cmake/converse.cmake index 2c0c4f34b8..badf4523fd 100644 --- a/cmake/converse.cmake +++ b/cmake/converse.cmake @@ -240,7 +240,8 @@ add_library(charm_cxx_utils STATIC add_library(topomanager STATIC ${tmgr-cxx-sources} - ${tmgr-h-sources}) + ${tmgr-h-sources} + $) target_include_directories(topomanager PUBLIC src/util/topomanager @@ -253,7 +254,7 @@ target_include_directories(topomanager PUBLIC # charm_cxx_utils # ) add_custom_target(converse) -add_dependencies(converse reconverse topomanager charm_cxx_utils) 
+add_dependencies(converse reconverse topomanager charm_cxx_utils ckrescale) #file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/comm_backend) diff --git a/cmake/fetch_reconverse/CMakeLists.txt b/cmake/fetch_reconverse/CMakeLists.txt index c26ef0edb8..1586a5c41c 100644 --- a/cmake/fetch_reconverse/CMakeLists.txt +++ b/cmake/fetch_reconverse/CMakeLists.txt @@ -27,5 +27,6 @@ FetchContent_MakeAvailable(reconverse) set(BUILD_SHARED_LIBS ${_save_BUILD_SHARED_LIBS} CACHE INTERNAL "") configure_file(${reconverse_SOURCE_DIR}/include/converse.h ${CMAKE_BINARY_DIR}/include/ COPYONLY) +configure_file(${reconverse_SOURCE_DIR}/include/conv-rdma.h ${CMAKE_BINARY_DIR}/include/ COPYONLY) configure_file(${reconverse_SOURCE_DIR}/src/cldb.h ${CMAKE_BINARY_DIR}/include/ COPYONLY) configure_file(${reconverse_SOURCE_DIR}/include/charm-config.h ${CMAKE_BINARY_DIR}/include/ COPYONLY) diff --git a/doc/charm++/manual.rst b/doc/charm++/manual.rst index 9a7e871430..e4db2af2ec 100644 --- a/doc/charm++/manual.rst +++ b/doc/charm++/manual.rst @@ -9314,7 +9314,8 @@ This entry method should be invoked on the sender by wrapping the source buffer with ``CkDeviceBuffer``, whose constructor takes a pointer to the source buffer, a Charm++ callback to be invoked once the transfer completes (optional), and a CUDA stream associated with the transfer -(which is only used internally in the CUDA memcpy and IPC based implementation and is also optional): +(which is only used internally in the CUDA memcpy and IPC based implementation and is also optional). +The user guarantees that the GPU buffer won't be modified until the callback is called: .. 
code-block:: c++ diff --git a/examples/ampi/Cjacobi3D/Makefile b/examples/ampi/Cjacobi3D/Makefile index 257f7ca208..277268a009 100644 --- a/examples/ampi/Cjacobi3D/Makefile +++ b/examples/ampi/Cjacobi3D/Makefile @@ -1,7 +1,7 @@ -include ../../common.mk -include ../../../include/conv-mach-opt.mak CHARMBASE=../../../ -CHARMC=../../../bin/ampicxx $(OPTS) +CHARMC=../../../netlrts-linux-x86_64/bin/ampicxx $(OPTS) TOKENS=6 -include $(CHARMBASE)/include/conv-mach-opt.mak @@ -12,6 +12,7 @@ AMPI_TARGETS := \ jacobi \ jacobi.pup \ jacobi-get \ + jacobi.pie ifeq (1,$(CMK_SUPPORTS_TLSGLOBALS)) AMPI_TARGETS += jacobi.tls @@ -47,6 +48,10 @@ jacobi.tls: jacobi.C $(CHARMC) -c -tlsglobals jacobi.C -o jacobi.tls.o $(CHARMC) -o jacobi.tls jacobi.tls.o -tlsglobals +jacobi.pie: jacobi-pie.C + $(CHARMC) -c -pieglobals jacobi-pie.C -o jacobi.pie.o + $(CHARMC) -o jacobi.pie jacobi.pie.o -pieglobals + jacobi.rose: jacobi.C $(CHARMC) -roseomptlsglobals -o jacobi.rose.o -c $< $(CHARMC) -roseomptlsglobals -o $@ jacobi.rose.o @@ -93,5 +98,5 @@ endif clean: - rm -f *.o jacobi *~ moduleinit.C charmrun conv-host jacobi-cpp jacobi.iso jacobi-get jacobi.tls ampirun + rm -f *.o jacobi *~ moduleinit.C charmrun conv-host jacobi-cpp jacobi.iso jacobi-get jacobi.tls jacobi.pie ampirun rm -rf 40 80 120 diff --git a/examples/ampi/Cjacobi3D/jacobi.C b/examples/ampi/Cjacobi3D/jacobi.C index 37d0cc7e7a..b15e6f3e96 100644 --- a/examples/ampi/Cjacobi3D/jacobi.C +++ b/examples/ampi/Cjacobi3D/jacobi.C @@ -27,39 +27,7 @@ class chunk { double rbzp[DIMX*DIMY]; }; -#ifdef AMPI -void chunk_pup(pup_er p, void *d) -{ - chunk **cpp = (chunk **) d; - if(pup_isUnpacking(p)) - *cpp = new chunk; - chunk *cp = *cpp; - pup_doubles(p, &cp->t[0][0][0], (DIMX+2)*(DIMY+2)*(DIMZ+2)); - pup_int(p, &cp->xidx); - pup_int(p, &cp->yidx); - pup_int(p, &cp->zidx); - pup_int(p, &cp->xp); - pup_int(p, &cp->xm); - pup_int(p, &cp->yp); - pup_int(p, &cp->ym); - pup_int(p, &cp->zp); - pup_int(p, &cp->zm); - pup_doubles(p, cp->sbxm, 
(DIMY*DIMZ)); - pup_doubles(p, cp->sbxp, (DIMY*DIMZ)); - pup_doubles(p, cp->rbxm, (DIMY*DIMZ)); - pup_doubles(p, cp->rbxp, (DIMY*DIMZ)); - pup_doubles(p, cp->sbym, (DIMX*DIMZ)); - pup_doubles(p, cp->sbyp, (DIMX*DIMZ)); - pup_doubles(p, cp->rbym, (DIMX*DIMZ)); - pup_doubles(p, cp->rbyp, (DIMX*DIMZ)); - pup_doubles(p, cp->sbzm, (DIMX*DIMY)); - pup_doubles(p, cp->sbzp, (DIMX*DIMY)); - pup_doubles(p, cp->rbzm, (DIMX*DIMY)); - pup_doubles(p, cp->rbzp, (DIMX*DIMY)); - if(pup_isDeleting(p)) - delete cp; -} -#endif +__thread chunk cp; #define abs(x) ((x)<0.0 ? -(x) : (x)) @@ -102,7 +70,6 @@ int main(int ac, char** av) int i, j, k, m; int iter, niter, cp_idx; double maxerr, error, tval, starttime, itertime; - chunk *cp; int rank, size; MPI_Request req[12]; @@ -132,74 +99,64 @@ int main(int ac, char** av) MPI_Bcast(&niter, 1, MPI_INT, 0, MPI_COMM_WORLD); - cp = new chunk; -#if defined(AMPI) && ! defined(NO_PUP) - AMPI_Register_pup((MPI_PupFn)chunk_pup, (void*)&cp, &cp_idx); -#endif - - index3d(rank, cp->xidx, cp->yidx, cp->zidx); - cp->xp = index1d((cp->xidx+1)%NX,cp->yidx,cp->zidx); - cp->xm = index1d((cp->xidx+NX-1)%NX,cp->yidx,cp->zidx); - cp->yp = index1d(cp->xidx,(cp->yidx+1)%NY,cp->zidx); - cp->ym = index1d(cp->xidx,(cp->yidx+NY-1)%NY,cp->zidx); - cp->zp = index1d(cp->xidx,cp->yidx,(cp->zidx+1)%NZ); - cp->zm = index1d(cp->xidx,cp->yidx,(cp->zidx+NZ-1)%NZ); + index3d(rank, cp.xidx, cp.yidx, cp.zidx); + cp.xp = index1d((cp.xidx+1)%NX,cp.yidx,cp.zidx); + cp.xm = index1d((cp.xidx+NX-1)%NX,cp.yidx,cp.zidx); + cp.yp = index1d(cp.xidx,(cp.yidx+1)%NY,cp.zidx); + cp.ym = index1d(cp.xidx,(cp.yidx+NY-1)%NY,cp.zidx); + cp.zp = index1d(cp.xidx,cp.yidx,(cp.zidx+1)%NZ); + cp.zm = index1d(cp.xidx,cp.yidx,(cp.zidx+NZ-1)%NZ); for(i=1; i<=DIMZ; i++) for(j=1; j<=DIMY; j++) for(k=1; k<=DIMX; k++) - cp->t[k][j][i] = DIMY*DIMX*(i-1) + DIMX*(j-2) + (k-1); + cp.t[k][j][i] = DIMY*DIMX*(i-1) + DIMX*(j-2) + (k-1); MPI_Barrier(MPI_COMM_WORLD); starttime = MPI_Wtime(); for(iter=1; iter<=niter; 
iter++) { maxerr = 0.0; - copyout(cp->sbxm, cp->t, 1, 1, 1, DIMY, 1, DIMZ); - copyout(cp->sbxp, cp->t, DIMX, DIMX, 1, DIMY, 1, DIMZ); - copyout(cp->sbym, cp->t, 1, DIMX, 1, 1, 1, DIMZ); - copyout(cp->sbyp, cp->t, 1, DIMX, DIMY, DIMY, 1, DIMZ); - copyout(cp->sbzm, cp->t, 1, DIMX, 1, DIMY, 1, 1); - copyout(cp->sbzp, cp->t, 1, DIMX, 1, DIMY, DIMZ, DIMZ); - - MPI_Irecv(cp->rbxp, DIMY*DIMZ, MPI_DOUBLE, cp->xp, 0, MPI_COMM_WORLD, &req[0]); - MPI_Irecv(cp->rbxm, DIMY*DIMZ, MPI_DOUBLE, cp->xm, 1, MPI_COMM_WORLD, &req[1]); - MPI_Irecv(cp->rbyp, DIMX*DIMZ, MPI_DOUBLE, cp->yp, 2, MPI_COMM_WORLD, &req[2]); - MPI_Irecv(cp->rbym, DIMX*DIMZ, MPI_DOUBLE, cp->ym, 3, MPI_COMM_WORLD, &req[3]); - MPI_Irecv(cp->rbzm, DIMX*DIMY, MPI_DOUBLE, cp->zm, 5, MPI_COMM_WORLD, &req[4]); - MPI_Irecv(cp->rbzp, DIMX*DIMY, MPI_DOUBLE, cp->zp, 4, MPI_COMM_WORLD, &req[5]); - - MPI_Isend(cp->sbxm, DIMY*DIMZ, MPI_DOUBLE, cp->xm, 0, MPI_COMM_WORLD, &req[6]); - MPI_Isend(cp->sbxp, DIMY*DIMZ, MPI_DOUBLE, cp->xp, 1, MPI_COMM_WORLD, &req[7]); - MPI_Isend(cp->sbym, DIMX*DIMZ, MPI_DOUBLE, cp->ym, 2, MPI_COMM_WORLD, &req[8]); - MPI_Isend(cp->sbyp, DIMX*DIMZ, MPI_DOUBLE, cp->yp, 3, MPI_COMM_WORLD, &req[9]); - MPI_Isend(cp->sbzm, DIMX*DIMY, MPI_DOUBLE, cp->zm, 4, MPI_COMM_WORLD, &req[10]); - MPI_Isend(cp->sbzp, DIMX*DIMY, MPI_DOUBLE, cp->zp, 5, MPI_COMM_WORLD, &req[11]); + copyout(cp.sbxm, cp.t, 1, 1, 1, DIMY, 1, DIMZ); + copyout(cp.sbxp, cp.t, DIMX, DIMX, 1, DIMY, 1, DIMZ); + copyout(cp.sbym, cp.t, 1, DIMX, 1, 1, 1, DIMZ); + copyout(cp.sbyp, cp.t, 1, DIMX, DIMY, DIMY, 1, DIMZ); + copyout(cp.sbzm, cp.t, 1, DIMX, 1, DIMY, 1, 1); + copyout(cp.sbzp, cp.t, 1, DIMX, 1, DIMY, DIMZ, DIMZ); + + MPI_Irecv(cp.rbxp, DIMY*DIMZ, MPI_DOUBLE, cp.xp, 0, MPI_COMM_WORLD, &req[0]); + MPI_Irecv(cp.rbxm, DIMY*DIMZ, MPI_DOUBLE, cp.xm, 1, MPI_COMM_WORLD, &req[1]); + MPI_Irecv(cp.rbyp, DIMX*DIMZ, MPI_DOUBLE, cp.yp, 2, MPI_COMM_WORLD, &req[2]); + MPI_Irecv(cp.rbym, DIMX*DIMZ, MPI_DOUBLE, cp.ym, 3, MPI_COMM_WORLD, &req[3]); + 
MPI_Irecv(cp.rbzm, DIMX*DIMY, MPI_DOUBLE, cp.zm, 5, MPI_COMM_WORLD, &req[4]); + MPI_Irecv(cp.rbzp, DIMX*DIMY, MPI_DOUBLE, cp.zp, 4, MPI_COMM_WORLD, &req[5]); + + MPI_Isend(cp.sbxm, DIMY*DIMZ, MPI_DOUBLE, cp.xm, 0, MPI_COMM_WORLD, &req[6]); + MPI_Isend(cp.sbxp, DIMY*DIMZ, MPI_DOUBLE, cp.xp, 1, MPI_COMM_WORLD, &req[7]); + MPI_Isend(cp.sbym, DIMX*DIMZ, MPI_DOUBLE, cp.ym, 2, MPI_COMM_WORLD, &req[8]); + MPI_Isend(cp.sbyp, DIMX*DIMZ, MPI_DOUBLE, cp.yp, 3, MPI_COMM_WORLD, &req[9]); + MPI_Isend(cp.sbzm, DIMX*DIMY, MPI_DOUBLE, cp.zm, 4, MPI_COMM_WORLD, &req[10]); + MPI_Isend(cp.sbzp, DIMX*DIMY, MPI_DOUBLE, cp.zp, 5, MPI_COMM_WORLD, &req[11]); MPI_Waitall(12, req, MPI_STATUSES_IGNORE); - copyin(cp->sbxm, cp->t, 0, 0, 1, DIMY, 1, DIMZ); - copyin(cp->sbxp, cp->t, DIMX+1, DIMX+1, 1, DIMY, 1, DIMZ); - copyin(cp->sbym, cp->t, 1, DIMX, 0, 0, 1, DIMZ); - copyin(cp->sbyp, cp->t, 1, DIMX, DIMY+1, DIMY+1, 1, DIMZ); - copyin(cp->sbzm, cp->t, 1, DIMX, 1, DIMY, 0, 0); - copyin(cp->sbzp, cp->t, 1, DIMX, 1, DIMY, DIMZ+1, DIMZ+1); - - if(iter > 25 && iter < 85 && rank == 35) - m = 9; - else - m = 1; - for(; m>0; m--) - for(i=1; i<=DIMZ; i++) - for(j=1; j<=DIMY; j++) - for(k=1; k<=DIMX; k++) { - tval = (cp->t[k][j][i] + cp->t[k][j][i+1] + - cp->t[k][j][i-1] + cp->t[k][j+1][i] + - cp->t[k][j-1][i] + cp->t[k+1][j][i] + - cp->t[k-1][j][i]) / 7.0; - error = abs(tval-cp->t[k][j][i]); - cp->t[k][j][i] = tval; - if (error > maxerr) maxerr = error; - } + copyin(cp.sbxm, cp.t, 0, 0, 1, DIMY, 1, DIMZ); + copyin(cp.sbxp, cp.t, DIMX+1, DIMX+1, 1, DIMY, 1, DIMZ); + copyin(cp.sbym, cp.t, 1, DIMX, 0, 0, 1, DIMZ); + copyin(cp.sbyp, cp.t, 1, DIMX, DIMY+1, DIMY+1, 1, DIMZ); + copyin(cp.sbzm, cp.t, 1, DIMX, 1, DIMY, 0, 0); + copyin(cp.sbzp, cp.t, 1, DIMX, 1, DIMY, DIMZ+1, DIMZ+1); + + for(i=1; i<=DIMZ; i++) + for(j=1; j<=DIMY; j++) + for(k=1; k<=DIMX; k++) { + tval = (cp.t[k][j][i] + cp.t[k][j][i+1] + + cp.t[k][j][i-1] + cp.t[k][j+1][i] + + cp.t[k][j-1][i] + cp.t[k+1][j][i] + + cp.t[k-1][j][i]) / 7.0; + error 
= abs(tval-cp.t[k][j][i]); + cp.t[k][j][i] = tval; + if (error > maxerr) maxerr = error; + } MPI_Allreduce(MPI_IN_PLACE, &maxerr, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); itertime = MPI_Wtime() - starttime; @@ -207,11 +164,6 @@ int main(int ac, char** av) if (rank == 0) printf("iter %d time: %lf maxerr: %lf\n", iter, itertime / size, maxerr); starttime = MPI_Wtime(); -#ifdef AMPI - if(iter%10 == 5) { - AMPI_Migrate(AMPI_INFO_LB_SYNC); - } -#endif } MPI_Finalize(); return 0; diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile new file mode 100644 index 0000000000..14e380e0fe --- /dev/null +++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile @@ -0,0 +1,31 @@ +OPTS = -O3 -DHAPI_CUDA_CALLBACK + +CHARM_DIR = ../../../../.. +CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS) +CHARM_INC = -I$(CHARM_DIR)/include + +NVCC = nvcc +NVCC_FLAGS = -c -std=c++11 -use_fast_math $(OPTS) +LD_LIBS = -module EveryLB + +TARGET = jacobi2d +all: $(TARGET) + +OBJS = $(TARGET).o $(TARGET)CUDA.o + +$(TARGET): $(OBJS) + $(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS) + +$(TARGET).decl.h: $(TARGET).ci $(TARGET).h + $(CHARMC) $< + +$(TARGET).def.h: $(TARGET).ci $(TARGET).h + +$(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h $(TARGET).h + $(CHARMC) -c $< + +$(TARGET)CUDA.o: $(TARGET).cu $(TARGET).h + $(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $< + +clean: + rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C new file mode 100644 index 0000000000..8a427e1508 --- /dev/null +++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C @@ -0,0 +1,646 @@ +#include "hapi.h" +#include "hapi_nvtx.h" +#include "jacobi2d.decl.h" +#include "jacobi2d.h" +#include +#include + +#define COMM_ONLY 0 +#define CUDA_SYNC 0 + +/* readonly */ CProxy_Main main_proxy; +/* 
readonly */ CProxy_Block block_proxy; +/* readonly */ int grid_width; +/* readonly */ int grid_height; +/* readonly */ int block_width; +/* readonly */ int block_height; +/* readonly */ int n_chares_x; +/* readonly */ int n_chares_y; +/* readonly */ int n_iters; +/* readonly */ int warmup_iters; +/* readonly */ bool sync_ver; +/* readonly */ bool use_zerocopy; +/* readonly */ bool print_elements; +/* readonly */ int lb_freq; +/* readonly */ int first_lb; +/* readonly */ int imbalance; + +extern void invokeInitKernel(DataType* d_temperature, int block_width, + int block_height, cudaStream_t stream); +extern void invokeBoundaryKernels(DataType* d_temperature, int block_width, + int block_height, bool left_bound, bool right_bound, bool top_bound, + bool bottom_bound, cudaStream_t stream); +extern void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature, + int block_width, int block_height, int iter, cudaStream_t stream); +extern void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost, + DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width, + int block_height, cudaStream_t stream); +extern void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, + bool is_left, int block_width, int block_height, cudaStream_t stream); + +enum Direction { LEFT = 1, RIGHT, TOP, BOTTOM }; + +class Main : public CBase_Main { + int my_iter; + double init_start_time; + double start_time; + double comm_start_time; + double comm_agg_time; + double update_start_time; + double update_agg_time; + +public: + Main(CkArgMsg* m) { + // Set default values + main_proxy = thisProxy; + grid_width = 8192; + grid_height = 8192; + block_width = 2048; + block_height = 2048; + n_iters = 100; + warmup_iters = 10; + use_zerocopy = false; + print_elements = false; + sync_ver = false; + my_iter = 0; + first_lb = 10; + lb_freq = 100; + imbalance = 5; // Max extra iterations for load imbalance + + // Initialize aggregate timers + 
update_agg_time = 0.0; + comm_agg_time = 0.0; + + // Process arguments + int c; + while ((c = getopt(m->argc, m->argv, "W:H:w:h:i:b:f:m:u:yzp")) != -1) { + switch (c) { + case 'W': + grid_width = atoi(optarg); + break; + case 'H': + grid_height = atoi(optarg); + break; + case 'w': + block_width = atoi(optarg); + break; + case 'h': + block_height = atoi(optarg); + break; + case 'i': + n_iters = atoi(optarg); + break; + case 'b': + lb_freq = atoi(optarg); + break; + case 'f': + first_lb = atoi(optarg); + break; + case 'm': + imbalance = atoi(optarg); + break; + case 'u': + warmup_iters = atoi(optarg); + break; + case 'y': + sync_ver = true; + break; + case 'z': + use_zerocopy = true; + break; + case 'p': + print_elements = true; + break; + default: + CkPrintf( + "Usage: %s -W [grid width] -H [grid height] -w [block width] -h [block height]" + "-b [lb frequency] -f [first lb] -m [max imbalance] " + "-i [iterations] -u [warmup] -y (use sync version) -z (use GPU zerocopy) -p (print blocks)\n", + m->argv[0]); + CkExit(); + } + } + delete m; + + if (grid_width % block_width != 0 || grid_height % block_height != 0) { + CkAbort("Invalid grid & block configuration\n"); + } + + // Number of chares per dimension + n_chares_x = grid_width / block_width; + n_chares_y = grid_height / block_height; + + // Print configuration + CkPrintf("\n[CUDA 2D Jacobi example]\n"); + CkPrintf("Grid: %d x %d, Block: %d x %d, Chares: %d x %d, Iterations: %d, " + "Warm-up: %d, Bulk-synchronous: %d, Zerocopy: %d, Print: %d\n\n", + grid_width, grid_height, block_width, block_height, n_chares_x, n_chares_y, + n_iters, warmup_iters, sync_ver, use_zerocopy, print_elements); + + // Create blocks and start iteration + block_proxy = CProxy_Block::ckNew(n_chares_x, n_chares_y); + init_start_time = CkWallTimer(); + block_proxy.init(); + } + + void initDone() { + CkPrintf("Init time: %.3lf s\n", CkWallTimer() - init_start_time); + + startIter(); + } + + void startIter() { + if (my_iter++ == warmup_iters) 
start_time = CkWallTimer(); + update_start_time = CkWallTimer(); + + block_proxy.exchangeGhosts(); + } + + void updateDone() { + if (my_iter > warmup_iters) update_agg_time += CkWallTimer() - update_start_time; + comm_start_time = CkWallTimer(); + + block_proxy.packGhosts(); + } + + void commDone() { + if (my_iter > warmup_iters) comm_agg_time += CkWallTimer() - comm_start_time; + + if (my_iter == warmup_iters + n_iters) { + allDone(); + } else { + startIter(); + } + } + + void allDone() { + double total_time = CkWallTimer() - start_time; + CkPrintf("Total time: %.3lf s\nAverage iteration time: %.3lf us\n", + total_time, (total_time / n_iters) * 1e6); + if (sync_ver) { + CkPrintf("Comm time per iteration: %.3lf us\nUpdate time per iteration: %.3lf us\n", + (comm_agg_time / n_iters) * 1e6, (update_agg_time / n_iters) * 1e6); + } + + if (print_elements) { + sleep(1); + block_proxy(0,0).print(); + } else { + CkExit(); + } + } + + void printDone() { + CkExit(); + } +}; + +class Block : public CBase_Block { + Block_SDAG_CODE + + public: + int my_iter; + int neighbors; + int remote_count; + int x, y; + int load_iters; + + DataType* __restrict__ h_temperature; + DataType* __restrict__ d_temperature; + DataType* __restrict__ d_new_temperature; + DataType* __restrict__ h_left_ghost; + DataType* __restrict__ h_right_ghost; + DataType* __restrict__ h_top_ghost; + DataType* __restrict__ h_bottom_ghost; + DataType* __restrict__ d_left_ghost; + DataType* __restrict__ d_right_ghost; + DataType* __restrict__ d_send_left_ghost; + DataType* __restrict__ d_send_right_ghost; + DataType* __restrict__ d_send_top_ghost; + DataType* __restrict__ d_send_bottom_ghost; + DataType* __restrict__ d_recv_left_ghost; + DataType* __restrict__ d_recv_right_ghost; + + cudaStream_t compute_stream; + cudaStream_t comm_stream; + + cudaEvent_t compute_event; + cudaEvent_t comm_event; + + bool left_bound, right_bound, top_bound, bottom_bound; + + Block() { + usesAtSync = true; + } + + 
Block(CkMigrateMessage* m) { + usesAtSync = true; + hapiCheck(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, 0)); + hapiCheck(cudaStreamCreateWithPriority(&comm_stream, cudaStreamDefault, -1)); + + hapiCheck(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming)); + hapiCheck(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming)); + } + + ~Block() { + // hapiCheck(cudaFreeHost(h_temperature)); + hapiCheck(cudaFree(d_temperature)); + hapiCheck(cudaFree(d_new_temperature)); + // hapiCheck(cudaFreeHost(h_left_ghost)); + // hapiCheck(cudaFreeHost(h_right_ghost)); + // hapiCheck(cudaFreeHost(h_top_ghost)); + // hapiCheck(cudaFreeHost(h_bottom_ghost)); + if (!use_zerocopy) { + hapiCheck(cudaFree(d_left_ghost)); + hapiCheck(cudaFree(d_right_ghost)); + } else { + hapiCheck(cudaFree(d_send_left_ghost)); + hapiCheck(cudaFree(d_send_right_ghost)); + hapiCheck(cudaFree(d_send_top_ghost)); + hapiCheck(cudaFree(d_send_bottom_ghost)); + hapiCheck(cudaFree(d_recv_left_ghost)); + hapiCheck(cudaFree(d_recv_right_ghost)); + } + + hapiCheck(cudaStreamDestroy(compute_stream)); + hapiCheck(cudaStreamDestroy(comm_stream)); + + hapiCheck(cudaEventDestroy(compute_event)); + hapiCheck(cudaEventDestroy(comm_event)); + } + + void pup(PUP::er& p) { + p | my_iter; + p | neighbors; + p | remote_count; + p | x; + p | y; + p | left_bound; + p | right_bound; + p | top_bound; + p | bottom_bound; + p | load_iters; + + if (p.isUnpacking()) { + // hapiCheck(hapiMallocHost((void**)&h_temperature, + // sizeof(DataType) * (block_width + 2) * (block_height + 2))); + hapiCheck(hapiMalloc((void**)&d_temperature, + sizeof(DataType) * (block_width + 2) * (block_height + 2))); + hapiCheck(hapiMalloc((void**)&d_new_temperature, + sizeof(DataType) * (block_width + 2) * (block_height + 2))); + // hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height)); + // hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height)); + // 
hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width)); + // hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width)); + if (!use_zerocopy) { + hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height)); + } else { + hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width)); + hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width)); + hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height)); + } + } + + p(d_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE); + p(d_new_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE); + } + + void init() { + // Initialize values + my_iter = 0; + neighbors = 0; + x = thisIndex.x; + y = thisIndex.y; + + load_iters = (((float) (x + y)) / (n_chares_x + n_chares_y)) * imbalance; + //CkPrintf("Block (%d,%d) load iters: %d\n", x, y, load_iters); + + std::ostringstream os; + os << "Init (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::Turquoise); + + // Check bounds and set number of valid neighbors + left_bound = right_bound = top_bound = bottom_bound = false; + if (thisIndex.x == 0) + left_bound = true; + else + neighbors++; + if (thisIndex.x == n_chares_x - 1) + right_bound = true; + else + neighbors++; + if (thisIndex.y == 0) + top_bound = true; + else + neighbors++; + if (thisIndex.y == n_chares_y - 1) + bottom_bound = true; + else + neighbors++; + + // Allocate memory and create CUDA entities + hapiCheck(hapiMallocHost((void**)&h_temperature, + sizeof(DataType) 
* (block_width + 2) * (block_height + 2))); + hapiCheck(hapiMalloc((void**)&d_temperature, + sizeof(DataType) * (block_width + 2) * (block_height + 2))); + hapiCheck(hapiMalloc((void**)&d_new_temperature, + sizeof(DataType) * (block_width + 2) * (block_height + 2))); + hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width)); + hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width)); + if (!use_zerocopy) { + hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height)); + } else { + hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width)); + hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width)); + hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height)); + hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height)); + } + + hapiCheck(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, 0)); + hapiCheck(cudaStreamCreateWithPriority(&comm_stream, cudaStreamDefault, -1)); + + hapiCheck(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming)); + hapiCheck(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming)); + + // Initialize temperature data + invokeInitKernel(d_temperature, block_width, block_height, compute_stream); + invokeInitKernel(d_new_temperature, block_width, block_height, compute_stream); + + // Enforce boundary conditions + invokeBoundaryKernels(d_temperature, block_width, block_height, left_bound, + right_bound, top_bound, bottom_bound, 
compute_stream); + invokeBoundaryKernels(d_new_temperature, block_width, block_height, left_bound, + right_bound, top_bound, bottom_bound, compute_stream); + +#if CUDA_SYNC + cudaStreamSynchronize(compute_stream); + thisProxy[thisIndex].initDone(); +#else + // TODO: Support reduction callback in hapiAddCallback + CkCallback* cb = new CkCallback(CkIndex_Block::initDone(), thisProxy[thisIndex]); + hapiAddCallback(compute_stream, cb); +#endif + } + + void initDone() { + contribute(CkCallback(CkReductionTarget(Main, initDone), main_proxy)); + } + + void iterate() { + if (my_iter == first_lb || (my_iter != 0 && my_iter % lb_freq == 0)) { + cudaStreamSynchronize(comm_stream); + cudaStreamSynchronize(compute_stream); + AtSync(); + } else { + thisProxy[thisIndex].exchangeGhosts(); + } + } + + void ResumeFromSync() { + thisProxy[thisIndex].exchangeGhosts(); + } + + void update() { + std::ostringstream os; + os << "update (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::WetAsphalt); + + // Operations in compute stream should only be executed when + // operations in communication stream (transfers and unpacking) complete + hapiCheck(cudaEventRecord(comm_event, comm_stream)); + hapiCheck(cudaStreamWaitEvent(compute_stream, comm_event, 0)); + +#if !COMM_ONLY + // Invoke GPU kernel for Jacobi computation + invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, load_iters, + compute_stream); +#endif + + // Operations in communication stream (packing and transfers) should + // only be executed when operations in compute stream complete + hapiCheck(cudaEventRecord(compute_event, compute_stream)); + hapiCheck(cudaStreamWaitEvent(comm_stream, compute_event, 0)); + + // Copy final temperature data back to host + if (print_elements && (my_iter == warmup_iters + n_iters)) { + hapiCheck(hapiMemcpyAsync(h_temperature, d_new_temperature, + sizeof(DataType) * (block_width + 2) * (block_height + 2), + 
cudaMemcpyDeviceToHost, comm_stream)); + } + + if (sync_ver) { +#if CUDA_SYNC + cudaStreamSynchronize(compute_stream); + thisProxy[thisIndex].updateDone(); +#else + CkCallback* cb = new CkCallback(CkIndex_Block::updateDone(), thisProxy[thisIndex]); + hapiAddCallback(compute_stream, cb); +#endif + } + } + + void updateDone() { + contribute(CkCallback(CkReductionTarget(Main, updateDone), main_proxy)); + } + + void packGhosts() { + std::ostringstream os; + os << "packGhosts (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::Emerald); + + if (use_zerocopy) { +#if !COMM_ONLY + // Pack non-contiguous ghosts to temporary contiguous buffers on device + invokePackingKernels(d_new_temperature, d_send_left_ghost, d_send_right_ghost, + left_bound, right_bound, block_width, block_height, comm_stream); +#endif + + // Copy top and bottom ghosts to send buffers + if (!top_bound) + hapiCheck(hapiMemcpyAsync(d_send_top_ghost, d_new_temperature + (block_width + 2) + 1, + block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream)); + if (!bottom_bound) + hapiCheck(hapiMemcpyAsync(d_send_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1, + block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream)); + } else { +#if !COMM_ONLY + // Pack non-contiguous ghosts to temporary contiguous buffers on device + invokePackingKernels(d_new_temperature, d_left_ghost, d_right_ghost, + left_bound, right_bound, block_width, block_height, comm_stream); +#endif + + // Transfer ghosts from device to host + if (!left_bound) + hapiCheck(hapiMemcpyAsync(h_left_ghost, d_left_ghost, block_height * sizeof(DataType), + cudaMemcpyDeviceToHost, comm_stream)); + if (!right_bound) + hapiCheck(hapiMemcpyAsync(h_right_ghost, d_right_ghost, block_height * sizeof(DataType), + cudaMemcpyDeviceToHost, comm_stream)); + if (!top_bound) + hapiCheck(hapiMemcpyAsync(h_top_ghost, d_new_temperature + (block_width + 2) + 1, + block_width * 
sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream)); + if (!bottom_bound) + hapiCheck(hapiMemcpyAsync(h_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1, + block_width * sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream)); + } + +#if CUDA_SYNC + cudaStreamSynchronize(comm_stream); + thisProxy[thisIndex].packGhostsDone(); +#else + // Add asynchronous callback to be invoked when packing kernels and + // ghost transfers are complete + CkCallback* cb = new CkCallback(CkIndex_Block::packGhostsDone(), thisProxy[thisIndex]); + hapiAddCallback(comm_stream, cb); +#endif + } + + void sendGhosts() { + std::ostringstream os; + os << "sendGhosts (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::PeterRiver); + + // Send ghosts to neighboring chares + if (use_zerocopy) { + if (!left_bound) + thisProxy(x - 1, y).receiveGhostsZC(my_iter, RIGHT, block_height, + CkDeviceBuffer(d_send_left_ghost, comm_stream)); + if (!right_bound) + thisProxy(x + 1, y).receiveGhostsZC(my_iter, LEFT, block_height, + CkDeviceBuffer(d_send_right_ghost, comm_stream)); + if (!top_bound) + thisProxy(x, y - 1).receiveGhostsZC(my_iter, BOTTOM, block_width, + CkDeviceBuffer(d_send_top_ghost, comm_stream)); + if (!bottom_bound) + thisProxy(x, y + 1).receiveGhostsZC(my_iter, TOP, block_width, + CkDeviceBuffer(d_send_bottom_ghost, comm_stream)); + } else { + if (!left_bound) + thisProxy(x - 1, y).receiveGhostsReg(my_iter, RIGHT, block_height, h_left_ghost); + if (!right_bound) + thisProxy(x + 1, y).receiveGhostsReg(my_iter, LEFT, block_height, h_right_ghost); + if (!top_bound) + thisProxy(x, y - 1).receiveGhostsReg(my_iter, BOTTOM, block_width, h_top_ghost); + if (!bottom_bound) + thisProxy(x, y + 1).receiveGhostsReg(my_iter, TOP, block_width, h_bottom_ghost); + } + } + + // This is the post entry method, the regular entry method is defined as a + // SDAG entry method in the .ci file + void receiveGhostsZC(int ref, int dir, int &size, 
DataType *&buf, CkDeviceBufferPost *devicePost) { + switch (dir) { + case LEFT: + buf = d_recv_left_ghost; + break; + case RIGHT: + buf = d_recv_right_ghost; + break; + case TOP: + buf = d_temperature + 1; + break; + case BOTTOM: + buf = d_temperature + (block_width + 2) * (block_height + 1) + 1; + break; + default: + CkAbort("Error: invalid direction"); + } + devicePost[0].hapi_stream = comm_stream; + } + + void processGhostsZC(int dir, int size, DataType* gh) { + std::ostringstream os; + os << "processGhostsZC (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::Amethyst); + + switch (dir) { + case LEFT: + invokeUnpackingKernel(d_temperature, d_recv_left_ghost, true, block_width, + block_height, comm_stream); + break; + case RIGHT: + invokeUnpackingKernel(d_temperature, d_recv_right_ghost, false, block_width, + block_height, comm_stream); + break; + case TOP: + case BOTTOM: + break; + default: + CkAbort("Error: invalid direction"); + } + } + + void processGhostsReg(int dir, int size, DataType* gh) { + std::ostringstream os; + os << "processGhostsReg (" << std::to_string(x) << "," << std::to_string(y) << ")"; + NVTXTracer(os.str(), NVTXColor::Amethyst); + + switch (dir) { + case LEFT: + memcpy(h_left_ghost, gh, size * sizeof(DataType)); + hapiCheck(hapiMemcpyAsync(d_left_ghost, h_left_ghost, + block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream)); +#if !COMM_ONLY + invokeUnpackingKernel(d_temperature, d_left_ghost, true, block_width, + block_height, comm_stream); +#endif + break; + case RIGHT: + memcpy(h_right_ghost, gh, size * sizeof(DataType)); + hapiCheck(hapiMemcpyAsync(d_right_ghost, h_right_ghost, + block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream)); +#if !COMM_ONLY + invokeUnpackingKernel(d_temperature, d_right_ghost, false, block_width, + block_height, comm_stream); +#endif + break; + case TOP: + memcpy(h_top_ghost, gh, size * sizeof(DataType)); + 
hapiCheck(hapiMemcpyAsync(d_temperature + 1, h_top_ghost, + block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream)); + break; + case BOTTOM: + memcpy(h_bottom_ghost, gh, size * sizeof(DataType)); + hapiCheck(hapiMemcpyAsync(d_temperature + (block_width + 2) * (block_height + 1) + 1, + h_bottom_ghost, block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream)); + break; + default: + CkAbort("Error: invalid direction"); + } + } + + void print() { + CkPrintf("[%d,%d]\n", thisIndex.x, thisIndex.y); + for (int j = 0; j < block_height + 2; j++) { + for (int i = 0; i < block_width + 2; i++) { +#ifdef TEST_CORRECTNESS + CkPrintf("%d ", h_temperature[(block_width + 2) * j + i]); +#else + CkPrintf("%.6lf ", h_temperature[(block_width + 2) * j + i]); +#endif + } + CkPrintf("\n"); + } + + if (!(thisIndex.x == n_chares_x-1 && thisIndex.y == n_chares_y-1)) { + if (thisIndex.x == n_chares_x-1) { + thisProxy(0,thisIndex.y+1).print(); + } else { + thisProxy(thisIndex.x+1,thisIndex.y).print(); + } + } else { + main_proxy.printDone(); + } + } +}; + +#include "jacobi2d.def.h" diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci new file mode 100644 index 0000000000..3957f23bfd --- /dev/null +++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci @@ -0,0 +1,90 @@ +mainmodule jacobi2d { + include "jacobi2d.h"; + + readonly CProxy_Main main_proxy; + readonly CProxy_Block block_proxy; + readonly int grid_width; + readonly int grid_height; + readonly int block_width; + readonly int block_height; + readonly int n_chares_x; + readonly int n_chares_y; + readonly int n_iters; + readonly int warmup_iters; + readonly bool sync_ver; + readonly bool use_zerocopy; + readonly bool print_elements; + readonly int lb_freq; + readonly int first_lb; + readonly int imbalance; + + mainchare Main { + entry Main(CkArgMsg* m); + entry [reductiontarget] void initDone(); + entry void 
startIter(); + entry [reductiontarget] void updateDone(); + entry [reductiontarget] void commDone(); + entry [reductiontarget] void allDone(); + entry void printDone(); + }; + + array [2D] Block { + entry Block(void); + entry void init(); + entry void initDone(); + entry void update(); + entry void updateDone(); + entry void packGhosts(); + entry void packGhostsDone(); + entry void receiveGhostsZC(int ref, int dir, int w, nocopydevice DataType gh[w]); + entry void receiveGhostsReg(int ref, int dir, int w, DataType gh[w]); + entry void iterate(); + + entry void exchangeGhosts() { + serial { + my_iter++; + update(); + if (!sync_ver) packGhosts(); + } + + when packGhostsDone() { + serial { + // When packing is done, we know that the new temperatures have been updated + // (because the host doesn't separately detect when the Jacobi kernel completes) + std::swap(d_temperature, d_new_temperature); + sendGhosts(); + } + } + + for (remote_count = 0; remote_count < neighbors; remote_count++) { + if (use_zerocopy) { + when receiveGhostsZC[my_iter](int ref, int dir, int w, nocopydevice DataType buf[w]) { + serial { + processGhostsZC(dir, w, buf); + } + } + } else { + when receiveGhostsReg[my_iter](int ref, int dir, int w, DataType buf[w]) { + serial { + processGhostsReg(dir, w, buf); + } + } + } + } + + serial { + if (sync_ver || my_iter <= warmup_iters) { + contribute(CkCallback(CkReductionTarget(Main, commDone), main_proxy)); + } else { + if (my_iter < warmup_iters + n_iters) { + thisProxy[thisIndex].iterate(); + } else { + contribute(CkCallback(CkReductionTarget(Main, allDone), main_proxy)); + } + } + } + } + + entry void print(); + }; +}; diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu new file mode 100644 index 0000000000..de1001d1c6 --- /dev/null +++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu @@ -0,0 +1,195 @@ +#include "hapi.h" +#include "jacobi2d.h" + 
+// Edge length of the square thread block used by the 2D kernels; the 1D
+// kernels use TILE_SIZE * TILE_SIZE threads per block.
+#define TILE_SIZE 16
+// Weight applied to the sum of the five stencil points (centre + 4 neighbours).
+#define DIVIDEBY5 0.2
+
+// Zero every cell of the (block_width + 2) x (block_height + 2) array, halo
+// included. One thread per cell; IDX() reads block_width from the enclosing
+// scope, so the parameter name must stay as is.
+__global__ void initKernel(DataType* temperature, int block_width,
+    int block_height) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int j = blockDim.y * blockIdx.y + threadIdx.y;
+  if (i < block_width + 2 && j < block_height + 2) {
+    temperature[IDX(i,j)] = 0;
+  }
+}
+
+// Set the left halo column (x = 0, rows 1..block_height) to 1.
+__global__ void leftBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    temperature[IDX(0,1+j)] = 1;
+  }
+}
+
+// Set the right halo column (x = block_width + 1, rows 1..block_height) to 1.
+__global__ void rightBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    temperature[IDX(block_width+1,1+j)] = 1;
+  }
+}
+
+// Set the top halo row (y = 0, columns 1..block_width) to 1.
+__global__ void topBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < block_width) {
+    temperature[IDX(1+i,0)] = 1;
+  }
+}
+
+// Set the bottom halo row (y = block_height + 1, columns 1..block_width) to 1.
+__global__ void bottomBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < block_width) {
+    temperature[IDX(1+i,block_height+1)] = 1;
+  }
+}
+
+// One Jacobi step over the interior cells (1..block_width, 1..block_height):
+// each output cell is the 5-point stencil of `temperature` written to
+// `new_temperature`.
+//
+// `iter` is an artificial load factor: the same stencil is evaluated `iter`
+// times and the results averaged, so the value matches a single sweep while
+// the kernel does `iter` times the work. It is ignored when TEST_CORRECTNESS
+// is defined.
+__global__ void jacobiKernel(DataType* temperature, DataType* new_temperature,
+    int block_width, int block_height, int iter) {
+  int i = (blockDim.x * blockIdx.x + threadIdx.x) + 1;
+  int j = (blockDim.y * blockIdx.y + threadIdx.y) + 1;
+
+  if (i <= block_width && j <= block_height) {
+#ifdef TEST_CORRECTNESS
+    // DataType is int in this mode; '%' requires integral operands, so the
+    // modulus must be an integer literal rather than the double 1e5.
+    new_temperature[IDX(i,j)] = (temperature[IDX(i-1,j)] + temperature[IDX(i+1,j)] +
+      temperature[IDX(i,j-1)] + temperature[IDX(i,j+1)] + temperature[IDX(i,j)]) %
+      100000;
+#else
+    // The caller may pass iter == 0 (its load factor scales with the chare's
+    // position). Always run at least one sweep so the average below never
+    // divides by zero and writes NaN into the grid.
+    const int sweeps = (iter > 0) ? iter : 1;
+    DataType temp = 0;
+
+    for (int it = 0; it < sweeps; it++)
+      temp += (temperature[IDX(i-1,j)] + temperature[IDX(i+1,j)] +
+        temperature[IDX(i,j-1)] + temperature[IDX(i,j+1)] + temperature[IDX(i,j)]) *
+        DIVIDEBY5;
+
+    new_temperature[IDX(i,j)] = temp / sweeps;
+#endif
+  }
+}
+
+// Gather the first interior column (x = 1) into the contiguous buffer `ghost`.
+__global__ void leftPackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    ghost[j] = temperature[IDX(1,1+j)];
+  }
+}
+
+// Gather the last interior column (x = block_width) into the contiguous
+// buffer `ghost`.
+__global__ void rightPackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    ghost[j] = temperature[IDX(block_width,1+j)];
+  }
+}
+
+// Scatter the contiguous buffer `ghost` into the left halo column (x = 0).
+__global__ void leftUnpackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    temperature[IDX(0,1+j)] = ghost[j];
+  }
+}
+
+// Scatter the contiguous buffer `ghost` into the right halo column
+// (x = block_width + 1).
+__global__ void rightUnpackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  int j = blockDim.x * blockIdx.x + threadIdx.x;
+  if (j < block_height) {
+    temperature[IDX(block_width+1,1+j)] = ghost[j];
+  }
+}
+
+// Launch initKernel on `stream` with enough TILE_SIZE x TILE_SIZE thread
+// blocks to cover the array including its halo, then check for a launch error.
+void invokeInitKernel(DataType* d_temperature, int block_width, int block_height,
+    cudaStream_t stream) {
+  dim3 block_dim(TILE_SIZE, TILE_SIZE);
+  dim3 grid_dim(((block_width + 2) + (block_dim.x - 1)) / block_dim.x,
+      ((block_height + 2) + (block_dim.y - 1)) / block_dim.y);
+
+  // NOTE(review): HAPI_LAUNCH_KERNEL_WRAPPER is defined outside this file;
+  // presumably it wraps the launch for HAPI bookkeeping on `stream` - confirm.
+  HAPI_LAUNCH_KERNEL_WRAPPER((initKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, block_width, block_height)), stream)
+  // hapiLaunchKernelWrapper(initKernel, grid_dim, block_dim, 0, stream,
+  //     d_temperature, block_width, block_height);
+  hapiCheck(cudaPeekAtLastError());
+}
+
+// Launch the boundary kernel for every side flagged as a physical boundary.
+// Each launch is 1D with TILE_SIZE * TILE_SIZE threads per block and a grid
+// sized to the edge being written.
+void invokeBoundaryKernels(DataType* d_temperature, int block_width,
+    int block_height, bool left_bound, bool right_bound, bool top_bound,
+    bool bottom_bound, cudaStream_t stream) {
+  dim3 block_dim(TILE_SIZE * TILE_SIZE);
+
+  if (left_bound) {
+    dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
+    leftBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
+        block_width, block_height);
+    // hapiLaunchKernelWrapper(leftBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
+  }
+  if (right_bound) {
+    dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
+    rightBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
+        block_width, block_height);
+    // hapiLaunchKernelWrapper(rightBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
+  }
+  if (top_bound) {
+    dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x);
+    topBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
+        block_width, block_height);
+    // hapiLaunchKernelWrapper(topBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
+  }
+  if (bottom_bound) {
+    dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x);
+    bottomBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
+        block_width, block_height);
+    // hapiLaunchKernelWrapper(bottomBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
+  }
+  hapiCheck(cudaPeekAtLastError());
+}
+
+// Launch jacobiKernel on `stream` over the interior cells; `iter` is forwarded
+// as the kernel's artificial load factor.
+void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature,
+    int block_width, int block_height, int iter, cudaStream_t stream) {
+  dim3 block_dim(TILE_SIZE, TILE_SIZE);
+  dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x,
+      (block_height + (block_dim.y - 1)) / block_dim.y);
+
+  HAPI_LAUNCH_KERNEL_WRAPPER((jacobiKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_new_temperature, block_width, block_height, iter)), stream)
+  // hapiLaunchKernelWrapper(jacobiKernel, grid_dim, block_dim, 0, stream,
+  //     d_temperature, d_new_temperature, block_width, block_height, iter);
+  hapiCheck(cudaPeekAtLastError());
+}
+
+// Pack the left/right interior columns into contiguous device buffers. A side
+// that is a physical boundary has no neighbour and is skipped.
+void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost,
+    DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width,
+    int block_height, cudaStream_t stream) {
+  dim3 block_dim(TILE_SIZE * TILE_SIZE);
+  dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
+  if (!left_bound) {
+    leftPackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_left_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(leftPackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_left_ghost, block_width, block_height);
+  }
+  if (!right_bound) {
+    rightPackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_right_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(rightPackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_right_ghost, block_width, block_height);
+  }
+  hapiCheck(cudaPeekAtLastError());
+}
+
+// Unpack a received contiguous ghost buffer into the left (is_left == true)
+// or right halo column of d_temperature.
+void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, bool is_left,
+    int block_width, int block_height, cudaStream_t stream) {
+  dim3 block_dim(TILE_SIZE * TILE_SIZE);
+  dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
+  if (is_left) {
+    leftUnpackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(leftUnpackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_ghost, block_width, block_height);
+  } else {
+    rightUnpackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(rightUnpackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_ghost, block_width, block_height);
+  }
+  hapiCheck(cudaPeekAtLastError());
+}
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h
new file mode 100644
index 0000000000..56c3aa7662
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h
@@ -0,0 +1,12 @@
+#ifndef __CUDA_GPUDIRECT_JACOBI2D_H_
+#define __CUDA_GPUDIRECT_JACOBI2D_H_
+
+#ifdef TEST_CORRECTNESS
+typedef int DataType;
+#else
+typedef float DataType;
+#endif
+
+#define IDX(x,y) ((block_width+2)*(y)+(x))
+
+#endif // __CUDA_GPUDIRECT_JACOBI2D_H_
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/Makefile b/examples/charm++/cuda/gpudirect/jacobi2d/Makefile
index 72d1f7cb44..9e38264283 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/Makefile
+++ 
b/examples/charm++/cuda/gpudirect/jacobi2d/Makefile @@ -1,31 +1,37 @@ -OPTS = -O3 +OPTS = -O3 -DHAPI_CUDA_CALLBACK -CHARM_DIR = ../../../../.. +CHARM_DIR = /u/ajain18/oldCharm/multicore-linux-x86_64 CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS) CHARM_INC = -I$(CHARM_DIR)/include +CHARMC_FLAGS = -D__HIP_PLATFORM_AMD__=ON -NVCC = nvcc -NVCC_FLAGS = -c -std=c++11 -use_fast_math $(OPTS) -LD_LIBS = +HAPITOOLKIT_HOME ?= /opt/rocm +HAPICC = hipcc +HAPICC_FLAGS = -c -std=c++11 +HAPICC_INC = -I$(HAPITOOLKIT_HOME)/include +HAPICC_LIB = -L$(HAPITOOLKIT_HOME)/lib + +LD_LIBS = -module EveryLB TARGET = jacobi2d all: $(TARGET) -OBJS = $(TARGET).o $(TARGET)CUDA.o +OBJS = $(TARGET).o $(TARGET)HAPI.o $(TARGET): $(OBJS) - $(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS) - -$(TARGET).decl.h: $(TARGET).ci $(TARGET).h - $(CHARMC) $< + $(CHARMC) $(CHARMC_FLAGS) -language charm++ -o $@ $(OBJS) $(LD_LIBS) -$(TARGET).def.h: $(TARGET).ci $(TARGET).h +$(TARGET).decl.h: $(TARGET).ci + $(CHARMC) $(CHARMC_FLAGS) $< -$(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h $(TARGET).h - $(CHARMC) -c $< +$(TARGET).o: $(TARGET).C $(TARGET).decl.h + $(CHARMC) $(CHARMC_FLAGS) -c $< -$(TARGET)CUDA.o: $(TARGET).cu $(TARGET).h - $(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $< +$(TARGET)HAPI.o: $(TARGET).cu + $(HAPICC) $(CHARMC_FLAGS) -o $@ $(HAPICC_FLAGS) $(HAPICC_INC) $(CHARM_INC) $< clean: rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun + +test: all + $(call run, ./$(TARGET) +p2) diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C index 6e10917377..62a461ea27 100644 --- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C +++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C @@ -21,19 +21,21 @@ /* readonly */ bool sync_ver; /* readonly */ bool use_zerocopy; /* readonly */ bool print_elements; +/* readonly */ int lb_freq; +/* readonly */ int first_lb; extern void invokeInitKernel(DataType* d_temperature, int block_width, - int 
block_height, cudaStream_t stream); + int block_height, hapiStream_t stream); extern void invokeBoundaryKernels(DataType* d_temperature, int block_width, int block_height, bool left_bound, bool right_bound, bool top_bound, - bool bottom_bound, cudaStream_t stream); + bool bottom_bound, hapiStream_t stream); extern void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature, - int block_width, int block_height, cudaStream_t stream); + int block_width, int block_height, hapiStream_t stream); extern void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost, DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width, - int block_height, cudaStream_t stream); + int block_height, hapiStream_t stream); extern void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, - bool is_left, int block_width, int block_height, cudaStream_t stream); + bool is_left, int block_width, int block_height, hapiStream_t stream); enum Direction { LEFT = 1, RIGHT, TOP, BOTTOM }; @@ -60,6 +62,8 @@ public: print_elements = false; sync_ver = false; my_iter = 0; + first_lb = 10; + lb_freq = 100; // Initialize aggregate timers update_agg_time = 0.0; @@ -120,7 +124,7 @@ public: "Warm-up: %d, Bulk-synchronous: %d, Zerocopy: %d, Print: %d\n\n", grid_width, grid_height, block_width, block_height, n_chares_x, n_chares_y, n_iters, warmup_iters, sync_ver, use_zerocopy, print_elements); - +fflush(stdout); // Create blocks and start iteration block_proxy = CProxy_Block::ckNew(n_chares_x, n_chares_y); init_start_time = CkWallTimer(); @@ -129,7 +133,7 @@ public: void initDone() { CkPrintf("Init time: %.3lf s\n", CkWallTimer() - init_start_time); - +fflush(stdout); startIter(); } @@ -161,9 +165,11 @@ public: double total_time = CkWallTimer() - start_time; CkPrintf("Total time: %.3lf s\nAverage iteration time: %.3lf us\n", total_time, (total_time / n_iters) * 1e6); + fflush(stdout); if (sync_ver) { CkPrintf("Comm time per iteration: %.3lf us\nUpdate 
time per iteration: %.3lf us\n", (comm_agg_time / n_iters) * 1e6, (update_agg_time / n_iters) * 1e6); +fflush(stdout); } if (print_elements) { @@ -185,6 +191,7 @@ class Block : public CBase_Block { public: int my_iter; int neighbors; + int send_done_idx; int remote_count; int x, y; @@ -203,42 +210,102 @@ class Block : public CBase_Block { DataType* __restrict__ d_send_bottom_ghost; DataType* __restrict__ d_recv_left_ghost; DataType* __restrict__ d_recv_right_ghost; + DataType* __restrict__ d_recv_top_ghost; + DataType* __restrict__ d_recv_bottom_ghost; - cudaStream_t compute_stream; - cudaStream_t comm_stream; + hapiStream_t compute_stream; + hapiStream_t comm_stream; - cudaEvent_t compute_event; - cudaEvent_t comm_event; + hapiEvent_t compute_event; + hapiEvent_t comm_event; bool left_bound, right_bound, top_bound, bottom_bound; - Block() {} + Block() { + usesAtSync = true; + ckout<<"["<>>(d_temperature, block_width, block_height); - hapiCheck(cudaPeekAtLastError()); + // hapiLaunchKernelWrapper(initKernel, grid_dim, block_dim, 0, stream, + // d_temperature, block_width, block_height); + // hapiCheck(hapiPeekAtLastError()); } void invokeBoundaryKernels(DataType* d_temperature, int block_width, int block_height, bool left_bound, bool right_bound, bool top_bound, - bool bottom_bound, cudaStream_t stream) { + bool bottom_bound, hapiStream_t stream) { dim3 block_dim(TILE_SIZE * TILE_SIZE); if (left_bound) { dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x); leftBoundaryKernel<<>>(d_temperature, - block_width, block_height); + block_width, block_height); + // hapiLaunchKernelWrapper(leftBoundaryKernel, grid_dim, block_dim, 0, stream, + // d_temperature, block_width, block_height); } if (right_bound) { dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x); rightBoundaryKernel<<>>(d_temperature, - block_width, block_height); + block_width, block_height); + // hapiLaunchKernelWrapper(rightBoundaryKernel, grid_dim, block_dim, 0, stream, + // 
d_temperature, block_width, block_height); } if (top_bound) { dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x); topBoundaryKernel<<>>(d_temperature, - block_width, block_height); + block_width, block_height); + // hapiLaunchKernelWrapper(topBoundaryKernel, grid_dim, block_dim, 0, stream, + // d_temperature, block_width, block_height); } if (bottom_bound) { dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x); bottomBoundaryKernel<<>>(d_temperature, - block_width, block_height); + block_width, block_height); + // hapiLaunchKernelWrapper(bottomBoundaryKernel, grid_dim, block_dim, 0, stream, + // d_temperature, block_width, block_height); } - hapiCheck(cudaPeekAtLastError()); + // hapiCheck(hapiPeekAtLastError()); } void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature, - int block_width, int block_height, cudaStream_t stream) { + int block_width, int block_height, hapiStream_t stream) { dim3 block_dim(TILE_SIZE, TILE_SIZE); dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x, (block_height + (block_dim.y - 1)) / block_dim.y); jacobiKernel<<>>(d_temperature, d_new_temperature, block_width, block_height); - hapiCheck(cudaPeekAtLastError()); + // hapiLaunchKernelWrapper(jacobiKernel, grid_dim, block_dim, 0, stream, + // d_temperature, d_new_temperature, block_width, block_height); + // hapiCheck(hapiPeekAtLastError()); } void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost, DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width, - int block_height, cudaStream_t stream) { + int block_height, hapiStream_t stream) { dim3 block_dim(TILE_SIZE * TILE_SIZE); dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x); if (!left_bound) { leftPackingKernel<<>>(d_temperature, d_left_ghost, block_width, block_height); + // hapiLaunchKernelWrapper(leftPackingKernel, grid_dim, block_dim, 0, stream, + // d_temperature, d_left_ghost, block_width, block_height); } if (!right_bound) { 
rightPackingKernel<<>>(d_temperature, d_right_ghost, block_width, block_height); + // hapiLaunchKernelWrapper(rightPackingKernel, grid_dim, block_dim, 0, stream, + // d_temperature, d_right_ghost, block_width, block_height); } - hapiCheck(cudaPeekAtLastError()); + // hapiCheck(hapiPeekAtLastError()); } void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, bool is_left, - int block_width, int block_height, cudaStream_t stream) { + int block_width, int block_height, hapiStream_t stream) { dim3 block_dim(TILE_SIZE * TILE_SIZE); dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x); if (is_left) { leftUnpackingKernel<<>>(d_temperature, d_ghost, block_width, block_height); + // hapiLaunchKernelWrapper(leftUnpackingKernel, grid_dim, block_dim, 0, stream, + // d_temperature, d_ghost, block_width, block_height); } else { rightUnpackingKernel<<>>(d_temperature, d_ghost, block_width, block_height); + // hapiLaunchKernelWrapper(rightUnpackingKernel, grid_dim, block_dim, 0, stream, + // d_temperature, d_ghost, block_width, block_height); } - hapiCheck(cudaPeekAtLastError()); + // hapiCheck(hapiPeekAtLastError()); } diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h index ed628a4f59..56c3aa7662 100644 --- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h +++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h @@ -4,7 +4,7 @@ #ifdef TEST_CORRECTNESS typedef int DataType; #else -typedef double DataType; +typedef float DataType; #endif #define IDX(x,y) ((block_width+2)*(y)+(x)) diff --git a/examples/charm++/cuda/gpudirect/sdag/sdag.C b/examples/charm++/cuda/gpudirect/sdag/sdag.C index c0da070c30..d4093f897c 100644 --- a/examples/charm++/cuda/gpudirect/sdag/sdag.C +++ b/examples/charm++/cuda/gpudirect/sdag/sdag.C @@ -9,7 +9,7 @@ /* readonly */ int block_size; /* readonly */ int n_iters; -extern void invokeInitKernel(double*, int, double, cudaStream_t); +extern void 
invokeInitKernel(double*, int, double, hapiStream_t); class Main : public CBase_Main { double start_time; @@ -71,19 +71,19 @@ class Block : public CBase_Block { int* reg_local_data; int* reg_remote_data; - cudaStream_t stream; + hapiStream_t stream; Block() {} ~Block() { - // Free memory and destroy CUDA stream - hapiCheck(cudaFreeHost(h_local_data)); - hapiCheck(cudaFreeHost(h_remote_data)); - hapiCheck(cudaFree(d_local_data)); - hapiCheck(cudaFree(d_remote_data)); + // Free memory and destroy hapi stream + hapiCheck(hapiFreeHost(h_local_data)); + hapiCheck(hapiFreeHost(h_remote_data)); + hapiCheck(hapiFree(d_local_data)); + hapiCheck(hapiFree(d_remote_data)); free(reg_local_data); free(reg_remote_data); - cudaStreamDestroy(stream); + hapiStreamDestroy(stream); } void init() { @@ -93,14 +93,14 @@ class Block : public CBase_Block { peer = (thisIndex < CkNumPes() / 2) ? (thisIndex + CkNumPes() / 2) : (thisIndex - CkNumPes() / 2); - // Allocate memory and create CUDA stream - hapiCheck(cudaMallocHost(&h_local_data, sizeof(double) * block_size)); - hapiCheck(cudaMallocHost(&h_remote_data, sizeof(double) * block_size)); - hapiCheck(cudaMalloc(&d_local_data, sizeof(double) * block_size)); - hapiCheck(cudaMalloc(&d_remote_data, sizeof(double) * block_size)); + // Allocate memory and create hapi stream + hapiCheck(hapiMallocHost(&h_local_data, sizeof(double) * block_size)); + hapiCheck(hapiMallocHost(&h_remote_data, sizeof(double) * block_size)); + hapiCheck(hapiMalloc(&d_local_data, sizeof(double) * block_size)); + hapiCheck(hapiMalloc(&d_remote_data, sizeof(double) * block_size)); reg_local_data = (int*)malloc(sizeof(int) * block_size); reg_remote_data = (int*)malloc(sizeof(int) * block_size); - cudaStreamCreate(&stream); + hapiStreamCreate(&stream); // Initialize data invokeInitKernel(d_local_data, block_size, (double)thisIndex, stream); @@ -115,9 +115,9 @@ class Block : public CBase_Block { void receive(int ref, int &size1, double *&arr1, int size2, int *arr2, 
CkDeviceBufferPost *devicePost) { // Inform the runtime where the incoming data should be stored - // and which CUDA stream should be used for the transfer + // and which hapi stream should be used for the transfer arr1 = d_remote_data; - devicePost[0].cuda_stream = stream; + devicePost[0].hapi_stream = stream; // Last array should be available here as it is not RDMA // Copy it over for validation @@ -127,9 +127,9 @@ class Block : public CBase_Block { void validateData() { // Move the data to the host for validation - hapiCheck(cudaMemcpyAsync(h_remote_data, d_remote_data, - sizeof(double) * block_size, cudaMemcpyDeviceToHost, stream)); - hapiCheck(cudaStreamSynchronize(stream)); + hapiCheck(hapiMemcpyAsync(h_remote_data, d_remote_data, + sizeof(double) * block_size, hapiMemcpyDeviceToHost, stream)); + hapiCheck(hapiStreamSynchronize(stream)); // Validate data bool validated = true; diff --git a/examples/charm++/cuda/gpudirect/sdag/sdag.cu b/examples/charm++/cuda/gpudirect/sdag/sdag.cu index 43740102eb..16143dbe0e 100644 --- a/examples/charm++/cuda/gpudirect/sdag/sdag.cu +++ b/examples/charm++/cuda/gpudirect/sdag/sdag.cu @@ -10,11 +10,11 @@ __global__ void initKernel(double* data, int count, double val) { } } -void invokeInitKernel(double* data, int count, double val, cudaStream_t stream) { +void invokeInitKernel(double* data, int count, double val, hapiStream_t stream) { dim3 block_dim(TB_SIZE); dim3 grid_dim((count + block_dim.x - 1) / block_dim.x); initKernel<<>>(data, count, val); - hapiCheck(cudaPeekAtLastError()); + hapiCheck(hapiPeekAtLastError()); } diff --git a/examples/charm++/cuda/gpudirect/verify/Makefile b/examples/charm++/cuda/gpudirect/verify/Makefile index 2ea467f546..d323fe387f 100644 --- a/examples/charm++/cuda/gpudirect/verify/Makefile +++ b/examples/charm++/cuda/gpudirect/verify/Makefile @@ -1,11 +1,16 @@ OPTS = -O0 -g -CHARM_DIR = ../../../../.. 
+CHARM_DIR = /u/ajain18/oldCharm/multicore-linux-x86_64 CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS) CHARM_INC = -I$(CHARM_DIR)/include -NVCC = nvcc -NVCC_FLAGS = -O3 -c -std=c++11 -use_fast_math +AMD_FLAGS = -D__HIP_PLATFORM_AMD__=ON + +HAPICC = hipcc +HAPITOOLKIT_HOME = /opt/rocm +HAPICC_INC = -I$(HAPITOOLKIT_HOME)/include +HAPICC_LIB = -L$(HAPITOOLKIT_HOME)/lib +HAPI_FLAGS = -O3 -g -c -std=c++11 -use_fast_math LD_LIBS = TARGET = verify @@ -14,18 +19,18 @@ all: $(TARGET) OBJS = $(TARGET).o $(TARGET)CUDA.o $(TARGET): $(OBJS) - $(CHARMC) -language charm++ -module CommonLBs -o $@ $(OBJS) $(LD_LIBS) + $(CHARMC) $(AMD_FLAGS) -language charm++ -module CommonLBs -o $@ $(OBJS) $(LD_LIBS) $(TARGET).decl.h: $(TARGET).ci - $(CHARMC) $< + $(CHARMC) $(AMD_FLAGS) $< $(TARGET).def.h: $(TARGET).ci $(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h - $(CHARMC) -c $< + $(CHARMC) $(AMD_FLAGS) -c $< $(TARGET)CUDA.o: $(TARGET).cu - $(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $< + $(HAPICC) $(AMD_FLAGS) -o $@ $(HAPI_FLAGS) $(HAPICC_INC) $(CHARM_INC) $(HAPICC_LIB) $< clean: rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun diff --git a/examples/charm++/cuda/gpudirect/verify/charmrun_hapi b/examples/charm++/cuda/gpudirect/verify/charmrun_hapi new file mode 100755 index 0000000000..2064a05b7c --- /dev/null +++ b/examples/charm++/cuda/gpudirect/verify/charmrun_hapi @@ -0,0 +1,292 @@ +#!/bin/bash + +is_restart=false +original_args=("$@") +pes_file="/dev/shm/numRestartProcs.txt" +original_nodelist_file="/tmp/hapi_original_nodelist.txt" + +# --- Pre-parse to find the nodelist for daemon startup --- +machinefile="" +for ((i=0; i<${#original_args[@]}; ++i)); do + if [[ "${original_args[i]}" == "++nodelist" ]]; then + machinefile="${original_args[i+1]}" + break + fi +done + +num_nodes=0 +if [[ -n "$machinefile" ]]; then + if [[ ! 
-f "$machinefile" ]]; then + echo "Charmrun> Error: nodelist file not found: $machinefile" >&2 + exit 1 + fi + num_nodes=$(wc -l < "$machinefile") +else + echo "Charmrun> Warning: ++nodelist not found. Assuming 1 node for HAPI daemon." + num_nodes=1 +fi + +# --- Clean up and start the memory daemon in the background --- +# Read IP addresses and slots from nodelist file (format: ipaddress slots=X) +declare -A node_slots +node_ips=() +if [[ -n "$machinefile" ]]; then + while IFS= read -r line; do + # Extract IP address and slots count + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + node_ips+=("$ip") + # Default to 1 slot if not specified + node_slots["$ip"]=${slots:-1} + fi + done < "$machinefile" +else + # Default to localhost if no nodelist + node_ips=("localhost") + node_slots["localhost"]=1 +fi + +# Save the original nodelist for restart comparison (only on first run) +if [[ ! -f "$original_nodelist_file" ]]; then + if [[ -n "$machinefile" ]]; then + cp "$machinefile" "$original_nodelist_file" + echo "Charmrun> Saved original nodelist to $original_nodelist_file" + else + echo "localhost slots=1" > "$original_nodelist_file" + echo "Charmrun> Created default nodelist file at $original_nodelist_file" + fi +fi + +# Function to get nodes from a nodelist file +get_nodes_from_file() { + local file="$1" + local -A nodes_map + local -a nodes_list + + if [[ -f "$file" && -s "$file" ]]; then + while IFS= read -r line; do + # Skip empty lines and comments + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + nodes_map["$ip"]=${slots:-1} + nodes_list+=("$ip") + fi + done < "$file" + fi + + # Return both the list and the associative array (global variables) + eval "original_nodes=(${nodes_list[*]})" + for ip in "${nodes_list[@]}"; do + eval 
"original_node_slots[\"$ip\"]=${nodes_map["$ip"]}" + done +} + +# Function to find new nodes by comparing current nodelist with original (optimized with hashmap) +find_new_nodes() { + declare -A original_node_slots + declare -a original_nodes + + # Get original nodes + get_nodes_from_file "$original_nodelist_file" + + declare -A original_nodes_map + for orig_ip in "${original_nodes[@]}"; do + original_nodes_map["$orig_ip"]=1 + done + + local -a new_nodes + local -A new_node_slots + + # Compare current nodes with original nodes using hashmap lookup + for ip in "${node_ips[@]}"; do + # Check if node exists in original nodes hashmap (O(1) lookup) + if [[ -z "${original_nodes_map[$ip]}" ]]; then + echo "Charmrun> New node detected: $ip" + new_nodes+=("$ip") + new_node_slots["$ip"]=${node_slots["$ip"]} + fi + done + + # Return new nodes (use global variables) + eval "detected_new_nodes=(${new_nodes[*]})" + for ip in "${new_nodes[@]}"; do + eval "detected_new_node_slots[\"$ip\"]=${new_node_slots["$ip"]}" + done +} + +# Clean up on all nodes via SSH (async) +cleanup_pids=() +echo "Charmrun> Initial cleanup on ${#node_ips[@]} node(s): ${node_ips[*]}" +for ip in "${node_ips[@]}"; do + slots=${node_slots["$ip"]} + fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_pipe_* /tmp/client_pipe_* /tmp/daemon_ready_*" + for ((slot=0; slot Starting memory daemons on all nodes..." +for ip in "${node_ips[@]}"; do + slots=${node_slots["$ip"]} + echo "Charmrun> Starting $slots daemon(s) on node $ip" + for ((slot=0; slot /dev/null 2>&1 &" & + daemon_pids+=($!) 
+ done +done + +# Optional: Wait a brief moment for SSH connections to establish (non-blocking) +# sleep 1 + +# --- Main execution loop --- +while true; do + # Reset and parse arguments for each run + args=() + pes_arg="" + restart_arg="" + + temp_args=("${original_args[@]}") + i=0 + while [ $i -lt ${#temp_args[@]} ]; do + arg="${temp_args[$i]}" + case "$arg" in + +p|++p) + i=$((i+1)) + pes_arg="$arg ${temp_args[$i]}" + ;; + +p[0-9]*) + pes_arg="$arg" + ;; + ++p[0-9]*) + pes_arg="$arg" + ;; + *) + args+=("$arg") + ;; + esac + i=$((i+1)) + done + + # Check the flag. If it's a restart, prepare the extra arguments. + if [ "$is_restart" = true ]; then + restart_arg="+shrinkexpand +restart /dev/shm" + if [ -f "$pes_file" ]; then + num_pes=$(cat "$pes_file") + pes_arg="+p $num_pes" + fi + + echo "Charmrun> Restart detected - checking for new nodes..." + + # Re-read current nodelist to check for new nodes + declare -A current_node_slots + current_node_ips=() + if [[ -n "$machinefile" ]]; then + while IFS= read -r line; do + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + current_node_ips+=("$ip") + current_node_slots["$ip"]=${slots:-1} + fi + done < "$machinefile" + else + current_node_ips=("localhost") + current_node_slots["localhost"]=1 + fi + + # Update global variables with current state + node_ips=("${current_node_ips[@]}") + for ip in "${current_node_ips[@]}"; do + node_slots["$ip"]=${current_node_slots["$ip"]} + done + + # Find new nodes + declare -a detected_new_nodes + declare -A detected_new_node_slots + find_new_nodes + + if [[ ${#detected_new_nodes[@]} -gt 0 ]]; then + echo "Charmrun> Found ${#detected_new_nodes[@]} new node(s): ${detected_new_nodes[*]}" + + # Clean up new nodes + echo "Charmrun> Cleaning up new nodes..." 
+ cleanup_pids=() + for ip in "${detected_new_nodes[@]}"; do + echo "Charmrun> Cleaning up node: $ip" + slots=${detected_new_node_slots["$ip"]} + fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_fifo_* /tmp/client_fifo_* /tmp/daemon_ready_*" + for ((slot=0; slot Starting memory daemons on new nodes..." + daemon_pids=() + for ip in "${detected_new_nodes[@]}"; do + slots=${detected_new_node_slots["$ip"]} + echo "Charmrun> Starting $slots daemon(s) on new node $ip" + for ((slot=0; slot /dev/null 2>&1 &" & + daemon_pids+=($!) + done + done + + # Update the original nodelist to include new nodes for future restarts + if [[ -n "$machinefile" ]]; then + cp "$machinefile" "$original_nodelist_file" + echo "Charmrun> Updated original nodelist with new nodes" + fi + + echo "Charmrun> New nodes setup completed" + else + echo "Charmrun> No new nodes detected" + fi + fi + + # Pass all script arguments to the executable + "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg + + EXIT_CODE=$? + + if [ "$EXIT_CODE" -eq 100 ]; then + is_restart=true + echo "Restart signal (code 100) received. Looping again." + echo "----------------------------------------" + else + echo "Final exit signal (code $EXIT_CODE) received. Exiting loop." + # Clean up the background daemon processes on all nodes + for ip in "${node_ips[@]}"; do + ssh "$ip" "pkill -f hapi_memory_daemon" & + done + # Also kill any remaining SSH connection PIDs + for pid in "${daemon_pids[@]}"; do + kill "$pid" 2>/dev/null + done + # Clean up temporary files + rm -f "$original_nodelist_file" + echo "Charmrun> Cleaned up temporary nodelist file" + break + fi +done + +echo "Control loop finished." 
\ No newline at end of file diff --git a/examples/charm++/cuda/gpudirect/verify/helper.sh b/examples/charm++/cuda/gpudirect/verify/helper.sh new file mode 100755 index 0000000000..6eef9fb132 --- /dev/null +++ b/examples/charm++/cuda/gpudirect/verify/helper.sh @@ -0,0 +1,6 @@ +make clean +make verify +# srun --mpi=cray_shasta -n 2 ./verify +gpushm +gpuipceventpool 256 +allgpus +gpucommbuffer 256 +ppn 2 +# ./charmrun ++local ++p 2 ./verify +gpushm +gpuipceventpool 512 +allgpus +gpucommbuffer 128 +# srun -n 2 ./verify +ppn 2 +# srun -n 4 ./verify diff --git a/examples/charm++/cuda/gpudirect/verify/verbose b/examples/charm++/cuda/gpudirect/verify/verbose new file mode 100644 index 0000000000..77bdcd5018 --- /dev/null +++ b/examples/charm++/cuda/gpudirect/verify/verbose @@ -0,0 +1 @@ +[2026-04-14T10:22:34.557] error: *** STEP 17600030.21 ON gpub023 CANCELLED AT 2026-04-14T10:22:34 DUE to SIGNAL Killed *** diff --git a/examples/charm++/cuda/gpudirect/verify/verify.C b/examples/charm++/cuda/gpudirect/verify/verify.C index 9ab66369d8..260c13b03e 100644 --- a/examples/charm++/cuda/gpudirect/verify/verify.C +++ b/examples/charm++/cuda/gpudirect/verify/verify.C @@ -10,34 +10,35 @@ /* readonly */ CProxy_VerifyNodeGroup nodegroup_proxy; /* readonly */ int block_size; /* readonly */ int n_iters; +/* readonly */ int n_warpup_iters; /* readonly */ bool lb_test; -extern void invokeInitKernel(double*, int, double, cudaStream_t); +extern void invokeInitKernel(double*, int, double, hapiStream_t); struct Container { double* h_local_data; double* h_remote_data; double* d_local_data; double* d_remote_data; - cudaStream_t stream; + hapiStream_t stream; Container() : h_local_data(nullptr), h_remote_data(nullptr), d_local_data(nullptr), d_remote_data(nullptr) {} ~Container() { - hapiCheck(cudaFreeHost(h_local_data)); - hapiCheck(cudaFreeHost(h_remote_data)); - hapiCheck(cudaFree(d_local_data)); - hapiCheck(cudaFree(d_remote_data)); - hapiCheck(cudaStreamDestroy(stream)); + 
hapiCheck(hapiFreeHost(h_local_data)); + hapiCheck(hapiFreeHost(h_remote_data)); + hapiCheck(hapiFree(d_local_data)); + hapiCheck(hapiFree(d_remote_data)); + hapiCheck(hapiStreamDestroy(stream)); } void init(double val) { - hapiCheck(cudaMallocHost(&h_local_data, sizeof(double) * block_size)); - hapiCheck(cudaMallocHost(&h_remote_data, sizeof(double) * block_size)); - hapiCheck(cudaMalloc(&d_local_data, sizeof(double) * block_size)); - hapiCheck(cudaMalloc(&d_remote_data, sizeof(double) * block_size)); - hapiCheck(cudaStreamCreate(&stream)); + hapiCheck(hapiMallocHost(&h_local_data, sizeof(double) * block_size)); + hapiCheck(hapiMallocHost(&h_remote_data, sizeof(double) * block_size)); + hapiCheck(hapiMalloc(&d_local_data, sizeof(double) * block_size)); + hapiCheck(hapiMalloc(&d_remote_data, sizeof(double) * block_size)); + hapiCheck(hapiStreamCreate(&stream)); for (int i = 0; i < block_size; i++) { h_local_data[i] = val; @@ -45,13 +46,13 @@ struct Container { invokeInitKernel(d_local_data, block_size, val, stream); invokeInitKernel(d_remote_data, block_size, val, stream); - hapiCheck(cudaStreamSynchronize(stream)); + hapiCheck(hapiStreamSynchronize(stream)); } void verify(double val) { - hapiCheck(cudaMemcpyAsync(h_remote_data, d_remote_data, - sizeof(double) * block_size, cudaMemcpyDeviceToHost, stream)); - hapiCheck(cudaStreamSynchronize(stream)); + hapiCheck(hapiMemcpyAsync(h_remote_data, d_remote_data, + sizeof(double) * block_size, hapiMemcpyDeviceToHost, stream)); + hapiCheck(hapiStreamSynchronize(stream)); for (int i = 0; i < block_size; i++) { if (fabs(h_remote_data[i] - val) > ERROR_TOLERANCE) { @@ -59,6 +60,8 @@ struct Container { i, val, h_remote_data[i]); } } + + CmiPrintf("Data verified, looks OK!\n"); } }; @@ -69,15 +72,16 @@ class Main : public CBase_Main { public: Main(CkArgMsg* m) { main_proxy = thisProxy; - block_size = 128; - n_iters = 100; + block_size = 1024 * 128; + n_iters = 150; + n_warpup_iters = 3; test_nodegroup = true; lb_test = false; 
// Check if there are 2 PEs - if (CkNumPes() != 2) { - CkAbort("Should be run with 2 PEs"); - } + // if (CkNumPes() != 2) { + // CkAbort("Should be run with 2 PEs"); + // } // Don't do nodegroup test if run with 1 process if (CmiNumNodes() == 1) { @@ -104,7 +108,7 @@ public: delete m; // Print info - CkPrintf("[CUDA Zerocopy Verification Test]\n" + CkPrintf("[hapi Zerocopy Verification Test]\n" "Block size: %d, Iters: %d, Nodegroup: %s, LB test: %s\n", block_size, n_iters, test_nodegroup ? "true" : "false", lb_test ? "true" : "false"); @@ -119,32 +123,44 @@ public: } void test() { + // warm up + for (int i = 0; i < n_warpup_iters; i++) { + array_proxy[0].send(); + // CkWaitQD(); + printf("[ITER] %d DONE!", i); + fflush(stdout); + } start_time = CkWallTimer(); - - CkPrintf("Testing chare array... "); + + CkPrintf("Testing chare array... \n"); for (int i = 0; i < n_iters; i++) { array_proxy[0].send(); - CkWaitQD(); + printf("[ITER] %d DONE!", i); + fflush(stdout); } + CkWaitQD(); CkPrintf("PASS\n"); - CkPrintf("Testing chare group... "); - for (int i = 0; i < n_iters; i++) { - group_proxy[0].send(); - CkWaitQD(); - } - CkPrintf("PASS\n"); + // CkPrintf("Testing chare group... \n"); + // for (int i = 0; i < n_iters; i++) { + // group_proxy[0].send(); + // } + // CkWaitQD(); + // CkPrintf("PASS\n"); - if (test_nodegroup) { - CkPrintf("Testing chare nodegroup... "); - for (int i = 0; i < n_iters; i++) { - nodegroup_proxy[0].send(); - CkWaitQD(); - } - CkPrintf("PASS\n"); - } + // if (test_nodegroup) { + // CkPrintf("Testing chare nodegroup... 
\n"); + // for (int i = 0; i < n_iters; i++) { + // nodegroup_proxy[0].send(); + // } + // CkWaitQD(); + // CkPrintf("PASS\n"); + // } + + // sleep(3); CkPrintf("Elapsed: %.6lf s\n", CkWallTimer() - start_time); + fflush(stdout); CkExit(); } }; @@ -168,7 +184,7 @@ public: } void send() { - thisProxy[1].recv(block_size, CkDeviceBuffer(container.d_local_data, + thisProxy[2].recv(block_size, CkDeviceBuffer(container.d_local_data, CkCallback(CkIndex_VerifyArray::reuse(), thisProxy[thisIndex]), container.stream)); if (lb_test) { @@ -179,7 +195,7 @@ public: void recv(int& size, double*& data, CkDeviceBufferPost* post) { data = container.d_remote_data; - post[0].cuda_stream = container.stream; + post[0].hapi_stream = container.stream; } void recv(int size, double* data) { @@ -188,9 +204,14 @@ public: pe = CkMyPe(); AtSync(); } + printf("[VERIFY] data received on PE: %d, Process: %d\n", CmiMyPe(), CmiMyNode()); + fflush(stdout); } - void reuse() {} + void reuse() { + printf("[VERIFY] source callback called on PE: %d, Process: %d\n", CmiMyPe(), CmiMyNode()); + fflush(stdout); + } void ResumeFromSync() {} }; @@ -204,12 +225,12 @@ public: } void send() { - thisProxy[1].recv(block_size, CkDeviceBuffer(container.d_local_data, container.stream)); + thisProxy[2].recv(block_size, CkDeviceBuffer(container.d_local_data, container.stream)); } void recv(int& size, double*& data, CkDeviceBufferPost* post) { data = container.d_remote_data; - post[0].cuda_stream = container.stream; + post[0].hapi_stream = container.stream; } void recv(int size, double* data) { @@ -231,7 +252,7 @@ public: void recv(int& size, double*& data, CkDeviceBufferPost* post) { data = container.d_remote_data; - post[0].cuda_stream = container.stream; + post[0].hapi_stream = container.stream; } void recv(int size, double* data) { diff --git a/examples/charm++/cuda/gpudirect/verify/verify.cu b/examples/charm++/cuda/gpudirect/verify/verify.cu index 987c5aed26..6b2819f7e8 100644 --- 
a/examples/charm++/cuda/gpudirect/verify/verify.cu +++ b/examples/charm++/cuda/gpudirect/verify/verify.cu @@ -10,11 +10,11 @@ __global__ void initKernel(double* data, int count, double val) { } } -void invokeInitKernel(double* data, int count, double val, cudaStream_t stream) { +void invokeInitKernel(double* data, int count, double val, hapiStream_t stream) { dim3 block_dim(BLOCK_SIZE); dim3 grid_dim((count + block_dim.x - 1) / block_dim.x); initKernel<<>>(data, count, val); - hapiCheck(cudaPeekAtLastError()); + // hapiCheck(hapiPeekAtLastError()); } diff --git a/examples/charm++/cuda/stencil2d/Makefile b/examples/charm++/cuda/stencil2d/Makefile index 6bde15412c..afa7284a61 100644 --- a/examples/charm++/cuda/stencil2d/Makefile +++ b/examples/charm++/cuda/stencil2d/Makefile @@ -1,11 +1,11 @@ -include ../../../../common.mk CHARMC = ../../../../bin/charmc $(OPTS) -OPTS = -O3 -fopt-info-vec-optimized #-DUSE_NVTX +OPTS = -O3 -fopt-info-vec-optimized -g #-DHAPI_CUDA_CALLBACK #-DUSE_NVTX # set CUDATOOLKIT_HOME to the CUDA toolkit directory CUDATOOLKIT_HOME ?= /usr/local/cuda NVCC = $(CUDATOOLKIT_HOME)/bin/nvcc -NVCC_FLAGS = -O3 -c -std=c++11 -DTILE_SIZE=16 -use_fast_math +NVCC_FLAGS = -O3 -c -std=c++11 -DTILE_SIZE=16 -use_fast_math -g NVCC_INC = -I$(CUDATOOLKIT_HOME)/include CHARM_INC = -I../../../../include LD_LIBS = #-lnvToolsExt @@ -16,7 +16,7 @@ all: $(TARGET) OBJS = $(TARGET).o $(TARGET)CUDA.o $(TARGET): $(OBJS) - $(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS) + $(CHARMC) -language charm++ -module EveryLB -o $@ $(OBJS) $(LD_LIBS) $(TARGET).decl.h: $(TARGET).ci $(CHARMC) $< diff --git a/examples/charm++/cuda/stencil2d/stencil2d.C b/examples/charm++/cuda/stencil2d/stencil2d.C index 7e1cbce385..bcd061206c 100644 --- a/examples/charm++/cuda/stencil2d/stencil2d.C +++ b/examples/charm++/cuda/stencil2d/stencil2d.C @@ -15,7 +15,7 @@ #define BOTTOM 4 #define DIVIDEBY5 0.2 -#define USE_CUSTOM_MAP 1 // Should be set to 1 to use GPU handler PEs +#define USE_CUSTOM_MAP 0 // 
Should be set to 1 to use GPU handler PEs /* readonly */ CProxy_Main mainProxy; /* readonly */ int grid_x; @@ -31,8 +31,8 @@ /* readonly */ bool gpu_prio; /* readonly */ int gpu_pes; -extern void invokeKernel(cudaStream_t stream, double* d_temperature, - double* d_new_temperature, int block_x, int block_y, +extern void invokeKernel(cudaStream_t stream, float* d_temperature, + float* d_new_temperature, int block_x, int block_y, int thread_size); // Calculate the number of digits. @@ -85,15 +85,9 @@ class CustomMap : public CkArrayMap { } }; -// Used to specify LIFO ordering on callbacks. -class CallbackMsg : public CMessage_CallbackMsg { - public: - CallbackMsg() {} -}; - class Main : public CBase_Main { - double init_start_time; - double start_time; + float init_start_time; + float start_time; public: CProxy_Stencil stencils; @@ -214,6 +208,21 @@ class Main : public CBase_Main { stencils.init(); } + void pup(PUP::er& p) { + p | grid_x; + p | grid_y; + p | block_x; + p | block_y; + p | num_iters; + p | global_exec_mode; + p | thread_size; + p | offload_ratio; + p | gpu_prio; + p | gpu_pes; + p | stencils; + p | init_start_time; + } + void initDone() { #ifdef USE_NVTX NVTXTracer nvtx_range("Main::initDone", NVTXColor::Emerald); @@ -225,16 +234,23 @@ class Main : public CBase_Main { start_time = CkWallTimer(); // Start stencil iterations - CallbackMsg* m = new CallbackMsg(); - stencils.iterate(m); + stencils.iterate(); } - void done(double time) { + void done(float *times, int size) { #ifdef USE_NVTX NVTXTracer nvtx_range("Main::done", NVTXColor::Emerald); #endif - CkPrintf("\nAverage time per iteration: %lf\n", - time / ((num_chares_x * num_chares_y) * num_iters)); + if (size != 2) { + CkAbort("Received reduction of incorrect size!"); + } + float agg_time = times[0]; + float gpu_time = times[1]; + CkPrintf("Total times are: %lf CPU time, %lf GPU time\n", + agg_time, gpu_time); + CkPrintf("\nAverage time per iteration: %lf CPU time, %lf GPU time\n", + agg_time / 
((num_chares_x * num_chares_y) * num_iters), + gpu_time / ((num_chares_x * num_chares_y) * num_iters)); CkPrintf("Finished due to max iterations %d, total time %lf seconds\n", num_iters, CkWallTimer() - start_time); CkExit(); @@ -250,14 +266,14 @@ class Stencil : public CBase_Stencil { int neighbors; int remote_count; - double* __restrict__ temperature; - double* __restrict__ new_temperature; - double* __restrict__ d_temperature; - double* __restrict__ d_new_temperature; - double* __restrict__ left_ghost; - double* __restrict__ right_ghost; - double* __restrict__ bottom_ghost; - double* __restrict__ top_ghost; + float* temperature; + float* new_temperature; + float* d_temperature; + float* d_new_temperature; + float* left_ghost; + float* right_ghost; + float* bottom_ghost; + float* top_ghost; cudaStream_t stream; @@ -265,20 +281,36 @@ class Stencil : public CBase_Stencil { int local_exec_mode; bool left_bound, right_bound, top_bound, bottom_bound; - double iter_start_time; - double agg_time; + float iter_start_time; + float agg_time; - Stencil() {} + Stencil() { + usesAtSync = true; + } + + Stencil(CkMigrateMessage* msg) : CBase_Stencil(msg) { + cudaStreamCreate(&stream); + hapiCheck( + hapiMallocHost((void**)&temperature, + sizeof(float) * (block_x + 2) * (block_y + 2))); + //hapiCheck( + // hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y)); + //hapiCheck( + // hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y)); + //hapiCheck( + // hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x)); + //hapiCheck(hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x)); + } ~Stencil() { if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { - hapiCheck(cudaFreeHost(temperature)); - hapiCheck(cudaFree(d_temperature)); - hapiCheck(cudaFree(d_new_temperature)); - hapiCheck(cudaFreeHost(left_ghost)); - hapiCheck(cudaFreeHost(right_ghost)); - hapiCheck(cudaFreeHost(top_ghost)); - hapiCheck(cudaFreeHost(bottom_ghost)); + 
hapiCheck(hapiFreeHost(temperature)); + hapiCheck(hapiFree(d_temperature)); + hapiCheck(hapiFree(d_new_temperature)); + hapiCheck(hapiFreeHost(left_ghost)); + hapiCheck(hapiFreeHost(right_ghost)); + hapiCheck(hapiFreeHost(top_ghost)); + hapiCheck(hapiFreeHost(bottom_ghost)); cudaStreamDestroy(stream); } else { // CPU_MODE @@ -291,6 +323,40 @@ class Stencil : public CBase_Stencil { } } + void pup(PUP::er& p) { + p | n_digits; + p | my_iter; + p | neighbors; + p | remote_count; + p | iter_start_time; + p | agg_time; + p | thisFlatIndex; + p | left_bound; + p | right_bound; + p | top_bound; + p | bottom_bound; + p | local_exec_mode; + + if (p.isUnpacking()) { + //hapiMallocHost((void**)&temperature, sizeof(float) * (block_x + 2) * (block_y + 2)); + hapiMalloc((void**)&d_temperature, sizeof(float) * (block_x + 2) * (block_y + 2)); + hapiMalloc((void**)&d_new_temperature, sizeof(float) * (block_x + 2) * (block_y + 2)); + hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y); + hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y); + hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x); + hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x); + } + + //p(temperature, (block_x + 2) * (block_y + 2)); + //p(new_temperature, (block_x + 2) * (block_y + 2)); + p(d_temperature, (block_x + 2) * (block_y + 2), PUP::PUPMode::DEVICE); + p(d_new_temperature, (block_x + 2) * (block_y + 2), PUP::PUPMode::DEVICE); + p(left_ghost, block_y); + p(right_ghost, block_y); + p(bottom_ghost, block_x); + p(top_ghost, block_x); + } + void init() { thisFlatIndex = num_chares_y * thisIndex.x + thisIndex.y; @@ -336,7 +402,7 @@ class Stencil : public CBase_Stencil { mode_string = "HAPI"; break; } - CkPrintf("[%*d] Mode: %s, PE: %d\n", n_digits, thisFlatIndex, mode_string.c_str(), CkMyPe()); + // CkPrintf("[%*d] Mode: %s, PE: %d\n", n_digits, thisFlatIndex, mode_string.c_str(), CkMyPe()); // Initialize values my_iter = 0; @@ -365,28 +431,28 @@ class Stencil : public 
CBase_Stencil { // Allocate memory and create CUDA stream if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { hapiCheck( - cudaMallocHost((void**)&temperature, - sizeof(double) * (block_x + 2) * (block_y + 2))); - hapiCheck(cudaMalloc((void**)&d_temperature, - sizeof(double) * (block_x + 2) * (block_y + 2))); - hapiCheck(cudaMalloc((void**)&d_new_temperature, - sizeof(double) * (block_x + 2) * (block_y + 2))); + hapiMallocHost((void**)&temperature, + sizeof(float) * (block_x + 2) * (block_y + 2))); + hapiCheck(hapiMalloc((void**)&d_temperature, + sizeof(float) * (block_x + 2) * (block_y + 2))); + hapiCheck(hapiMalloc((void**)&d_new_temperature, + sizeof(float) * (block_x + 2) * (block_y + 2))); hapiCheck( - cudaMallocHost((void**)&left_ghost, sizeof(double) * block_y)); + hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y)); hapiCheck( - cudaMallocHost((void**)&right_ghost, sizeof(double) * block_y)); + hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y)); hapiCheck( - cudaMallocHost((void**)&bottom_ghost, sizeof(double) * block_x)); - hapiCheck(cudaMallocHost((void**)&top_ghost, sizeof(double) * block_x)); + hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x)); + hapiCheck(hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x)); cudaStreamCreate(&stream); } else { // CPU_MODE - temperature = new double[(block_x + 2) * (block_y + 2)]; - new_temperature = new double[(block_x + 2) * (block_y + 2)]; - left_ghost = new double[block_y]; - right_ghost = new double[block_y]; - top_ghost = new double[block_x]; - bottom_ghost = new double[block_x]; + temperature = new float[(block_x + 2) * (block_y + 2)]; + new_temperature = new float[(block_x + 2) * (block_y + 2)]; + left_ghost = new float[block_y]; + right_ghost = new float[block_y]; + top_ghost = new float[block_x]; + bottom_ghost = new float[block_x]; } // Initialize temperature data @@ -414,8 +480,8 @@ class Stencil : public CBase_Stencil { if ((local_exec_mode == 
CUDA_MODE || local_exec_mode == HAPI_MODE) && my_iter == 0) { hapiCheck( - cudaMemcpyAsync(d_temperature, temperature, - sizeof(double) * (block_x + 2) * (block_y + 2), + hapiMemcpyAsync(d_temperature, temperature, + sizeof(float) * (block_x + 2) * (block_y + 2), cudaMemcpyHostToDevice, stream)); } @@ -445,17 +511,17 @@ class Stencil : public CBase_Stencil { thisProxy(x, y - 1).receiveGhosts(my_iter, TOP, block_x, bottom_ghost); } - void processGhosts(int dir, int width, double* gh) { + void processGhosts(int dir, int width, float* gh) { #ifdef USE_NVTX NVTXTracer nvtx_range(std::to_string(thisFlatIndex) + " Stencil::processGhosts", NVTXColor::WetAsphalt); #endif switch (dir) { case LEFT: if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { - memcpy(left_ghost, gh, width * sizeof(double)); - hapiCheck(cudaMemcpy2DAsync( - d_temperature + (block_x + 2), (block_x + 2) * sizeof(double), - left_ghost, sizeof(double), sizeof(double), block_y, + memcpy(left_ghost, gh, width * sizeof(float)); + hapiCheck(hapiMemcpy2DAsync( + d_temperature + (block_x + 2), (block_x + 2) * sizeof(float), + left_ghost, sizeof(float), sizeof(float), block_y, cudaMemcpyHostToDevice, stream)); } else { for (int j = 0; j < width; j++) { @@ -465,11 +531,11 @@ class Stencil : public CBase_Stencil { break; case RIGHT: if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { - memcpy(right_ghost, gh, width * sizeof(double)); - hapiCheck(cudaMemcpy2DAsync( + memcpy(right_ghost, gh, width * sizeof(float)); + hapiCheck(hapiMemcpy2DAsync( d_temperature + (block_x + 2) + (block_x + 1), - (block_x + 2) * sizeof(double), right_ghost, sizeof(double), - sizeof(double), block_y, cudaMemcpyHostToDevice, stream)); + (block_x + 2) * sizeof(float), right_ghost, sizeof(float), + sizeof(float), block_y, cudaMemcpyHostToDevice, stream)); } else { for (int j = 0; j < width; j++) { temperature[(block_x + 2) * (1 + j) + (block_x + 1)] = gh[j]; @@ -478,9 +544,9 @@ class Stencil : public 
CBase_Stencil { break; case BOTTOM: if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { - memcpy(bottom_ghost, gh, width * sizeof(double)); - hapiCheck(cudaMemcpyAsync(d_temperature + 1, bottom_ghost, - block_x * sizeof(double), + memcpy(bottom_ghost, gh, width * sizeof(float)); + hapiCheck(hapiMemcpyAsync(d_temperature + 1, bottom_ghost, + block_x * sizeof(float), cudaMemcpyHostToDevice, stream)); } else { for (int j = 0; j < width; j++) { @@ -490,10 +556,10 @@ class Stencil : public CBase_Stencil { break; case TOP: if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { - memcpy(top_ghost, gh, width * sizeof(double)); - hapiCheck(cudaMemcpyAsync( + memcpy(top_ghost, gh, width * sizeof(float)); + hapiCheck(hapiMemcpyAsync( d_temperature + (block_x + 2) * (block_y + 1) + 1, top_ghost, - block_x * sizeof(double), cudaMemcpyHostToDevice, stream)); + block_x * sizeof(float), cudaMemcpyHostToDevice, stream)); } else { for (int j = 0; j < width; j++) { temperature[(block_x + 2) * (block_y + 1) + (1 + j)] = gh[j]; @@ -505,60 +571,82 @@ class Stencil : public CBase_Stencil { } } + void ResumeFromSync() + { + CkCallback cb(CkReductionTarget(Stencil, compute), thisProxy); + contribute(cb); + } + + void iterate() + { + if (my_iter > 0 && my_iter < num_iters && my_iter % 1000 == 0) + { + cudaStreamSynchronize(stream); + CkPrintf("Load balancing: %d/%d, iteration %d. GPU Load = %f\n", + thisFlatIndex, num_chares_x * num_chares_y, my_iter, getObjGPUTime()); + AtSync(); + } + else + { + if (thisFlatIndex == 0 && my_iter % 100 == 0) + CkPrintf("[%*d] Iteration %d\n", n_digits, thisFlatIndex, my_iter); + thisProxy(thisIndex.x, thisIndex.y).compute(); + } + } + // Updates local data with stencil computation. 
void update() { #ifdef USE_NVTX NVTXTracer nvtx_range(std::to_string(thisFlatIndex) + " Stencil::update", NVTXColor::Amethyst); #endif - CallbackMsg* m = new CallbackMsg(); if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) { // Invoke 2D stencil kernel invokeKernel(stream, d_temperature, d_new_temperature, block_x, block_y, thread_size); // Transfer left ghost - hapiCheck(cudaMemcpy2DAsync(left_ghost, sizeof(double), + hapiCheck(hapiMemcpy2DAsync(left_ghost, sizeof(float), d_new_temperature + (block_x + 2), - (block_x + 2) * sizeof(double), sizeof(double), + (block_x + 2) * sizeof(float), sizeof(float), block_y, cudaMemcpyDeviceToHost, stream)); // Transfer right ghost hapiCheck( - cudaMemcpy2DAsync(right_ghost, sizeof(double), + hapiMemcpy2DAsync(right_ghost, sizeof(float), d_new_temperature + (block_x + 2) + (block_x + 1), - (block_x + 2) * sizeof(double), sizeof(double), + (block_x + 2) * sizeof(float), sizeof(float), block_y, cudaMemcpyDeviceToHost, stream)); // Transfer bottom ghost - hapiCheck(cudaMemcpyAsync(bottom_ghost, d_new_temperature + 1, - block_x * sizeof(double), cudaMemcpyDeviceToHost, + hapiCheck(hapiMemcpyAsync(bottom_ghost, d_new_temperature + 1, + block_x * sizeof(float), cudaMemcpyDeviceToHost, stream)); // Transfer top ghost - hapiCheck(cudaMemcpyAsync( + hapiCheck(hapiMemcpyAsync( top_ghost, d_new_temperature + (block_x + 2) * (block_y + 1) + 1, - block_x * sizeof(double), cudaMemcpyDeviceToHost, stream)); + block_x * sizeof(float), cudaMemcpyDeviceToHost, stream)); // Copy final temperature data back to host (on last iteration) if (my_iter == num_iters - 1) { hapiCheck( - cudaMemcpyAsync(temperature, d_new_temperature, - sizeof(double) * (block_x + 2) * (block_y + 2), + hapiMemcpyAsync(temperature, d_new_temperature, + sizeof(float) * (block_x + 2) * (block_y + 2), cudaMemcpyDeviceToHost, stream)); } if (local_exec_mode == CUDA_MODE) { cudaStreamSynchronize(stream); - thisProxy(thisIndex.x, thisIndex.y).iterate(m); + 
thisProxy(thisIndex.x, thisIndex.y).iterate(); } else { CkArrayIndex2D myIndex = CkArrayIndex2D(thisIndex); CkCallback* cb = - new CkCallback(CkIndex_Stencil::iterate(NULL), myIndex, thisProxy); - if (gpu_prio) - CkSetQueueing(m, CK_QUEUEING_LIFO); - hapiAddCallback(stream, cb, m); + new CkCallback(CkIndex_Stencil::iterate(), myIndex, thisProxy); + //if (gpu_prio) + // CkSetQueueing(m, CK_QUEUEING_LIFO); + hapiAddCallback(stream, cb); } } else { // CPU_MODE for (int i = 1; i <= block_x; ++i) { @@ -573,12 +661,12 @@ class Stencil : public CBase_Stencil { DIVIDEBY5; } } - double* tmp; + float* tmp; tmp = temperature; temperature = new_temperature; new_temperature = tmp; - thisProxy(thisIndex.x, thisIndex.y).iterate(m); + thisProxy(thisIndex.x, thisIndex.y).iterate(); } } diff --git a/examples/charm++/cuda/stencil2d/stencil2d.ci b/examples/charm++/cuda/stencil2d/stencil2d.ci index e39009f78b..467596bb2f 100644 --- a/examples/charm++/cuda/stencil2d/stencil2d.ci +++ b/examples/charm++/cuda/stencil2d/stencil2d.ci @@ -17,23 +17,21 @@ mainmodule stencil2d { entry CustomMap(); }; - message CallbackMsg; mainchare Main { entry Main(CkArgMsg* m); entry [reductiontarget] void initDone(); - entry [reductiontarget] void done(double time); + entry [reductiontarget] void done(float times[n], int n); }; array [2D] Stencil { entry Stencil(void); entry void init(); - entry void receiveGhosts(int ref, int dir, int w, double gh[w]); + entry void receiveGhosts(int ref, int dir, int w, float gh[w]); + entry void iterate(); - entry void iterate(CallbackMsg* m) { + entry [reductiontarget] void compute() { serial { - delete m; - // Measure iteration time if (my_iter > 0) { agg_time += CkWallTimer() - iter_start_time; @@ -42,10 +40,9 @@ mainmodule stencil2d { // Terminate if all iterations are complete if (my_iter >= num_iters) { - CkPrintf("[%*d] Average time per iteration: %lf\n", n_digits, - thisFlatIndex, agg_time / num_iters); + float times[2] = {agg_time, getObjGPUTime()}; CkCallback 
cb(CkReductionTarget(Main, done), mainProxy); - contribute(sizeof(double), &agg_time, CkReduction::sum_double, cb); + contribute(2 * sizeof(float), times, CkReduction::sum_float, cb); } // Send ghost data to neighbors @@ -54,7 +51,7 @@ mainmodule stencil2d { // Receive ghost data from neighbors for (remote_count = 0; remote_count < neighbors; remote_count++) { - when receiveGhosts[my_iter](int ref, int dir, int w, double buf[w]) serial { + when receiveGhosts[my_iter](int ref, int dir, int w, float buf[w]) serial { processGhosts(dir, w, buf); } } diff --git a/examples/charm++/cuda/stencil2d/stencil2d.cu b/examples/charm++/cuda/stencil2d/stencil2d.cu index aff99c88dc..051e26d72c 100644 --- a/examples/charm++/cuda/stencil2d/stencil2d.cu +++ b/examples/charm++/cuda/stencil2d/stencil2d.cu @@ -4,7 +4,7 @@ #define DIVIDEBY5 0.2 #endif -__global__ void stencil2DKernel(double* temperature, double* new_temperature, +__global__ void stencil2DKernel(float* temperature, float* new_temperature, int block_x, int block_y, int thread_size) { int i_start = (blockDim.x * blockIdx.x + threadIdx.x) * thread_size + 1; int i_finish = @@ -32,8 +32,8 @@ __global__ void stencil2DKernel(double* temperature, double* new_temperature, int j = jstart + threadIdx.y + blockDim.y*blockIdx.y; if (i < ifinish && j < jfinish) { - __shared__ double shared_temperature[TILE_SIZE][TILE_SIZE]; - double center = temperature[j*(block_x+2)+i]; + __shared__ float shared_temperature[TILE_SIZE][TILE_SIZE]; + float center = temperature[j*(block_x+2)+i]; shared_temperature[threadIdx.x][threadIdx.y] = center; __syncthreads(); @@ -55,8 +55,8 @@ __global__ void stencil2DKernel(double* temperature, double* new_temperature, */ } -void invokeKernel(cudaStream_t stream, double* d_temperature, - double* d_new_temperature, int block_x, int block_y, +void invokeKernel(cudaStream_t stream, float* d_temperature, + float* d_new_temperature, int block_x, int block_y, int thread_size) { dim3 block_dim(TILE_SIZE, TILE_SIZE); dim3 
grid_dim( @@ -64,7 +64,10 @@ void invokeKernel(cudaStream_t stream, double* d_temperature, (block_y + (block_dim.y * thread_size - 1)) / (block_dim.y * thread_size)); - stencil2DKernel<<>>( - d_temperature, d_new_temperature, block_x, block_y, thread_size); - hapiCheck(cudaPeekAtLastError()); + // stencil2DKernel<<>>( + // d_temperature, d_new_temperature, block_x, block_y, thread_size); + // hapiCheck(cudaPeekAtLastError()); + + hapiCheck(hapiLaunchKernelWrapper(stencil2DKernel, grid_dim, block_dim, 0, stream, + d_temperature, d_new_temperature, block_x, block_y, thread_size)); } diff --git a/examples/charm++/cuda/vecadd/Makefile b/examples/charm++/cuda/vecadd/Makefile index 8ac6ee0487..c4dfd1967b 100644 --- a/examples/charm++/cuda/vecadd/Makefile +++ b/examples/charm++/cuda/vecadd/Makefile @@ -6,12 +6,12 @@ CHARMC = ../../../../bin/charmc $(DEFS) $(OPTS) DEFS = #-DUSE_WR -USE_NVTX # set CUDATOOLKIT_HOME to the CUDA toolkit directory -CUDATOOLKIT_HOME ?= /usr/local/cuda +CUDATOOLKIT_HOME ?= /usr/ NVCC = $(CUDATOOLKIT_HOME)/bin/nvcc -NVCC_FLAGS = -c -std=c++11 $(DEFS) +NVCC_FLAGS = -c -std=c++11 -lcuda -lnccl $(DEFS) NVCC_INC = -I$(CUDATOOLKIT_HOME)/include CHARM_INC = -I../../../../include -LD_LIBS = #-lnvToolsExt +LD_LIBS = -lcuda -lnccl #-lnvToolsExt TARGET = vecadd all: $(TARGET) diff --git a/examples/charm++/cuda/vecadd/vecadd.C b/examples/charm++/cuda/vecadd/vecadd.C index c32cf79a0f..8ccd0ff057 100644 --- a/examples/charm++/cuda/vecadd/vecadd.C +++ b/examples/charm++/cuda/vecadd/vecadd.C @@ -4,23 +4,21 @@ #ifdef USE_NVTX #include "hapi_nvtx.h" #endif +#include "nccl.h" /* readonly */ CProxy_Main mainProxy; -/* readonly */ int vectorSize; +/* readonly */ CProxy_NCCLManager ncclManagerProxy; -#ifdef USE_WR -extern void cudaVecAdd(int, float*, float*, float*, cudaStream_t, void*); -#else -extern void cudaVecAdd(int, float*, float*, float*, float*, float*, float*, - cudaStream_t, void*); -#endif + +extern void cudaVecAdd(int vectorSize, float* h_A, float* d_A); 
+extern void localReduce(float* A, float* result, int n); void randomInit(float* data, int size) { #ifdef USE_NVTX NVTXTracer nvtx_range("randomInit", NVTXColor::PeterRiver); #endif for (int i = 0; i < size; ++i) { - data[i] = rand() / (float)RAND_MAX; + data[i] = 10; } } @@ -39,7 +37,6 @@ class Main : public CBase_Main { // default values mainProxy = thisProxy; numChares = 4; - vectorSize = 1024; // handle arguments int c; @@ -48,9 +45,6 @@ class Main : public CBase_Main { case 'c': numChares = atoi(optarg); break; - case 's': - vectorSize = atoi(optarg); - break; default: CkPrintf("Usage: %s -c [chares] -s [vector size]\n", m->argv[0]); CkExit(); @@ -60,17 +54,23 @@ class Main : public CBase_Main { // print configuration CkPrintf("\n[CUDA vecadd example]\n"); - CkPrintf("Chares: %d\n", numChares); - CkPrintf("Vector size: %d\n", vectorSize); + CkPrintf("Chares: %d\n", 1); // create 1D chare array - workers = CProxy_Workers::ckNew(numChares); + workers = CProxy_Workers::ckNew(1024, 4 * CkNumPes()); + + ncclManagerProxy = CProxy_NCCLManager::ckNew(); // start measuring execution time startTime = CkWallTimer(); // fire off all chares in array + //workers.begin(); + } + + void nccl_done() { workers.begin(); + //ncclManagerProxy.ckLocalBranch()->localChares++; } void done() { @@ -83,103 +83,123 @@ class Main : public CBase_Main { } }; +class NCCLManager : public CBase_NCCLManager { + private: + int localChares; + ncclUniqueId id; + ncclComm_t comm; + float* localRed; + int redCount; + + public: + float* globalRed; + + NCCLManager() { + redCount = 0; + if (CkMyPe() == 0) { + ncclGetUniqueId(&id); + thisProxy.recvNCCLId(sizeof(ncclUniqueId), (char*)id.internal); + } + } + + NCCLManager(CkMigrateMessage* m) : CBase_NCCLManager(m) {} + + void registerChare() { + localChares++; + } + + void setupNCCL() { + CkPrintf("NCCL Unique ID generated by PE %d\n", CkMyPe()); + + CkCallback cb(CkIndex_Main::nccl_done(), mainProxy); + contribute(cb); + } + + void recvNCCLId(int size, 
char* id_buf) { + //CkPrintf("NCCL Unique ID received by PE %d\n", CkMyPe()); + memcpy(id.internal, id_buf, sizeof(ncclUniqueId)); + + ncclCommInitRank(&comm, CkNumPes(), id, CkMyPe()); + + CkCallback cb(CkIndex_Main::nccl_done(), mainProxy); + contribute(cb); + } + + void deviceContribute(int size, float* data) { + localReduce(localRed, data, size); + if (++redCount == localChares) { + ncclReduce((const void*)localRed, (void*)globalRed, size, + ncclFloat, ncclSum, 0, comm, cudaStreamDefault); + redCount = 0; + } + } +}; + class Workers : public CBase_Workers { private: + int vectorSize; float* h_A; - float* h_B; - float* h_C; -#ifndef USE_WR float* d_A; - float* d_B; - float* d_C; -#endif cudaStream_t stream; public: - Workers() { -#ifdef USE_NVTX - NVTXTracer nvtx_range("Workers::Workers", NVTXColor::WetAsphalt); -#endif - - int size = sizeof(float) * vectorSize; - hapiCheck(cudaMallocHost(&h_A, size)); - hapiCheck(cudaMallocHost(&h_B, size)); - hapiCheck(cudaMallocHost(&h_C, size)); + Workers(int size) : vectorSize(size) { + int dataSize = sizeof(float) * vectorSize; + hapiCheck(cudaMallocHost(&h_A, dataSize)); hapiCheck(cudaStreamCreate(&stream)); -#ifndef USE_WR - hapiCheck(cudaMalloc(&d_A, size)); - hapiCheck(cudaMalloc(&d_B, size)); - hapiCheck(cudaMalloc(&d_C, size)); -#endif + hapiCheck(hapiMalloc((void**) &d_A, dataSize)); srand(time(NULL)); randomInit(h_A, vectorSize); - randomInit(h_B, vectorSize); } - ~Workers() { -#ifdef USE_NVTX - NVTXTracer nvtx_range("Workers::~Workers", NVTXColor::WetAsphalt); -#endif + Workers(CkMigrateMessage* m) : CBase_Workers(m) + { + hapiCheck(cudaStreamCreate(&stream)); + } + ~Workers() { hapiCheck(cudaFreeHost(h_A)); - hapiCheck(cudaFreeHost(h_B)); - hapiCheck(cudaFreeHost(h_C)); hapiCheck(cudaStreamDestroy(stream)); -#ifndef USE_WR hapiCheck(cudaFree(d_A)); - hapiCheck(cudaFree(d_B)); - hapiCheck(cudaFree(d_C)); -#endif } - void begin() { -#ifdef USE_NVTX - NVTXTracer nvtx_range("Workers::begin", NVTXColor::Carrot); 
-#endif - - CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex); - CkCallback* cb = - new CkCallback(CkIndex_Workers::complete(), myIndex, thisArrayID); -#ifdef USE_WR - cudaVecAdd(vectorSize, h_A, h_B, h_C, stream, (void*)cb); -#else - cudaVecAdd(vectorSize, h_A, h_B, h_C, d_A, d_B, d_C, stream, (void*)cb); -#endif + void pup(PUP::er& p) { + p | vectorSize; + if (p.isUnpacking()) + { + hapiMalloc((void**) &d_A, vectorSize * sizeof(float)); + } + p(d_A, vectorSize, PUP::PUPMode::DEVICE); } - void complete() { -#ifdef USE_NVTX - NVTXTracer nvtx_range("Workers::complete", NVTXColor::Clouds); -#endif - -#ifdef DEBUG - CkPrintf("[%d] A\n", thisIndex); - for (int i = 0; i < vectorSize; i++) { - CkPrintf("%.2f ", h_A[i]); - } - CkPrintf("\n"); + void begin() { + ncclManagerProxy.ckLocalBranch()->registerChare(); + CkCallback cb(CkIndex_Workers::reduction(), thisProxy); + contribute(cb); + } - CkPrintf("[%d] B\n", thisIndex); - for (int i = 0; i < vectorSize; i++) { - CkPrintf("%.2f ", h_B[i]); - } - CkPrintf("\n"); + void reduction() { + ncclManagerProxy.ckLocalBranch()->deviceContribute(vectorSize, d_A); - CkPrintf("[%d] C\n", thisIndex); - for (int i = 0; i < vectorSize; i++) { - CkPrintf("%.2f ", h_C[i]); + if (thisIndex == 0) { + cudaMemcpy(h_A, ncclManagerProxy.ckLocalBranch()->globalRed, + sizeof(float) * vectorSize, cudaMemcpyDeviceToHost); } - CkPrintf("\n"); - CkPrintf("[%d] C-gold\n", thisIndex); - for (int j = 0; j < vectorSize; j++) { - h_C[j] = h_A[j] + h_B[j]; - CkPrintf("%.2f ", h_C[j]); + for (int i = 0; i < vectorSize; ++i) { + if (thisIndex == 0) { + // Expected value is 10 * total number of chares + float expected = 10.0f * 10; + if (h_A[i] != expected) { + CkPrintf("Error at index %d: Expected %.2f, Got %.2f\n", + i, expected, h_A[i]); + break; + } + } } - CkPrintf("\n"); -#endif + //hapiCheck(hapiFree(d_A)); contribute(CkCallback(CkIndex_Main::done(), mainProxy)); } }; diff --git a/examples/charm++/cuda/vecadd/vecadd.ci 
b/examples/charm++/cuda/vecadd/vecadd.ci index 4b77228354..319aa86af6 100644 --- a/examples/charm++/cuda/vecadd/vecadd.ci +++ b/examples/charm++/cuda/vecadd/vecadd.ci @@ -1,15 +1,21 @@ mainmodule vecadd { - readonly int vectorSize; readonly CProxy_Main mainProxy; + readonly CProxy_NCCLManager ncclManagerProxy; mainchare Main { entry Main(CkArgMsg* m); + entry [reductiontarget] void nccl_done(); entry [reductiontarget] void done(); }; + group NCCLManager { + entry NCCLManager(); + entry void recvNCCLId(int size, char id_buf[size]); + }; + array [1D] Workers { - entry Workers(); + entry Workers(int size); entry void begin(); - entry void complete(); + entry [reductiontarget] void reduction(); }; }; diff --git a/examples/charm++/cuda/vecadd/vecadd.cu b/examples/charm++/cuda/vecadd/vecadd.cu index 3dda85282a..844456dbf6 100644 --- a/examples/charm++/cuda/vecadd/vecadd.cu +++ b/examples/charm++/cuda/vecadd/vecadd.cu @@ -7,13 +7,13 @@ #define B_INDEX 1 #define C_INDEX 2 -__global__ void vecAdd(float* C, float* A, float* B, int n) { +__global__ void vecAdd(float* C, float* A, int n) { // Get our global thread ID int id = blockIdx.x * blockDim.x + threadIdx.x; // Make sure we do not go out of bounds if (id < n) { - C[id] = A[id] + B[id]; + C[id] = C[id] + A[id]; } } @@ -27,42 +27,12 @@ void run_VECADD_KERNEL(hapiWorkRequest* wr, cudaStream_t kernel_stream, } #endif -#ifdef USE_WR -void cudaVecAdd(int vectorSize, float* h_A, float* h_B, float* h_C, - cudaStream_t stream, void* cb) { -#else -void cudaVecAdd(int vectorSize, float* h_A, float* h_B, float* h_C, float* d_A, - float* d_B, float* d_C, cudaStream_t stream, void* cb) { -#endif - int size = vectorSize * sizeof(float); - dim3 dimBlock(BLOCK_SIZE, 1); - dim3 dimGrid((vectorSize - 1) / dimBlock.x + 1, 1); - -#ifdef USE_WR - // DEPRECATED - hapiWorkRequest* wr = hapiCreateWorkRequest(); - wr->setExecParams(dimGrid, dimBlock); - wr->setStream(stream); - wr->addBuffer(h_A, size, true, false, true); - wr->addBuffer(h_B, 
size, true, false, true); - wr->addBuffer(h_C, size, false, true, true); - wr->setCallback(cb); -#ifdef HAPI_TRACE - wr->setTraceName("vecadd"); -#endif - wr->setRunKernel(run_VECADD_KERNEL); - wr->copyUserData(&vectorSize, sizeof(int)); - - hapiEnqueue(wr); -#else - hapiCheck(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream)); - hapiCheck(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream)); - - vecAdd<<>>(d_C, d_A, d_B, vectorSize); - hapiCheck(cudaPeekAtLastError()); - - hapiCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream)); +void localReduce(float* A, float* result, int n) { + vecAdd<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>( + result, A, n); +} - hapiAddCallback(stream, cb); -#endif +void cudaVecAdd(int vectorSize, float* h_A, float* d_A) { + int size = vectorSize * sizeof(float); + hapiCheck(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice)); } diff --git a/examples/charm++/hello/1darray/hello.C b/examples/charm++/hello/1darray/hello.C index d83397da75..ba3d03570e 100644 --- a/examples/charm++/hello/1darray/hello.C +++ b/examples/charm++/hello/1darray/hello.C @@ -24,9 +24,10 @@ public: CkArrayOptions opts; opts.setNumInitial(nElements); CkCallback initCB(CkIndex_Main::initDone(), thisProxy); - opts.setInitCallback(initCB); + //opts.setInitCallback(initCB); opts.setStaticInsertion(true); arrProxy = CProxy_Hello::ckNew(opts); + //CkExit(); }; void initDone(void) { @@ -48,6 +49,7 @@ public: Hello() { CkPrintf("Hello %d created\n",thisIndex); + CkExit(); } Hello(CkMigrateMessage *m) {} diff --git a/examples/charm++/osu_bw/Makefile b/examples/charm++/osu_bw/Makefile new file mode 100644 index 0000000000..1b94ba7513 --- /dev/null +++ b/examples/charm++/osu_bw/Makefile @@ -0,0 +1,27 @@ +-include ../../common.mk +CHARMC=../../../bin/charmc -O3 $(OPTS) + +OBJS = osu_bw.o + +all: osu_bw + +osu_bw: $(OBJS) + $(CHARMC) -language charm++ -o osu_bw $(OBJS) + +proj: $(OBJS) + $(CHARMC) -language charm++ -tracemode 
projections -o osu_bw.prj $(OBJS) + +osu_bw.o: osu_bw.C osu_bw.decl.h + $(CHARMC) -c osu_bw.C + +osu_bw.decl.h: osu_bw.ci + $(CHARMC) osu_bw.ci + +clean: + rm -f *.decl.h *.def.h conv-host *.o osu_bw osu_bw.prj charmrun *~ *log *projrc *sts + +test: all + $(call run, +p4 ./osu_bw 32 32 4 ) + +testp: all + $(call run, +p$(P) ./osu_bw $$(( $(P) * 80 )) $$(( $(P) * 80 )) $$(( $(P) * 10 )) ) diff --git a/examples/charm++/osu_bw/osu_bw.C b/examples/charm++/osu_bw/osu_bw.C new file mode 100644 index 0000000000..17f51390bf --- /dev/null +++ b/examples/charm++/osu_bw/osu_bw.C @@ -0,0 +1,165 @@ +// osu_bw.C +#include "osu_bw.decl.h" +#include +#include + +static const int DEFAULT_MIN = 1; +static const int DEFAULT_MAX = 1<<22; // 4 MiB +static const int DEFAULT_ITERS = 1000; +static const int DEFAULT_SKIP = 100; +static const int DEFAULT_WIN = 64; + +class Endpoint; +class Main; + +class DataMsg : public CMessage_DataMsg { + public: + int size; + char* data; +}; + +class Main : public CBase_Main { + CProxy_Endpoint sender, receiver; + int minSize, maxSize, iters, skip, win; + int curSize; + double tMeasured; + int pendingReports; + int ready_count; + + public: + Main(CkArgMsg* m) { + // Parse arguments + minSize = DEFAULT_MIN; maxSize = DEFAULT_MAX; iters = DEFAULT_ITERS; skip = DEFAULT_SKIP; win = DEFAULT_WIN; + ready_count = 0; + for (int i=1; iargc; ++i) { + if (!strcmp(m->argv[i], "-m") && i+1argc) minSize = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-M") && i+1argc) maxSize = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-i") && i+1argc) iters = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-s") && i+1argc) skip = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-w") && i+1argc) win = atoi(m->argv[++i]); + } + delete m; + + // Create endpoints - handle case where we only have 1 PE + if (CkNumPes() < 2) { + CkPrintf("Error: Need at least 2 PEs to run bandwidth test\n"); + CkExit(); + return; + } + sender = CProxy_Endpoint::ckNew(thisProxy, iters, 
win, skip, 0); + receiver = CProxy_Endpoint::ckNew(thisProxy, iters, win, skip, 1); + sender.setPeer(receiver); + receiver.setPeer(sender); + + // Header like OMB + CkPrintf("# OSU-style Bandwidth (Charm++)\n# Size MB/s (MB=1e6)\n"); + + curSize = minSize; + pendingReports = 0; + } + + void ready() + { + ready_count++; + if (ready_count == 2) { + nextSize(); + } + } + + void nextSize() { + if (curSize > maxSize) { + finish(); + return; + } + pendingReports = 1; + sender.start(curSize); + } + + void doneOne(double seconds) { + // Compute bandwidth in MB/s (decimal) + double bytes = double(curSize) * double(iters) * double(win); + double mbps = bytes / seconds / 1.0e6; + CkPrintf("%-10d %.2f\n", curSize, mbps); + curSize = (curSize == 0) ? 1 : curSize * 2; + nextSize(); + } + + void finish() { + CkExit(); + } +}; + +class Endpoint : public CBase_Endpoint { + CProxy_Endpoint peer; + CProxy_Main mainProxy; + int size, iters, window, skip; + int iter, inFlightRecv, recvInIter; + double t0; + + public: + Endpoint(CProxy_Main m, int iters_, int window_, int skip_) : + mainProxy(m), size(0), iters(iters_), window(window_), skip(skip_), + iter(0), inFlightRecv(0), recvInIter(0), t0(0.0) {} + + void setPeer(CProxy_Endpoint p) { + peer = p; + mainProxy.ready(); + } + + void start(int size_) { + size = size_; iter = 0; recvInIter = 0; inFlightRecv = 0; t0 = 0.0; + // Warmups + measured + // Kick off first window - but only from sender (PE 0) + if (CkMyPe() == 0) { + //CkPrintf("Starting bandwidth test: size=%d, iters=%d, window=%d, skip=%d\n", + // size, iters, window, skip); + sendWindow(); + } else if (CkMyPe() == 1) { + CkPrintf("Receiver ready on PE %d\n", CkMyPe()); + } + } + + void sendWindow() { + // Start timer at end of warmup + if (iter == skip) t0 = CkWallTimer(); + for (int w = 0; w < window; ++w) { + DataMsg* m = new (size) DataMsg; + //DataMsg* m = (DataMsg*)CkAllocMsg(DataMsg, sizeof(DataMsg) + size); + m->size = size; + // touch payload to avoid lazy effects + 
if (size > 0) memset(m->data, w, size); + peer.recv(m); + } + // Wait for ack from receiver to proceed to next window/iter + } + + void recv(DataMsg* m) { + // Receiver counts messages and acks per window + recvInIter++; + //CkPrintf("Received message of size %d on PE %d, %d, %d\n", m->size, CkMyPe(), recvInIter, window); + if (recvInIter == window) { + recvInIter = 0; + peer.ack(); + } + delete m; + } + + void ack() { + // Sender advances iteration + iter++; + // After warmups + measured iterations, stop and report + if (iter == skip + iters) { + double t = CkWallTimer() - t0; + if (CkMyPe() == 0) { + //CkPrintf("Test completed, reporting results\n"); + mainProxy.doneOne(t); + } + return; + } + // Otherwise send next window + if (CkMyPe() == 0) sendWindow(); + } +}; + +#include "osu_bw.def.h" + diff --git a/examples/charm++/osu_bw/osu_bw.ci b/examples/charm++/osu_bw/osu_bw.ci new file mode 100644 index 0000000000..7f44084d1e --- /dev/null +++ b/examples/charm++/osu_bw/osu_bw.ci @@ -0,0 +1,23 @@ +// osu_bw.ci +mainmodule osu_bw { + message DataMsg { + char data[]; + }; + + mainchare Main { + entry Main(CkArgMsg* m); + entry void nextSize(); + entry void doneOne(double seconds); + entry void finish(); + entry void ready(); + }; + + chare Endpoint { + entry Endpoint(CProxy_Main m, int iters, int window, int skip); + entry void setPeer(CProxy_Endpoint p); + entry void start(int size); + entry void recv(DataMsg *m); + entry void ack(); + }; +} + diff --git a/examples/charm++/osu_latency/Makefile b/examples/charm++/osu_latency/Makefile new file mode 100644 index 0000000000..619fd1d90e --- /dev/null +++ b/examples/charm++/osu_latency/Makefile @@ -0,0 +1,21 @@ +-include ../../common.mk +CHARMC=../../../bin/charmc $(OPTS) + +OBJS = osu_latency.o + +all: osu_latency + +osu_latency: $(OBJS) + $(CHARMC) -language charm++ -o osu_latency $(OBJS) + +osu_latency.decl.h: osu_latency.ci + $(CHARMC) osu_latency.ci + +clean: + rm -f *.decl.h *.def.h conv-host *.o osu_latency charmrun 
+ +osu_latency.o: osu_latency.C osu_latency.decl.h + $(CHARMC) -c osu_latency.C + +test: all + $(call run, +p2 ./osu_latency) diff --git a/examples/charm++/osu_latency/osu_latency.C b/examples/charm++/osu_latency/osu_latency.C new file mode 100644 index 0000000000..eeaf1dc61b --- /dev/null +++ b/examples/charm++/osu_latency/osu_latency.C @@ -0,0 +1,164 @@ +// osu_latency.C +#include "osu_latency.decl.h" +#include +#include + +static const int DEFAULT_MIN = 0; +static const int DEFAULT_MAX = 1<<22; // 4 MiB +static const int DEFAULT_ITERS = 10000; +static const int DEFAULT_SKIP = 1000; + +CProxy_Main mainProxy; + +class Endpoint; +class Main; + +class LatencyMsg : public CMessage_LatencyMsg { + public: + int size; + char* data; +}; + +class Main : public CBase_Main { + CProxy_Endpoint sender, receiver; + int minSize, maxSize, iters, skip; + int curSize; + int ready_count; + + public: + Main(CkArgMsg* m) { + // Parse arguments + minSize = DEFAULT_MIN; maxSize = DEFAULT_MAX; iters = DEFAULT_ITERS; skip = DEFAULT_SKIP; + ready_count = 0; + for (int i=1; iargc; ++i) { + if (!strcmp(m->argv[i], "-m") && i+1argc) minSize = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-M") && i+1argc) maxSize = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-i") && i+1argc) iters = atoi(m->argv[++i]); + else if (!strcmp(m->argv[i], "-s") && i+1argc) skip = atoi(m->argv[++i]); + } + delete m; + + // Create endpoints - handle case where we only have 1 PE + if (CkNumPes() < 2) { + CkPrintf("Error: Need at least 2 PEs to run latency test\n"); + CkExit(); + return; + } + sender = CProxy_Endpoint::ckNew(thisProxy, iters, skip, 0); + receiver = CProxy_Endpoint::ckNew(thisProxy, iters, skip, 1); + sender.setPeer(receiver); + receiver.setPeer(sender); + mainProxy = thisProxy; + + // Header like OMB + CkPrintf("# OSU-style Latency (Charm++)\n# Size Latency (us)\n"); + + curSize = minSize; + } + + void ready() { + ready_count++; + if (ready_count == 2) { + nextSize(); + } + } + + void 
nextSize() { + if (curSize > maxSize) { + finish(); + return; + } + sender.start(curSize); + } + + void doneOne(double seconds) { + // Compute latency in microseconds (round-trip / 2) + double latency_us = (seconds / (2.0 * double(iters))) * 1.0e6; + CkPrintf("%-12d %.2f\n", curSize, latency_us); + curSize = (curSize == 0) ? 1 : curSize * 2; + nextSize(); + } + + void finish() { + CkExit(); + } +}; + +class Endpoint : public CBase_Endpoint { + CProxy_Endpoint peer; + int size, iters, skip; + int iter; + double t0; + bool is_sender; + + public: + Endpoint(CProxy_Main m, int iters_, int skip_) : + size(0), iters(iters_), skip(skip_), + iter(0), t0(0.0), is_sender(false) {} + + void setPeer(CProxy_Endpoint p) { + peer = p; + is_sender = (CkMyPe() == 0); + mainProxy.ready(); + } + + void start(int size_) { + size = size_; + iter = 0; + t0 = 0.0; + + if (is_sender) { + // Start the ping-pong + sendPing(); + } + } + + void sendPing() { + // Start timer at end of warmup + if (iter == skip) { + t0 = CkWallTimer(); + } + + LatencyMsg* m = new (size) LatencyMsg; + m->size = size; + // touch payload to avoid lazy effects + if (size > 0) { + memset(m->data, iter % 256, size); + } + peer.ping(m); + } + + void ping(LatencyMsg* m) { + // Receiver gets ping and sends pong back + if (!is_sender) { + LatencyMsg* reply = new (m->size) LatencyMsg; + reply->size = m->size; + if (m->size > 0) { + memcpy(reply->data, m->data, m->size); + } + peer.pong(reply); + } + delete m; + } + + void pong(LatencyMsg* m) { + // Sender gets pong back, completes one iteration + if (is_sender) { + iter++; + + // After warmups + measured iterations, stop and report + if (iter == skip + iters) { + double t = CkWallTimer() - t0; + mainProxy.doneOne(t); + delete m; + return; + } + + // Otherwise send next ping + delete m; + sendPing(); + } + } +}; + +#include "osu_latency.def.h" diff --git a/examples/charm++/osu_latency/osu_latency.ci b/examples/charm++/osu_latency/osu_latency.ci new file mode 100644 index 
0000000000..d8730a70f7 --- /dev/null +++ b/examples/charm++/osu_latency/osu_latency.ci @@ -0,0 +1,26 @@ +// osu_latency.ci +mainmodule osu_latency { + readonly CProxy_Main mainProxy; + + message LatencyMsg { + char data[]; + }; + + mainchare Main { + entry Main(CkArgMsg* m); + entry void ready(); + entry void nextSize(); + entry void doneOne(double seconds); + entry void finish(); + }; + + chare Endpoint { + entry Endpoint(CProxy_Main m, int iters, int skip); + entry void setPeer(CProxy_Endpoint p); + entry void start(int size); + entry void sendPing(); + entry void ping(LatencyMsg* m); + entry void pong(LatencyMsg* m); + }; + +}; diff --git a/examples/charm++/shrink_expand/README b/examples/charm++/shrink_expand/README index 685431a40d..986c8e0183 100644 --- a/examples/charm++/shrink_expand/README +++ b/examples/charm++/shrink_expand/README @@ -1,20 +1,25 @@ To be able to shrink and expand an application 1 - Needs to run with a load balancer - 2 - Ccs server option needs to be added buring runtime + 2 - Ccs server option needs to be added during runtime Example running command: -./charmrun +p4 jacobi2d 200 20 +balancer GreedyLB +LBDebug 3 ++nodelist ./mynodelistfile ++server ++server-port 1234 +./charmrun_elastic +p4 jacobi2d 200 20 +balancer GreedyLB +LBDebug 3 ++nodelist ./mynodelistfile ++server ++server-port 1234 + +Ignore the ++nodelist ./mynodelistfile argument when running locally. + +Use the MPI machinefile format for mynodelist. For example, + + slots= + slots= + slots= Use the client to send the shrink or expand command to the running application: -./client +./client For example this command will expand the application from 4 to 8 PEs: -./client valor 1234 4 8 +./client 1234 4 0 4 -NOTE 1: Charm needs to built with --enable-shrinkexpand option. 
+To shrink the application from 4 to 2 PEs by killing PEs 2, 3: +./client 1234 4 2 2 3 0 -NOTE 2: Let's say you want to shrink your application from 2 nodes to 1 node where -each node has 8 cores, you should have repetitive 8 entries in the nodelist file -for the number of cores in each node. Otherwise, you'll end up shrinking your -application in a way that it'll use 4 cores from each node whereas what you really -want is(usually) to only use 8 cores in one of the nodes after shrink. +NOTE 1: Charm needs to built with --enable-shrinkexpand option. diff --git a/examples/charm++/shrink_expand/client.C b/examples/charm++/shrink_expand/client.C index 0292894fdf..d360eadef2 100644 --- a/examples/charm++/shrink_expand/client.C +++ b/examples/charm++/shrink_expand/client.C @@ -14,58 +14,59 @@ int main (int argc, char **argv) { int OLDNPROCS, NEWNPROCS; - if (argc < 5) { - printf("Usage: %s \n", argv[0]); - return 1; - } - // Create a CcsServer and connect to the given hostname and port CcsServer server; - char host[BUF], *bitmap; - int i, port, cmdLen, mode; + char host[BUF], *msg; + int i, port, cmdLen, numKilled, numAdded; + bool isExpand; sprintf(host, "%s", argv[1]); sscanf(argv[2], "%d", &port); sscanf(argv[3], "%d", &OLDNPROCS); - sscanf(argv[4], "%d", &NEWNPROCS); + sscanf(argv[4], "%d", &numKilled); + int killedIndex[numKilled]; + + for (i = 0; i < numKilled; i++) { + sscanf(argv[5 + i], "%d", &killedIndex[i]); + } + + sscanf(argv[5 + numKilled], "%d", &numAdded); + + NEWNPROCS = OLDNPROCS - numKilled + numAdded; - if( NEWNPROCS > OLDNPROCS) - mode = EXPAND; - else if(OLDNPROCS > NEWNPROCS) - mode = SHRINK; - else{ - printf("Error: Old and new PE number is the same!\n"); + //printf("Connecting to server %s %d\n", host, port); + if (CcsConnect(&server, host, port, NULL) == -1) { + printf("0"); return 0; } - printf("Connecting to server %s %d\n", host, port); - CcsConnect(&server, host, port, NULL); - printf("Connected to server\n"); + //printf("Connected to 
server\n"); - cmdLen = OLDNPROCS * sizeof(char) + sizeof(int) + sizeof(char); - bitmap = (char *) malloc(cmdLen); - - if (mode == EXPAND) { - printf("Sending expand command.\n"); - for (i = 0; i < OLDNPROCS; i++) { - bitmap[i] = 1; + cmdLen = 2 * sizeof(int) + OLDNPROCS * sizeof(char); + msg = (char *) malloc(cmdLen); + memcpy(msg, &NEWNPROCS, sizeof(int)); + memcpy(&msg[sizeof(int)], &OLDNPROCS, sizeof(int)); + + int offset = 2 * sizeof(int); + int count = 0; + for (i = 0; i < OLDNPROCS; i++) { + if (i == killedIndex[count]) { + msg[i + offset] = 0; + count++; } + else + msg[i + offset] = 1; } - else { - printf("Sending shrink command.\n"); - for (i = 0; i < OLDNPROCS; i++) { - if (i < NEWNPROCS) - bitmap[i] = 1; - else - bitmap[i] = 0; - } + + for (i = 0; i < OLDNPROCS; i++) { + printf("PE %d: %d\n", i, msg[i + offset]); } - memcpy(&bitmap[OLDNPROCS], &NEWNPROCS, sizeof(int)); - bitmap[OLDNPROCS+sizeof(int)] = '\0'; - CcsSendRequest(&server, "set_bitmap", 0, cmdLen, bitmap); - printf("Waiting for reply...\n" ); - CcsRecvResponse(&server, cmdLen, bitmap , 180); - printf("Reply received.\n"); + //memcpy(&msg[sizeof(bool)], &NEWNPROCS, sizeof(int)); + CcsSendRequest(&server, "set_bitmap", 0, cmdLen, msg); + + //printf("Waiting for reply...\n" ); + //CcsRecvResponse(&server, cmdLen, msg , 180); + //printf("Reply received.\n"); return 0; } diff --git a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C index 7a01e1a793..9e4a7cdd29 100644 --- a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C +++ b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C @@ -23,7 +23,9 @@ public: CProxy_Jacobi array; int num_chares; int iterations; + int iterations_after_restart; int total_iterations; + int lbTime; double stTime; double startTime; @@ -35,6 +37,7 @@ public: // set iteration counter to zero iterations=0; + iterations_after_restart=0; // store the main proxy mainProxy = thisProxy; @@ -55,8 +58,14 @@ 
public: total_iterations = atoi(m->argv[3]); } + if (m->argc > 4) { + lbTime = atoi(m->argv[4]); + } else { + lbTime = 100; + } + // Create new array of worker chares - array = CProxy_Jacobi::ckNew(num_chare_cols, num_chare_rows); + array = CProxy_Jacobi::ckNew(lbTime, num_chare_cols, num_chare_rows); // save the total number of worker chares we have in this simulation num_chares = num_chare_rows*num_chare_cols; @@ -81,6 +90,8 @@ public: // subtle: Chare proxy readonly needs to be updated manually because of // the object pointer inside it. mainProxy = thisProxy; + stTime = CkWallTimer(); + iterations_after_restart = 0; CkPrintf("Resuming Jacobi on %d processors with (%d,%d) elements\n", CkNumPes(), num_chare_rows, num_chare_cols); @@ -96,11 +107,13 @@ void report(int completed_iteration) { if (iterations == total_iterations || CkWallTimer()-stTime>=3000000) { CkPrintf("Program Done! avg_it:%.6f\n",(CkWallTimer()-stTime)/iterations); CkExit(); + //exit(0); } else { - if(iterations%1==0) CkPrintf("starting new iteration; iteration %d time: %.6lf time/itr::%.6f\n", iterations, CkWallTimer()-stTime,(CkWallTimer()-stTime)/iterations); - CkPrintf("Memory Usage: %ld bytes \n", CmiMemoryUsage()); + if(iterations%10==0) CkPrintf("starting new iteration; iteration %d time: %.6lf time/itr::%.6f\n", iterations, CkWallTimer()-stTime,(CkWallTimer()-stTime)/iterations_after_restart); + //CkPrintf("Memory Usage: %ld bytes \n", CmiMemoryUsage()); recieve_count=0; iterations++; + iterations_after_restart++; // Call begin_iteration on all worker chares in array startTime = CkWallTimer(); array.begin_iteration(); @@ -113,6 +126,7 @@ void pup(PUP::er &p){ p|num_chares; p|iterations; p|total_iterations; + p|lbTime; p|stTime; p|startTime; CkPrintf("Main's PUPer. 
\n"); @@ -130,13 +144,15 @@ public: int messages_due; int iteration; int useLB; + int lbTime; array2d temperature; // Constructor, initialize values - Jacobi() + Jacobi(int lbTime_) : messages_due(4) , iteration(0) , useLB(1) + , lbTime(lbTime_) , temperature(block_height + 2, array1d(block_width + 2, 0.0)) { usesAtSync = true; @@ -144,9 +160,11 @@ public: } void pup(PUP::er &p){ + //CkPrintf("[%d] Jacobi's PUPer. \n",CkMyPe()); p|messages_due; p|iteration; p|useLB; + p|lbTime; p|temperature; /* There may be some more variables used in doWork */ } @@ -169,15 +187,15 @@ public: // Perform one iteration of work // The first step is to send the local state to the neighbors void begin_iteration(void) { - if (iteration %50 ==0 && useLB ) { + if (((iteration > 0 && iteration % lbTime == 0) || iteration == 10) && useLB) { useLB = 0; - if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d Calling LBD --------------------- iteration=%d\n",CkMyPe(),iteration); + //if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d Calling LBD --------------------- iteration=%d\n",CkMyPe(),iteration); AtSync(); } else { useLB=1; - if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d started --------------------- iteration=%d\n",CkMyPe(),iteration); - iteration++; + //if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d started --------------------- iteration=%d\n",CkMyPe(),iteration); + iteration++; // Copy left column and right column into temporary arrays array1d left_edge(block_height); array1d right_edge(block_height); @@ -245,17 +263,28 @@ void ResumeFromSync() {begin_iteration();} // and write them to temperature[][] after all of the new values are computed. array2d new_temperature(block_height + 2, array1d(block_width + 2)); - for(int i=1;i Reading pes $num_pes from $pes_file" + pes_arg="+p $num_pes" + fi + fi + + # Pass all script arguments ("$@") to the executable + "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg + + EXIT_CODE=$? 
+ + if [ "$EXIT_CODE" -eq 100 ]; then + is_restart=true + echo "Restart signal (code 100) received. Looping again." + echo "----------------------------------------" + else + echo "Final exit signal (code $EXIT_CODE) received. Exiting loop." + break + fi +done +} + +echo "Control loop finished." \ No newline at end of file diff --git a/src/arch/common/charmrun_hapi b/src/arch/common/charmrun_hapi new file mode 100755 index 0000000000..2064a05b7c --- /dev/null +++ b/src/arch/common/charmrun_hapi @@ -0,0 +1,292 @@ +#!/bin/bash + +is_restart=false +original_args=("$@") +pes_file="/dev/shm/numRestartProcs.txt" +original_nodelist_file="/tmp/hapi_original_nodelist.txt" + +# --- Pre-parse to find the nodelist for daemon startup --- +machinefile="" +for ((i=0; i<${#original_args[@]}; ++i)); do + if [[ "${original_args[i]}" == "++nodelist" ]]; then + machinefile="${original_args[i+1]}" + break + fi +done + +num_nodes=0 +if [[ -n "$machinefile" ]]; then + if [[ ! -f "$machinefile" ]]; then + echo "Charmrun> Error: nodelist file not found: $machinefile" >&2 + exit 1 + fi + num_nodes=$(wc -l < "$machinefile") +else + echo "Charmrun> Warning: ++nodelist not found. Assuming 1 node for HAPI daemon." + num_nodes=1 +fi + +# --- Clean up and start the memory daemon in the background --- +# Read IP addresses and slots from nodelist file (format: ipaddress slots=X) +declare -A node_slots +node_ips=() +if [[ -n "$machinefile" ]]; then + while IFS= read -r line; do + # Extract IP address and slots count + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + node_ips+=("$ip") + # Default to 1 slot if not specified + node_slots["$ip"]=${slots:-1} + fi + done < "$machinefile" +else + # Default to localhost if no nodelist + node_ips=("localhost") + node_slots["localhost"]=1 +fi + +# Save the original nodelist for restart comparison (only on first run) +if [[ ! 
-f "$original_nodelist_file" ]]; then + if [[ -n "$machinefile" ]]; then + cp "$machinefile" "$original_nodelist_file" + echo "Charmrun> Saved original nodelist to $original_nodelist_file" + else + echo "localhost slots=1" > "$original_nodelist_file" + echo "Charmrun> Created default nodelist file at $original_nodelist_file" + fi +fi + +# Function to get nodes from a nodelist file +get_nodes_from_file() { + local file="$1" + local -A nodes_map + local -a nodes_list + + if [[ -f "$file" && -s "$file" ]]; then + while IFS= read -r line; do + # Skip empty lines and comments + [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + nodes_map["$ip"]=${slots:-1} + nodes_list+=("$ip") + fi + done < "$file" + fi + + # Hand the results to the caller's original_nodes / original_node_slots arrays; plain assignment keeps nodelist text away from eval + original_nodes=("${nodes_list[@]}") + for ip in "${nodes_list[@]}"; do + original_node_slots["$ip"]=${nodes_map["$ip"]} + done +} + +# Function to find new nodes by comparing current nodelist with original (optimized with hashmap) +find_new_nodes() { + declare -A original_node_slots + declare -a original_nodes + + # Get original nodes + get_nodes_from_file "$original_nodelist_file" + + declare -A original_nodes_map + for orig_ip in "${original_nodes[@]}"; do + original_nodes_map["$orig_ip"]=1 + done + + local -a new_nodes + local -A new_node_slots + + # Compare current nodes with original nodes using hashmap lookup + for ip in "${node_ips[@]}"; do + # Check if node exists in original nodes hashmap (O(1) lookup) + if [[ -z "${original_nodes_map[$ip]}" ]]; then + echo "Charmrun> New node detected: $ip" + new_nodes+=("$ip") + new_node_slots["$ip"]=${node_slots["$ip"]} + fi + done + + # Return new nodes through the global detected_new_nodes / detected_new_node_slots arrays (no eval of node names) + detected_new_nodes=("${new_nodes[@]}") + for ip in "${new_nodes[@]}"; do +
detected_new_node_slots["$ip"]=${new_node_slots["$ip"]} + done +} + +# Clean up on all nodes via SSH (async) +cleanup_pids=() +echo "Charmrun> Initial cleanup on ${#node_ips[@]} node(s): ${node_ips[*]}" +for ip in "${node_ips[@]}"; do + slots=${node_slots["$ip"]} + fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_pipe_* /tmp/client_pipe_* /tmp/daemon_ready_*" + for ((slot=0; slot Starting memory daemons on all nodes..." +for ip in "${node_ips[@]}"; do + slots=${node_slots["$ip"]} + echo "Charmrun> Starting $slots daemon(s) on node $ip" + for ((slot=0; slot /dev/null 2>&1 &" & + daemon_pids+=($!) + done +done + +# Optional: Wait a brief moment for SSH connections to establish (non-blocking) +# sleep 1 + +# --- Main execution loop --- +while true; do + # Reset and parse arguments for each run + args=() + pes_arg="" + restart_arg="" + + temp_args=("${original_args[@]}") + i=0 + while [ $i -lt ${#temp_args[@]} ]; do + arg="${temp_args[$i]}" + case "$arg" in + +p|++p) + i=$((i+1)) + pes_arg="$arg ${temp_args[$i]}" + ;; + +p[0-9]*) + pes_arg="$arg" + ;; + ++p[0-9]*) + pes_arg="$arg" + ;; + *) + args+=("$arg") + ;; + esac + i=$((i+1)) + done + + # Check the flag. If it's a restart, prepare the extra arguments. + if [ "$is_restart" = true ]; then + restart_arg="+shrinkexpand +restart /dev/shm" + if [ -f "$pes_file" ]; then + num_pes=$(cat "$pes_file") + pes_arg="+p $num_pes" + fi + + echo "Charmrun> Restart detected - checking for new nodes..."
+ + # Re-read current nodelist to check for new nodes + declare -A current_node_slots + current_node_ips=() + if [[ -n "$machinefile" ]]; then + while IFS= read -r line; do + ip=$(echo "$line" | awk '{print $1}') + slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2) + if [[ -n "$ip" ]]; then + current_node_ips+=("$ip") + current_node_slots["$ip"]=${slots:-1} + fi + done < "$machinefile" + else + current_node_ips=("localhost") + current_node_slots["localhost"]=1 + fi + + # Update global variables with current state + node_ips=("${current_node_ips[@]}") + for ip in "${current_node_ips[@]}"; do + node_slots["$ip"]=${current_node_slots["$ip"]} + done + + # Find new nodes + declare -a detected_new_nodes + declare -A detected_new_node_slots + find_new_nodes + + if [[ ${#detected_new_nodes[@]} -gt 0 ]]; then + echo "Charmrun> Found ${#detected_new_nodes[@]} new node(s): ${detected_new_nodes[*]}" + + # Clean up new nodes + echo "Charmrun> Cleaning up new nodes..." + cleanup_pids=() + for ip in "${detected_new_nodes[@]}"; do + echo "Charmrun> Cleaning up node: $ip" + slots=${detected_new_node_slots["$ip"]} + fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_fifo_* /tmp/client_fifo_* /tmp/daemon_ready_*" + for ((slot=0; slot Starting memory daemons on new nodes..." + daemon_pids=() + for ip in "${detected_new_nodes[@]}"; do + slots=${detected_new_node_slots["$ip"]} + echo "Charmrun> Starting $slots daemon(s) on new node $ip" + for ((slot=0; slot /dev/null 2>&1 &" & + daemon_pids+=($!) + done + done + + # Update the original nodelist to include new nodes for future restarts + if [[ -n "$machinefile" ]]; then + cp "$machinefile" "$original_nodelist_file" + echo "Charmrun> Updated original nodelist with new nodes" + fi + + echo "Charmrun> New nodes setup completed" + else + echo "Charmrun> No new nodes detected" + fi + fi + + # Pass all script arguments to the executable + "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg + + EXIT_CODE=$? 
+ + if [ "$EXIT_CODE" -eq 100 ]; then + is_restart=true + echo "Restart signal (code 100) received. Looping again." + echo "----------------------------------------" + else + echo "Final exit signal (code $EXIT_CODE) received. Exiting loop." + # Clean up the background daemon processes on all nodes + for ip in "${node_ips[@]}"; do + ssh "$ip" "pkill -f hapi_memory_daemon" & + done + # Also kill any remaining SSH connection PIDs + for pid in "${daemon_pids[@]}"; do + kill "$pid" 2>/dev/null + done + # Clean up temporary files + rm -f "$original_nodelist_file" + echo "Charmrun> Cleaned up temporary nodelist file" + break + fi +done + +echo "Control loop finished." \ No newline at end of file diff --git a/src/arch/common/conv-mach-common.h b/src/arch/common/conv-mach-common.h index 657366e07c..220d7dd7f4 100644 --- a/src/arch/common/conv-mach-common.h +++ b/src/arch/common/conv-mach-common.h @@ -122,5 +122,5 @@ enum cmiZCMsgType { /* GPU-aware communication is not supported by the machine layer by default */ #ifndef CMK_GPU_COMM -#define CMK_GPU_COMM 0 +#define CMK_GPU_COMM 1 #endif diff --git a/src/arch/common/conv-mach-cuda.sh b/src/arch/common/conv-mach-cuda.sh index aeab75527d..a95dcb6005 100644 --- a/src/arch/common/conv-mach-cuda.sh +++ b/src/arch/common/conv-mach-cuda.sh @@ -1,4 +1,4 @@ BUILD_CUDA=1 -CMK_INCDIR="-I$CUDA_DIR/include $CMK_INCDIR " -CMK_LIBDIR="-L$CUDA_DIR/lib64 $CMK_LIBDIR " -CMK_LIBS="-lcudahybridapi -lcudart -lrt $CMK_LIBS " +CMK_INCDIR="-I$CUDA_DIR/include -I$CUDA_DIR/extras/CUPTI/include $CMK_INCDIR " +CMK_LIBDIR="-L$CUDA_DIR/lib64 -L$CUDA_DIR/extras/CUPTI/lib64 $CMK_LIBDIR " +CMK_LIBS="-lhybridapi -lcudart -lcupti -lrt $CMK_LIBS " diff --git a/src/arch/common/conv-mach-hip.h b/src/arch/common/conv-mach-hip.h new file mode 100644 index 0000000000..f4fa6fc852 --- /dev/null +++ b/src/arch/common/conv-mach-hip.h @@ -0,0 +1,7 @@ +#undef CMK_HIP +#define CMK_HIP 1 + +#undef CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT +#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT 
1 +#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP +#define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 diff --git a/src/arch/common/conv-mach-hip.sh b/src/arch/common/conv-mach-hip.sh new file mode 100644 index 0000000000..de9998de1d --- /dev/null +++ b/src/arch/common/conv-mach-hip.sh @@ -0,0 +1,21 @@ +BUILD_HIP=1 +if [ -n "$ROCM_PATH" ] && [ -d "$ROCM_PATH/include" ]; then + CMK_ROCM_PATH="$ROCM_PATH" +elif [ -d "/opt/rocm/include" ]; then + CMK_ROCM_PATH="/opt/rocm" +elif [ -d "/opt/rocm-default/include" ]; then + CMK_ROCM_PATH="/opt/rocm-default" +elif [ -d "/opt/rocm-6.2.4/include" ]; then + CMK_ROCM_PATH="/opt/rocm-6.2.4" +else + CMK_ROCM_PATH="/opt/rocm" +fi + +CMK_ROCM_LIBDIR="$CMK_ROCM_PATH/lib" +if [ ! -d "$CMK_ROCM_LIBDIR" ] && [ -d "$CMK_ROCM_PATH/lib64" ]; then + CMK_ROCM_LIBDIR="$CMK_ROCM_PATH/lib64" +fi + +CMK_INCDIR="-I$CMK_ROCM_PATH/include $CMK_INCDIR " +CMK_LIBDIR="-L$CMK_ROCM_LIBDIR $CMK_LIBDIR " +CMK_LIBS="-lhybridapi -lamdhip64 $CMK_LIBS " diff --git a/src/arch/cuda/hybridAPI/Makefile b/src/arch/cuda/hybridAPI/Makefile index edcca35943..9228099bc4 100644 --- a/src/arch/cuda/hybridAPI/Makefile +++ b/src/arch/cuda/hybridAPI/Makefile @@ -7,16 +7,16 @@ FLAGS := $(OPTSATBUILDTIME) INC := -I$(CUDA_PATH)/include -I.. 
all: libs - cp libcudahybridapi.a $(CHARMDIR)/lib + cp libhybridapi.a $(CHARMDIR)/lib -libs: libcudahybridapi.a +libs: libhybridapi.a -install: libcudahybridapi.a - cp libcudahybridapi.a $(CHARMDIR)/lib +install: libhybridapi.a + cp libhybridapi.a $(CHARMDIR)/lib -libcudahybridapi.a: hybridapi.o buddy_allocator.o +libhybridapi.a: hybridapi.o buddy_allocator.o ck.o -rm -f $@ - ar q $@ hybridapi.o buddy_allocator.o + ar q $@ hybridapi.o buddy_allocator.o ck.o hybridapi.o: hapi_impl.cpp hapi_impl.h gpumanager.h devicemanager.h buddy_allocator.h hapi.h hapi_nvtx.h $(CHARMC) $(FLAGS) $(INC) -o $@ -c $< diff --git a/src/arch/cuda/hybridAPI/buddy_allocator.cpp b/src/arch/cuda/hybridAPI/buddy_allocator.cpp index 8a940d4a99..add2485e93 100644 --- a/src/arch/cuda/hybridAPI/buddy_allocator.cpp +++ b/src/arch/cuda/hybridAPI/buddy_allocator.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include namespace buddy { void allocator::print_status() { @@ -43,6 +43,18 @@ namespace buddy { return free; } + size_t allocator::get_lb_free_size() { + size_t free = 0; + lb_free_list* tmp = head->next; + while(tmp != tail) { + free += tmp->size; + tmp = tmp->next; + } + + free += lb_size - (size_t)(lb_ptr - lb_base_ptr); + return free; + } + int allocator::get_bucket(size_t size) { return (int)std::ceil(std::log2((double)size)) - 2; } @@ -51,41 +63,105 @@ namespace buddy { return (size_t)(ptr - base_ptr) / size; } - allocator::allocator(size_t size) : min_size(4), base_ptr(NULL) { - if (size == 0) { + allocator::allocator(size_t _comm_lb_size, size_t _comm_size) : min_size(4), base_ptr(NULL) { + if (_comm_size == 0) { fprintf(stderr, "Allocator size has to be larger than 0 bytes\n"); abort(); } // Request GPU memory (closest power of 2) - int total_size_log2 = std::ceil(std::log2((double)size)); - total_size = (size_t)std::pow(2, total_size_log2); - cudaError_t status = cudaMalloc(&base_ptr, total_size); - if (status != cudaSuccess) { + hapiError_t status = hapiMalloc(&base_ptr, 
_comm_lb_size); + this->comm_size = _comm_size; + if (status != hapiSuccess) { fprintf(stderr, "Failed to allocate GPU memory\n"); abort(); } DEBUG_PRINT("Initialized base_ptr %p with %zu bytes\n", (void*)base_ptr, total_size); // Initialize buckets and set up last bucket (for size min_size) + int total_size_log2 = std::ceil(std::log2((double)_comm_size)); bucket_count = total_size_log2 - 1; buckets = new std::list[bucket_count]; - buckets[bucket_count-1].emplace_back(base_ptr, total_size); + buckets[bucket_count-1].emplace_back(base_ptr, _comm_size); + + // Initialize load-balancing state even when the LB region is empty, so malloc()/free() never read unset head/tail/lb_base_ptr + if(_comm_lb_size >= _comm_size) { + lb_free_pool.resize(128); + + head = &(lb_free_pool[0]); + lb_free_pool_taken[0] = true; + + tail = &(lb_free_pool[1]); + lb_free_pool_taken[1] = true; + + head->next = tail; + head->prev = nullptr; + tail->next = nullptr; + tail->prev = head; + + this->lb_size = _comm_lb_size - _comm_size; + this->lb_base_ptr = base_ptr + _comm_size; + this->lb_ptr = lb_base_ptr; + } + + this->total_size = _comm_lb_size; } allocator::~allocator() { // Free GPU memory - cudaError_t status = cudaFree(base_ptr); - if (status != cudaSuccess) { + hapiError_t status = hapiFree(base_ptr); + if (status != hapiSuccess) { fprintf(stderr, "Failed to free GPU memory\n"); abort(); } delete[] buckets; } - void* allocator::malloc(size_t request) { + void* allocator::malloc(size_t request, bool is_comm) { + if(!is_comm) { + // buffers for load balancing + if(request > lb_size) return nullptr; + lb_free_list* tmp = head->next; + + // see if existing free block can service request + while (tmp != tail) { + if(tmp->size == request) { + tmp->prev->next = tmp->next; + tmp->next->prev = tmp->prev; + void* ptr = tmp->ptr; + lb_ptr_size[ptr] = request; + + tmp->next = nullptr; + tmp->prev = nullptr; + tmp->size = 0; + lb_free_pool_taken[tmp->indx] = false; + tmp->ptr = nullptr; + + return ptr; + } else if(tmp->size > request) { + void* ptr = tmp->ptr; + lb_ptr_size[ptr] =
request; + + tmp->size = tmp->size - request; + tmp->ptr = tmp->ptr + request; + + return ptr; + } + + tmp = tmp->next; + } + + // service the request from the tip of lb_ptr + if(lb_ptr + request > lb_base_ptr + lb_size) return nullptr; + void* ptr = lb_ptr; + lb_ptr_size[ptr] = request; + lb_ptr = lb_ptr + request; + return ptr; + } + + DEBUG_PRINT("REQUEST: %ld, TOTAL_SIZE: %ld", request, total_size); // Cannot satisfy request larger than total size - if (request > total_size) return nullptr; + if (request > comm_size) return nullptr; // Has to be larger than minimum allocation size (4 bytes) // Size is rounded up to the nearest power of 2 @@ -105,19 +181,14 @@ namespace buddy { } // Found bucket with free block, take it and start splitting if needed - FreeBlock& block = buckets[bucket].front(); + FreeBlock block = buckets[bucket].front(); uint8_t* ptr = block.ptr; size_t size = block.size; buckets[bucket].pop_front(); while (bucket-- > original_bucket) { - buckets[bucket].emplace_back(ptr, size / 2); - buckets[bucket].emplace_back(ptr + size / 2, size / 2); - - block = buckets[bucket].front(); - ptr = block.ptr; - size = block.size; - buckets[bucket].pop_front(); + size /= 2; + buckets[bucket].emplace_back(ptr + size, size); } // Store allocation info @@ -126,17 +197,110 @@ namespace buddy { std::forward_as_tuple(ptr), std::forward_as_tuple(size, request)); - DEBUG_PRINT("Allocated ptr %p (base_ptr + %zu) with %zu bytes, requested was %zu bytes\n", - (void*)ptr, (size_t)(ptr - base_ptr), size, request); - -#if BUDDY_DEBUG - print_status(); -#endif return ptr; } void allocator::free(void* ptr) { + if((uint8_t*)ptr >= lb_base_ptr) { + size_t alloc_size = lb_ptr_size[ptr]; + if(alloc_size == 0) { + printf("Load balancing allocator got a request to free buffer of size 0\n"); + fflush(stdout); + std::abort(); + } + + // see if the ptr is just before lb_ptr + if((uint8_t*)ptr + alloc_size == lb_ptr) { + lb_ptr -= alloc_size; + lb_ptr_size[ptr] = 0; + return; + } + + 
// see if mergeable with any existing free block + lb_free_list* tmp = head->next; + bool merged = false; + while(tmp != tail) { + if (((uint8_t*)tmp->ptr + tmp->size) == (uint8_t*)ptr) { + tmp->size = tmp->size + alloc_size; + lb_free_list* tmp_next = tmp->next; + + if (tmp_next != tail && (((uint8_t*)tmp->ptr + tmp->size) == (uint8_t*)tmp_next->ptr)) { + tmp->size = tmp->size + tmp_next->size; + + tmp_next->prev->next = tmp_next->next; + tmp_next->next->prev = tmp_next->prev; + + tmp_next->next = nullptr; + tmp_next->prev = nullptr; + tmp_next->size = 0; + lb_free_pool_taken[tmp_next->indx] = false; + tmp_next->ptr = nullptr; + } + + merged = true; + break; + } else if(((uint8_t*)ptr + alloc_size) == (uint8_t*)tmp->ptr) { + tmp->size = tmp->size + alloc_size; + tmp->ptr = ptr; + + merged = true; + break; + } else if (tmp->ptr > ptr) { + break; + } + + tmp = tmp->next; + } + + // see if merging reached the lb_ptr + if(merged) { + if((uint8_t*)tmp->ptr + tmp->size == lb_ptr) { + lb_ptr -= tmp->size; + + tmp->prev->next = tmp->next; + tmp->next->prev = tmp->prev; + + tmp->next = nullptr; + tmp->prev = nullptr; + tmp->size = 0; + lb_free_pool_taken[tmp->indx] = false; + tmp->ptr = nullptr; + + } + + lb_ptr_size[ptr] = 0; + return; + } + + // add a free node just before tmp + size_t free_space_indx = 2; + while(lb_free_pool_taken[free_space_indx] && free_space_indx < lb_free_pool.size()) + free_space_indx++; + lb_free_list* free_node; + if(free_space_indx == lb_free_pool.size()) { + // TODO : Implement this logic or just increase the default size of + // lb_free_pool + printf("Load balancing allocator does not have any more free nodes\n"); + fflush(stdout); + std::abort(); + } else { + free_node = &(lb_free_pool[free_space_indx]); + lb_free_pool_taken[free_space_indx] = true; + } + + free_node->indx = free_space_indx; + free_node->ptr = ptr; + free_node->size = lb_ptr_size[ptr]; + free_node->prev = tmp->prev; + free_node->next = tmp; + + tmp->prev->next = free_node; + 
tmp->prev = free_node; + + lb_ptr_size[ptr] = 0; + return; + } // Find pointer in allocation map auto alloc_it = alloc_map.find((uint8_t*)ptr); if (alloc_it == alloc_map.end()) { @@ -167,29 +331,21 @@ namespace buddy { uint8_t* buddy_ptr = block_index_even ? (merge_ptr + merge_size) : (merge_ptr - merge_size); // If buddy is also free, merge + bool merged = false; for (std::list::iterator it = buckets[i].begin(); it != buckets[i].end(); it++) { - const auto& block = *it; - if (block.ptr == buddy_ptr) { + if (it->ptr == buddy_ptr) { buckets[i+1].emplace_back(block_index_even ? merge_ptr : buddy_ptr, 2 * merge_size); - buckets[i].erase(it); // Iterator is invalid after this erase + buckets[i].erase(it); buckets[i].pop_back(); + merged = true; break; } - else { - // Did not find free buddy block, stop merging - goto merge_done; - } } + if (!merged) break; + if (!block_index_even) merge_ptr = buddy_ptr; merge_size *= 2; } - -merge_done: - DEBUG_PRINT("Freed ptr %p with %zu bytes, requested was %zu bytes\n", ptr, size, requested); - -#if BUDDY_DEBUG - print_status(); -#endif } } diff --git a/src/arch/cuda/hybridAPI/buddy_allocator.h b/src/arch/cuda/hybridAPI/buddy_allocator.h index 5c7fa17c36..f78469ecef 100644 --- a/src/arch/cuda/hybridAPI/buddy_allocator.h +++ b/src/arch/cuda/hybridAPI/buddy_allocator.h @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include // A cached memory allocator with GPU memory as the backing store. 
// A fixed size allocation is initially made to the backing store, @@ -47,13 +50,30 @@ namespace buddy { AllocBlock(size_t size_, size_t requested_) : size(size_), requested(requested_) {} }; + struct lb_free_list { + lb_free_list* next; + lb_free_list* prev; + void* ptr; + size_t size; + size_t indx; + }; + lb_free_list* head; + lb_free_list* tail; + std::vector lb_free_pool; + std::unordered_map lb_free_pool_taken; + std::unordered_map lb_ptr_size; + uint8_t* lb_ptr; + // Allocation size limits + size_t comm_size; + size_t lb_size; size_t total_size; const size_t min_size; // Base pointer of the initial allocation uint8_t* base_ptr; - + uint8_t* lb_base_ptr; + // Buckets each with a free list std::list* buckets; int bucket_count; @@ -64,13 +84,14 @@ namespace buddy { // Utility functions void print_status(); size_t get_free_size(); + size_t get_lb_free_size(); int get_bucket(size_t size); int get_block_index(uint8_t* ptr, size_t size); // Allocation functions - allocator(size_t size); + allocator(size_t size, size_t); ~allocator(); - void* malloc(size_t request); + void* malloc(size_t request, bool is_comm); void free(void* ptr); }; } diff --git a/src/arch/cuda/hybridAPI/devicemanager.h b/src/arch/cuda/hybridAPI/devicemanager.h index 092a99d2e8..2351048aa7 100644 --- a/src/arch/cuda/hybridAPI/devicemanager.h +++ b/src/arch/cuda/hybridAPI/devicemanager.h @@ -1,7 +1,7 @@ #ifndef __DEVICEMANAGER_H_ #define __DEVICEMANAGER_H_ -#include +#include #include "converse.h" #include "buddy_allocator.h" @@ -37,13 +37,13 @@ struct DeviceManager { return comm_buffer; } - void create_comm_buffer(size_t size) { + void create_comm_buffer(size_t total_size, size_t comm_size) { if (comm_buffer == nullptr) - comm_buffer = new buddy::allocator(size); + comm_buffer = new buddy::allocator(total_size, comm_size); } - void* alloc_comm_buffer(size_t size) { - return comm_buffer->malloc(size); + void* alloc_comm_buffer(size_t size, bool is_comm = true) { + return comm_buffer->malloc(size, 
is_comm); } void free_comm_buffer(size_t offset) { @@ -54,6 +54,10 @@ struct DeviceManager { return comm_buffer->get_free_size(); } + size_t get_lb_buffer_free_size() { + return comm_buffer->get_lb_free_size(); + } + void destroy_comm_buffer() { if (comm_buffer) { delete comm_buffer; diff --git a/src/arch/cuda/hybridAPI/gpumanager.h b/src/arch/cuda/hybridAPI/gpumanager.h index eab0064d94..6a96d516b3 100644 --- a/src/arch/cuda/hybridAPI/gpumanager.h +++ b/src/arch/cuda/hybridAPI/gpumanager.h @@ -1,16 +1,19 @@ #ifndef __GPUMANAGER_H_ #define __GPUMANAGER_H_ -#include #include #include #include +#include "hapi_portable.h" #include "converse.h" #include "hapi.h" #include "hapi_impl.h" #include "devicemanager.h" +#include +#include + // Initial size of the user-addressed portion of host/device buffer arrays; // the system-addressed portion of host/device buffer arrays (used when there // is no need to share buffers between work requests) will be equivalant in size. @@ -24,19 +27,26 @@ // CUDA IPC Event related struct, stored in host-wide shared memory. // One object is used for each interaction/message between sender and receiver. // The number of these objects per device will be equal to the CUDA IPC event pool size. -struct cuda_ipc_event_shared { - cudaIpcEventHandle_t src_event_handle; - cudaIpcEventHandle_t dst_event_handle; +struct hapi_ipc_event_shared { + hapiIpcEventHandle_t src_event_handle; + hapiIpcEventHandle_t dst_event_handle; bool src_flag; // Unused for now bool dst_flag; pthread_mutex_t lock; }; +#if CMK_LBDB_ON +struct CuptiBufferItem { + uint8_t* buffer; + size_t validSize; +}; +#endif + // Per-device struct containing data for CUDA IPC. // Use SMP lock in DeviceManager if needed. 
-struct cuda_ipc_device_info { - std::vector src_event_pool; - std::vector dst_event_pool; +struct hapi_ipc_device_info { + std::vector src_event_pool; + std::vector dst_event_pool; // Flag per event pair (0: free, 1: used) std::vector event_pool_flags; // Offset in device comm buffer (per event) @@ -87,7 +97,7 @@ struct GPUManager { // specifies an invalid buffer ID. int next_buffer_; - cudaStream_t *streams_; + hapiStream_t *streams_; int n_streams_; int last_stream_id_; @@ -121,9 +131,6 @@ struct GPUManager { CmiNodeLock device_mapping_lock; #endif -#ifdef HAPI_CUDA_CALLBACK -#endif - int device_count; // GPU devices usable by this process (could be less than the number of visible devices) int device_count_on_physical_node; int pes_per_device; @@ -134,8 +141,12 @@ struct GPUManager { // Device communication buffer size_t comm_buffer_size; + // Device load-balancing buffer + size_t lb_buffer_size; + // POSIX shared memory for sharing CUDA IPC handles between processes on the same host bool use_shm; + bool test_field; void* shm_ptr; std::string shm_name; int shm_file; @@ -144,12 +155,23 @@ struct GPUManager { void* shm_my_ptr; // CUDA IPC event pool - int cuda_ipc_event_pool_size_pe; - int cuda_ipc_event_pool_size_total; + int hapi_ipc_event_pool_size_pe; + int hapi_ipc_event_pool_size_total; // CUDA IPC handles opened for processes on the same node // Vector size is equal to the number of devices on the physical node - std::vector cuda_ipc_device_infos; + std::vector hapi_ipc_device_infos; + + //CUPTI load balancing (same #if guard as the CuptiBufferItem definition) +#if CMK_LBDB_ON + std::unordered_map cupti_correlation_db_;//correlationID -> ObjectID + + std::unordered_map cupti_obj_gpu_times_;//objectID -> accumulated GPU time in ns + + std::queue cupti_buffer_queue_; + + bool cupti_initialized_; +#endif void init() { next_buffer_ = NUM_BUFFERS; @@ -189,8 +211,8 @@ struct GPUManager { shm_my_ptr = NULL; // Number of CUDA IPC events per PE - cuda_ipc_event_pool_size_pe = -1; - cuda_ipc_event_pool_size_total
= -1; + hapi_ipc_event_pool_size_pe = -1; + hapi_ipc_event_pool_size_total = -1; // Allocate host/device buffers array (both user and system-addressed) host_buffers_ = new void*[NUM_BUFFERS*2]; @@ -245,7 +267,7 @@ struct GPUManager { // Destroy streams if (streams_) { for (int i = 0; i < n_streams_; i++) { - hapiCheck(cudaStreamDestroy(streams_[i])); + hapiCheck(hapiStreamDestroy(streams_[i])); } } @@ -277,9 +299,9 @@ struct GPUManager { // Returns the number of created streams. int createStreams() { int device; - cudaDeviceProp device_prop; - hapiCheck(cudaGetDevice(&device)); - hapiCheck(cudaGetDeviceProperties(&device_prop, device)); + hapiDeviceProp device_prop; + hapiCheck(hapiGetDevice(&device)); + hapiCheck(hapiGetDeviceProperties(&device_prop, device)); int new_n_streams = 0; @@ -311,7 +333,7 @@ struct GPUManager { // Allocate total physical streams between GPU managers sharing a device... // i.e. PEs / num devices int device_count; - hapiCheck(cudaGetDeviceCount(&device_count)); + hapiCheck(hapiGetDeviceCount(&device_count)); int pes_per_device = CmiNumPesOnPhysicalNode(0) / device_count; pes_per_device = pes_per_device > 0 ? 
pes_per_device : 1; new_n_streams = (new_n_streams + pes_per_device - 1) / pes_per_device; @@ -327,9 +349,9 @@ struct GPUManager { return n_streams_; } - cudaStream_t* old_streams = streams_; + hapiStream_t* old_streams = streams_; - streams_ = new cudaStream_t[new_n_streams]; + streams_ = new hapiStream_t[new_n_streams]; int i = 0; // Copy old streams @@ -340,7 +362,7 @@ struct GPUManager { // Create new streams for (; i < new_n_streams; i++) { - hapiCheck(cudaStreamCreate(&streams_[i])); + hapiCheck(hapiStreamCreate(&streams_[i])); } // Update @@ -350,7 +372,7 @@ struct GPUManager { return n_streams_; } - cudaStream_t getNextStream() { + hapiStream_t getNextStream() { if (streams_ == NULL) return NULL; @@ -358,7 +380,7 @@ struct GPUManager { return streams_[last_stream_id_]; } - cudaStream_t getStream(int i) { + hapiStream_t getStream(int i) { if (streams_ == NULL) return NULL; @@ -418,7 +440,7 @@ struct GPUManager { if (device_buffers_[index] == NULL) { // allocate device memory - hapiCheck(cudaMalloc((void **)&device_buffers_[index], size)); + hapiCheck(hapiMalloc((void **)&device_buffers_[index], size)); #ifdef HAPI_DEBUG CmiPrintf("[HAPI] allocated buffer %d at %p, time: %.2f, size: %zu\n", @@ -438,8 +460,8 @@ struct GPUManager { host_buffers_[index] = bi.host_buffer; if (bi.transfer_to_device) { - hapiCheck(cudaMemcpyAsync(device_buffers_[index], host_buffers_[index], size, - cudaMemcpyHostToDevice, wr->stream)); + hapiCheck(hapiMemcpyAsync(device_buffers_[index], host_buffers_[index], size, + hapiMemcpyHostToDevice, wr->stream)); #ifdef HAPI_DEBUG CmiPrintf("[HAPI] transferring buffer %d from host to device, time: %.2f, " @@ -457,8 +479,8 @@ struct GPUManager { size_t size = bi.size; if (bi.transfer_to_host) { - hapiCheck(cudaMemcpyAsync(host_buffers_[index], device_buffers_[index], size, - cudaMemcpyDeviceToHost, wr->stream)); + hapiCheck(hapiMemcpyAsync(host_buffers_[index], device_buffers_[index], size, + hapiMemcpyDeviceToHost, wr->stream)); #ifdef 
HAPI_DEBUG CmiPrintf("[HAPI] transferring buffer %d from device to host, time %.2f, " @@ -475,7 +497,7 @@ struct GPUManager { int index = bi.id; if (bi.need_free) { - hapiCheck(cudaFree(device_buffers_[index])); + hapiCheck(hapiFree(device_buffers_[index])); device_buffers_[index] = NULL; #ifdef HAPI_DEBUG diff --git a/src/arch/cuda/hybridAPI/hapi.h b/src/arch/cuda/hybridAPI/hapi.h index 39a1f9c4a2..a2689ea664 100644 --- a/src/arch/cuda/hybridAPI/hapi.h +++ b/src/arch/cuda/hybridAPI/hapi.h @@ -1,6 +1,6 @@ #ifndef __HAPI_H_ #define __HAPI_H_ -#include +#include "hapi_portable.h" /* See hapi_functions.h for the majority of function declarations provided * by the Hybrid API. */ @@ -74,10 +74,10 @@ typedef struct hapiWorkRequest { #endif // Pointer to host-side function that actually invokes the kernel. - // The user implements this function, using the given CUDA stream and + // The user implements this function, using the given hapi stream and // device buffers (which are indexed by hapiBufferInfo->id). // Could be set to NULL if no kernel needs to be executed. 
- void (*runKernel)(struct hapiWorkRequest* wr, cudaStream_t kernel_stream, + void (*runKernel)(struct hapiWorkRequest* wr, hapiStream_t kernel_stream, void** device_buffers); // flag used for control by the system @@ -89,8 +89,8 @@ typedef struct hapiWorkRequest { // flags determining whether memory should be freed on destruction bool free_user_data; - // CUDA stream index provided by the user or assigned by GPUManager - cudaStream_t stream; + // hapi stream index provided by the user or assigned by GPUManager + hapiStream_t stream; #ifdef HAPI_INSTRUMENT_WRS double phase_start_time; @@ -151,15 +151,15 @@ typedef struct hapiWorkRequest { } #endif - void setRunKernel(void (*_runKernel)(struct hapiWorkRequest*, cudaStream_t, void**)) { + void setRunKernel(void (*_runKernel)(struct hapiWorkRequest*, hapiStream_t, void**)) { runKernel = _runKernel; } - void setStream(cudaStream_t _stream) { + void setStream(hapiStream_t _stream) { stream = _stream; } - cudaStream_t getStream() { + hapiStream_t getStream() { return stream; } @@ -189,7 +189,7 @@ typedef struct hapiWorkRequest hapiWorkRequest; #endif /* defined __cplusplus */ -// Provides support for detecting errors with CUDA API calls. +// Provides support for detecting errors with hapi API calls. #ifndef HAPI_CHECK_OFF #define hapiCheck(code) hapiErrorDie(code, #code, __FILE__, __LINE__) #else @@ -228,22 +228,54 @@ extern "C" { #ifdef __cplusplus // Provide a C++-only stub for this function's default parameter. 
-void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg); -static inline void hapiAddCallback(cudaStream_t stream, const CkCallback& cb) { +void hapiAddCallback(hapiStream_t stream, const CkCallback& cb, void* cb_msg); +static inline void hapiAddCallback(hapiStream_t stream, const CkCallback& cb) { hapiAddCallback(stream, cb, nullptr); } -static inline void hapiAddCallback(cudaStream_t stream, void* cb) { +static inline void hapiAddCallback(hapiStream_t stream, void* cb) { hapiAddCallback(stream, cb, nullptr); } // Overloaded C++ wrappers for selecting whether to pool or not using a bool. -static inline cudaError_t hapiMallocHost(void** ptr, size_t size, bool pool) { +static inline hapiError_t hapiMallocHost_Pool(void** ptr, size_t size, bool pool) { return pool ? hapiPoolMalloc(ptr, size) : hapiMallocHost(ptr, size); } -static inline cudaError_t hapiFreeHost(void* ptr, bool pool) { +static inline hapiError_t hapiFreeHost_Pool(void* ptr, bool pool) { return pool ? hapiPoolFree(ptr) : hapiFreeHost(ptr); } +void hapiRecordTime(hapiStream_t stream, hapiEvent_t start); +#ifdef CMK_LBDB_ON +void hapiCuptiInit(); +void hapiCuptiFinalize(); +uint64_t hapiCuptiPushObjCorrelation(); +void hapiCuptiPopObjCorrelation(); +void hapiProcessCuptiBuffers(); +void hapiClearCuptiData(); +#endif + +#ifdef CMK_LBDB_ON +#define HAPI_LAUNCH_KERNEL_WRAPPER(call, stream)\ + hapiEvent_t start;\ + hapiEventCreate(&start);\ + hapiEventRecord(start, stream);\ + call;\ + hapiRecordTime(stream, start); +#else +#define HAPI_LAUNCH_KERNEL_WRAPPER(call, stream)\ + call; +#endif + +#ifdef CMK_LBDB_ON +#define CUPTI_LAUNCH_WRAPPER(call)\ + hapiCuptiPushObjCorrelation();\ + call;\ + hapiCuptiPopObjCorrelation(); +#else +#define CUPTI_LAUNCH_WRAPPER(call)\ + call; +#endif + #endif /* defined __cplusplus */ #endif /* !defined AMPI_INTERNAL_SKIP_FUNCTIONS */ diff --git a/src/arch/cuda/hybridAPI/hapi_functions.h b/src/arch/cuda/hybridAPI/hapi_functions.h index ee2bcd2120..8d3fd917f2 
100644 --- a/src/arch/cuda/hybridAPI/hapi_functions.h +++ b/src/arch/cuda/hybridAPI/hapi_functions.h @@ -30,26 +30,33 @@ AMPI_CUSTOM_FUNC(int, hapiCreateStreams, void) // Get a CUDA stream that was created by the runtime. Current scheme is to // hand out streams in a round-robin fashion. -AMPI_CUSTOM_FUNC(cudaStream_t, hapiGetStream, void) +AMPI_CUSTOM_FUNC(hapiStream_t, hapiGetStream, void) // Add a Charm++ callback function to be invoked after the previous operation // in the stream completes. This call should be placed after data transfers or // a kernel invocation. -AMPI_CUSTOM_FUNC(void, hapiAddCallback, cudaStream_t, void*, void*) +AMPI_CUSTOM_FUNC(void, hapiAddCallback, hapiStream_t, void*, void*) // Thin wrappers for memory related CUDA API calls. -AMPI_CUSTOM_FUNC(cudaError_t, hapiMalloc, void**, size_t) -AMPI_CUSTOM_FUNC(cudaError_t, hapiFree, void*) -AMPI_CUSTOM_FUNC(cudaError_t, hapiMallocHost, void**, size_t) -AMPI_CUSTOM_FUNC(cudaError_t, hapiFreeHost, void*) -AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpyAsync, void*, const void*, size_t, enum cudaMemcpyKind, cudaStream_t) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiMalloc, void**, size_t) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiFree, void*) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiMallocHost, void**, size_t) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiFreeHost, void*) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpyAsync, void*, const void*, size_t, enum cudaMemcpyKind, cudaStream_t) +// AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpy2DAsync, void*, size_t, const void*, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t) + +// Kernel launch wrapper +AMPI_CUSTOM_FUNC(hapiError_t, hapiLaunchKernel, const void*, dim3, dim3, void**, size_t, hapiStream_t) // Explicit memory allocations using pinned memory pool. 
-AMPI_CUSTOM_FUNC(cudaError_t, hapiPoolMalloc, void**, size_t) -AMPI_CUSTOM_FUNC(cudaError_t, hapiPoolFree, void*) +AMPI_CUSTOM_FUNC(hapiError_t, hapiPoolMalloc, void**, size_t) +AMPI_CUSTOM_FUNC(hapiError_t, hapiPoolFree, void*) // Provides support for detecting errors with CUDA API calls. -AMPI_CUSTOM_FUNC(void, hapiErrorDie, cudaError_t, const char*, const char*, int) +AMPI_CUSTOM_FUNC(void, hapiErrorDie, hapiError_t, const char*, const char*, int) + +// Returns the GPU device index this PE is mapped to (set during hapiMapping). +AMPI_CUSTOM_FUNC(uint64_t, hapiMyDevice, void) #ifdef HAPI_INSTRUMENT_WRS AMPI_CUSTOM_FUNC(void, hapiInitInstrument, int n_chares, char n_types) diff --git a/src/arch/cuda/hybridAPI/hapi_impl.cpp b/src/arch/cuda/hybridAPI/hapi_impl.cpp index b5b3bb5fb6..42e3ad4f6e 100644 --- a/src/arch/cuda/hybridAPI/hapi_impl.cpp +++ b/src/arch/cuda/hybridAPI/hapi_impl.cpp @@ -9,12 +9,16 @@ #include #include #include +#include -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#include +#define hapi_API_PER_THREAD_DEFAULT_STREAM +#include "hapi_portable.h" #include "converse.h" -#include "conv-mach-opt.h" /* for CMK_CUDA */ +#include "conv-mach-opt.h" /* for CMK_hapi */ +#include "ckrescale.h" +#include "charm++.h" + #include "hapi.h" #include "hapi_impl.h" #include "gpumanager.h" @@ -22,6 +26,38 @@ #include "hapi_nvtx.h" #endif +#if CMK_LBDB_ON +#if CMK_CUDA +#include +#endif +#include "LBManager.h" + +#if CMK_CUDA +#define CUPTI_SAFE_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char *errstr; \ + cuptiGetResultString(_status, &errstr); \ + CmiPrintf("HAPI CUPTI error: %s at %s:%d\n", errstr, __FILE__, __LINE__); \ + } \ + } while (0) +#endif +#endif + +#define SERVER_FIFO_TEMPLATE "/tmp/server_pipe_%ld" +#define CLIENT_FIFO_TEMPLATE "/tmp/client_pipe_%ld" +#define BUFFER_SIZE 256 +#define STREAM_BUF_SIZE 1024 + +#if defined HAPI_TRACE || defined HAPI_INSTRUMENT_WRS +// extern "C" double CmiWallTimer(); 
+#endif + +extern int Cmi_isOldProcess; + +extern int CmiSetCPUAffinityLogical(int core); + static void createPool(int *nbuffers, int n_slots, std::vector &pools); static void releasePool(std::vector &pools); @@ -36,19 +72,27 @@ struct hapiCallbackMessage { #ifndef HAPI_CUDA_CALLBACK typedef struct hapiEvent { - cudaEvent_t event; + hapiEvent_t event; CkCallback cb; void* cb_msg; hapiWorkRequest* wr; // if this is not NULL, buffers and request itself are deallocated + CkMigratable* obj; // pointer to the object whose load we want to set + hapiEvent_t start_ev; // event to record the start time - hapiEvent(cudaEvent_t event_, const CkCallback& cb_, void* cb_msg_, hapiWorkRequest* wr_ = NULL) - : event(event_), cb(cb_), cb_msg(cb_msg_), wr(wr_) {} + hapiEvent(hapiEvent_t event_, const CkCallback& cb_, void* cb_msg_, hapiWorkRequest* wr_ = NULL, CkMigratable* obj_ = NULL, hapiEvent_t start_ev_ = NULL) + : event(event_), cb(cb_), cb_msg(cb_msg_), wr(wr_), obj(obj_), start_ev(start_ev_) {} } hapiEvent; CpvDeclare(std::queue, hapi_event_queue); +CpvDeclare(std::queue, hapi_event_pool); #endif // HAPI_CUDA_CALLBACK CpvDeclare(int, n_hapi_events); +int firstRankForDevice = 0; // First rank for each device, used for mapping + +// Managing memory state in server +int hapiAllocId = 0; // Global allocation ID for HAPI + // Used to invoke user's Charm++ callback function void (*hapiInvokeCallback)(void*, void*) = NULL; @@ -63,8 +107,11 @@ void (*hapiQdProcess)(int) = NULL; CsvDeclare(GPUManager, gpu_manager); CpvDeclare(int, my_device); // GPU device that this thread is mapped to +CpvDeclare(int, my_device_id); // index to the deviceManager that stores info about the device CpvDeclare(bool, device_rep); // Is this PE a device representative thread? 
(1 per device) +void hapiSendMemoryRequest(char* msg, int size); + // Returns the local rank of the logical node (process) that the given PE belongs to static inline int CmiNodeRankLocal(int pe) { // Logical node index % Number of logical nodes per physical node @@ -77,14 +124,14 @@ static inline int CmiMyNodeRankLocal() { } // HAPI internal function declarations -static void hapiInitCsv(); +static void hapiInitCsv(char** argv); static void hapiInitCpv(); static void hapiExitCsv(); static void hapiMapping(char** argv); static void hapiRegisterCallbacks(); -// CUDA IPC related functions +// hapi IPC related functions static void shmInit(); static void shmSetup(); static void shmCreate(); @@ -95,6 +142,53 @@ static void shmCleanup(); static void ipcHandleCreate(); static void ipcHandleOpen(); +#ifdef CMK_LBDB_ON + +#if CMK_CUDA +static void CUPTIAPI cuptiBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) { + *size = 5*1024 * 1024; // 5MB per buffer + *buffer = (uint8_t *)malloc(*size); + *maxNumRecords = 0; +} + +//TODO: handle SMP mode +static void CUPTIAPI cuptiBufferCompleted(CUcontext ctx, uint32_t streamId, + uint8_t *buffer, size_t size, size_t validSize) { + GPUManager& gm = CsvAccess(gpu_manager); + + gm.cupti_buffer_queue_.push({buffer, validSize}); +} +#endif + +// Initialize CUPTI activity tracing — called once per process +void hapiCuptiInit() { +#if CMK_CUDA + CmiPrintf("HAPI: Initializing CUPTI...\n"); + hapiDeviceSynchronize(); + GPUManager& gm = CsvAccess(gpu_manager); + if (gm.cupti_initialized_) return; + + CUPTI_SAFE_CALL(cuptiActivityRegisterCallbacks(cuptiBufferRequested, cuptiBufferCompleted)); + CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); + CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); + CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + + gm.cupti_initialized_ = true; +#endif +} + +void hapiCuptiFinalize() { + CmiPrintf("HAPI: Finalizing 
CUPTI...\n"); + hapiDeviceSynchronize(); // Ensure all activity records are flushed + GPUManager& gm = CsvAccess(gpu_manager); + if(gm.cupti_initialized_== false) return; + gm.cupti_initialized_ = false; +#if CMK_CUDA + CUPTI_SAFE_CALL(cuptiFinalize()); +#endif +} +#endif + #ifndef HAPI_CUDA_CALLBACK #if CSD_NO_SCHEDLOOP # error please disable CSD_NO_SCHEDLOOP to use HAPI @@ -105,7 +199,7 @@ static void ipcHandleOpen(); void hapiInit(char** argv) { if (!CmiInCommThread()) { if (CmiMyRank() == 0) { - hapiInitCsv(); // Initialize per-process variables (GPUManager) + hapiInitCsv(argv); // Initialize per-process variables (GPUManager) } hapiInitCpv(); // Initialize per-PE variables @@ -113,6 +207,13 @@ void hapiInit(char** argv) { hapiMapping(argv); // Perform PE-device mapping +#if CMK_SHRINK_EXPAND + hapiStartMemoryDaemon(argv); +#else + int& cpv_my_device = CpvAccess(my_device); + hapiCheck(hapiSetDevice(cpv_my_device)); +#endif + #ifndef HAPI_CUDA_CALLBACK // Register polling function to be invoked at every scheduler loop CcdCallOnConditionKeep(CcdSCHEDLOOP, (CcdCondFn)hapiPollEvents, NULL); @@ -123,7 +224,7 @@ void hapiInit(char** argv) { if (CmiInCommThread()) { // FIXME: Comm. 
thread sets its device to be the same as worker thread 0 - cudaSetDevice(CsvAccess(gpu_manager).comm_thread_device); + hapiSetDevice(CsvAccess(gpu_manager).comm_thread_device); } shmInit(); @@ -131,10 +232,143 @@ void hapiInit(char** argv) { hapiRegisterCallbacks(); // Register callback functions } + +void hapiStartMemoryDaemon(char** argv) +{ +#if CMK_SHRINK_EXPAND + // start client FIFO + long pid = getpid(); + char client_fifo_path[BUFFER_SIZE]; + sprintf(client_fifo_path, CLIENT_FIFO_TEMPLATE, pid); + std::remove(client_fifo_path); + mkfifo(client_fifo_path, 0666); + + int& cpv_my_device = CpvAccess(my_device); + CkPrintf("Device = %i\n", cpv_my_device); + hapiCheck(hapiSetDevice(cpv_my_device)); + + if (CmiPhysicalRank(CmiMyPe()) != firstRankForDevice) + { + CmiBarrier(); + return; + } + + char server_fifo_path[BUFFER_SIZE]; + sprintf(server_fifo_path, SERVER_FIFO_TEMPLATE, cpv_my_device); + + // Create a ready signal FIFO for synchronization + if (!CmiGetArgFlagDesc(argv,"+shrinkexpand","Restarting of already running prcoess")) { + char ready_fifo_path[BUFFER_SIZE]; + sprintf(ready_fifo_path, "/tmp/daemon_ready_%d", cpv_my_device); + + CmiPrintf("Parent: Waiting for daemon to be ready...\n"); + + int ready_fd = open(ready_fifo_path, O_RDONLY); + if (ready_fd == -1) { + perror("Parent: open ready FIFO"); + CmiAbort("Failed to open ready FIFO"); + } + + char ready_signal; + read(ready_fd, &ready_signal, 1); + close(ready_fd); + unlink(ready_fifo_path); // Clean up + + CmiPrintf("Parent: Daemon is ready!\n"); + } + + CmiBarrier(); + return; +#endif +} + +int hapiCheckpoint(void* devPtr, int size) { + pid_t pid = getpid(); + + char client_fifo_path[BUFFER_SIZE]; + sprintf(client_fifo_path, CLIENT_FIFO_TEMPLATE, pid); + + hapiIpcMemHandle_t ipc_handle; + hapiCheck(hapiIpcGetMemHandle(&ipc_handle, devPtr)); + + char msg_buf[BUFFER_SIZE]; + int offset = sprintf(msg_buf, "CKPT:%ld:%d:%d:", pid, CkMyPe(), size); + memcpy(msg_buf + offset, &ipc_handle, 
sizeof(hapiIpcMemHandle_t)); + int total_size = offset + sizeof(hapiIpcMemHandle_t); + + hapiSendMemoryRequest(msg_buf, total_size); + + int client_fd = open(client_fifo_path, O_RDONLY); + int alloc_id; + read(client_fd, &alloc_id, sizeof(int)); + close(client_fd); + + return alloc_id; +} + +void hapiRestore(void* devPtr, int size, int alloc_id) { + pid_t pid = getpid(); + + char client_fifo_path[BUFFER_SIZE]; + sprintf(client_fifo_path, CLIENT_FIFO_TEMPLATE, pid); + + char msg_buf[BUFFER_SIZE]; + sprintf(msg_buf, "GET:%ld:%d", pid, alloc_id); + + hapiSendMemoryRequest(msg_buf, strlen(msg_buf) + 1); + + int client_fd = open(client_fifo_path, O_RDONLY); + hapiIpcMemHandle_t ipc_handle; + read(client_fd, &ipc_handle, sizeof(hapiIpcMemHandle_t)); + close(client_fd); + + void* srcPtr; + hapiCheck(hapiIpcOpenMemHandle(&srcPtr, ipc_handle, hapiIpcMemLazyEnablePeerAccess)); + hapiCheck(hapiMemcpy(devPtr, srcPtr, size, hapiMemcpyDeviceToDevice)); + hapiCheck(hapiIpcCloseMemHandle(srcPtr)); + + char free_msg[BUFFER_SIZE]; + sprintf(free_msg, "FREE:%ld:%d", pid, alloc_id); + hapiSendMemoryRequest(free_msg, strlen(free_msg) + 1); + + client_fd = open(client_fifo_path, O_RDONLY); + char status; + read(client_fd, &status, sizeof(char)); + close(client_fd); +} + void hapiExit() { // Ensure all PEs have finished GPU work + CmiPrintf("Exit called on PE %d\n", CmiMyPe()); CmiNodeBarrier(); +#if CMK_SHRINK_EXPAND + char client_fifo_path[BUFFER_SIZE]; + sprintf(client_fifo_path, CLIENT_FIFO_TEMPLATE, getpid()); + + if (!get_shrinkexpand_exit() && CmiPhysicalRank(CmiMyPe()) == firstRankForDevice) + { + char msg_buf[BUFFER_SIZE]; + sprintf(msg_buf, "KILL:%ld:0", getpid()); + hapiSendMemoryRequest(msg_buf, strlen(msg_buf) + 1); + + int client_fd = open(client_fifo_path, O_RDONLY); + char status; + read(client_fd, &status, sizeof(char)); + close(client_fd); + } + + if (!get_shrinkexpand_exit()) + { + // Attempt to delete the file + if (std::remove(client_fifo_path) == 0) { + 
CmiPrintf("File '%s' deleted successfully.\n", client_fifo_path); + } else { + CmiPrintf("Error deleting file '%s': %s\n", client_fifo_path, strerror(errno)); + } + } +#endif + if (CmiMyRank() == 0) { shmCleanup(); @@ -143,23 +377,130 @@ void hapiExit() { } // Initialize per-process variables -static void hapiInitCsv() { +static void hapiInitCsv(char** argv) { // Create and initialize GPU Manager object CsvInitialize(GPUManager, gpu_manager); CsvAccess(gpu_manager).init(); + #if CMK_LBDB_ON + CmiPrintf("HAPI: seeing _lb_args.statsOn() = %d\n", _lb_args.statsOn()); + if (LBHasBalancersRegistered() && _lb_args.statsOn()) + hapiCuptiInit(); + #endif } + +#ifdef CMK_LBDB_ON + +void hapiProcessCuptiBuffers() { + #if CMK_CUDA + GPUManager& gm = CsvAccess(gpu_manager); + + uint32_t kernel_count = 0; + uint32_t corr_count = 0; + while (true) { + uint32_t record_count = 0; + CuptiBufferItem item; + + // Pop one buffer from the queue + if (gm.cupti_buffer_queue_.empty()) { + break; + } + item = gm.cupti_buffer_queue_.front(); + gm.cupti_buffer_queue_.pop(); + + // Parse records in this buffer + CUpti_Activity *record = NULL; + // ckout<<"valid size for the CUPTI buffer: "<kind == CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION) { + CUpti_ActivityExternalCorrelation *corr = (CUpti_ActivityExternalCorrelation *)record; + corr_count++; + if(gm.cupti_correlation_db_.find(corr->correlationId)!=gm.cupti_correlation_db_.end()) + { + //out of order block + uint64_t curr_kernel_time = gm.cupti_correlation_db_[corr->correlationId]; + gm.cupti_obj_gpu_times_[corr->externalId] += curr_kernel_time; + gm.cupti_correlation_db_.erase(corr->correlationId); // Remove correlation ID after processing + } + else + { + gm.cupti_correlation_db_[corr->correlationId] = corr->externalId; + } + } + else if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL || + record->kind == CUPTI_ACTIVITY_KIND_KERNEL) { + kernel_count++; + CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *)record; + uint64_t 
duration_ns = kernel->end - kernel->start; + // ckout<<"the current kernel's duration is "<correlationId); + if (it != gm.cupti_correlation_db_.end()) { + uint64_t obj_id = it->second; + gm.cupti_obj_gpu_times_[obj_id] += duration_ns; + gm.cupti_correlation_db_.erase(it); // Remove correlation ID after processing + } + else + { + // CmiPrintf("found an out of order entry\n"); + gm.cupti_correlation_db_[kernel->correlationId] = duration_ns; + } + } + } + + // ckout<<"number of CUPTI records in this buffer: "<, hapi_event_queue); + CpvInitialize(std::queue, hapi_event_pool); + // for(int i = 0; i < 8; i++) { + // hapiEvent_t ev; + // hapiEventCreateWithFlags(&ev, hapiEventDisableTiming); + // CpvAccess(hapi_event_pool).push(ev); + // } #endif CpvInitialize(int, n_hapi_events); CpvAccess(n_hapi_events) = 0; // Device mapping CpvInitialize(int, my_device); + CpvInitialize(int, my_device_id); + CpvAccess(my_device_id) = 0; CpvAccess(my_device) = 0; CpvInitialize(bool, device_rep); CpvAccess(device_rep) = false; @@ -176,15 +517,20 @@ static void hapiExitCsv() { if (csv_gpu_manager.mempool_initialized_) { releasePool(csv_gpu_manager.mempool_free_bufs_); } +#ifndef HAPI_CUDA_CALLBACK + auto& hapi_event_pool_ = CpvAccess(hapi_event_pool); + while(!hapi_event_pool_.empty()) { + hapiEventDestroy(hapi_event_pool_.front()); + hapi_event_pool_.pop(); + } +#endif } // Set up PE to GPU mapping, invoked from all PEs // TODO: Support custom mappings static void hapiMapping(char** argv) { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); - Mapping map_type = Mapping::Block; // Default is block mapping - bool all_gpus = false; // If true, all GPUs are visible to all processes. - // Otherwise, only a subset are visible (e.g. 
with jsrun) + Mapping map_type = Mapping::RoundRobin; // Default is round robin char* gpumap = NULL; // Process +gpumap @@ -206,15 +552,6 @@ static void hapiMapping(char** argv) { } } - // Process +allgpus - if (CmiGetArgFlagDesc(argv, "+allgpus", - "all GPUs are visible to all processes")) { - all_gpus = true; - if (CmiMyPe() == 0) { - CmiPrintf("HAPI> All GPUs are visible to all processes\n"); - } - } - // No mapping specified, user assumes responsibility if (map_type == Mapping::None) { if (CmiMyPe() == 0) { @@ -226,19 +563,19 @@ static void hapiMapping(char** argv) { CmiAssert(map_type != Mapping::None); if (CmiMyRank() == 0) { + printf("number of physical nodes is %d\n", CmiNumPhysicalNodes()); + printf("number of nodes is %d\n", CmiNumNodes()); + printf("my rank is %d\n", CmiMyRank()); // Count number of GPU devices used by each process int visible_device_count; - hapiCheck(cudaGetDeviceCount(&visible_device_count)); + hapiCheck(hapiGetDeviceCount(&visible_device_count)); if (visible_device_count <= 0) { CmiAbort("Unable to perform PE-GPU mapping, no GPUs found!"); } int& device_count = csv_gpu_manager.device_count; - if (all_gpus) { - device_count = visible_device_count / (CmiNumNodes() / CmiNumPhysicalNodes()); - } else { - device_count = visible_device_count; - } + device_count = visible_device_count / (CmiNumNodes() / CmiNumPhysicalNodes());//????? + ckout<<"device count "<& device_managers = csv_gpu_manager.device_managers; - for (int i = 0; i < device_count; i++) { - device_managers.emplace_back(i, device_count * CmiMyNodeRankLocal() + i); + // We also need to handle the case where the number of GPUs are less than the + // number of processes launched on a physical node. Thus multiple processes can + // share a GPU. 
In this case device_count would be 0, but instead, we will assign + // at least one gpu to each process + if(device_count == 0) { + device_count = 1; } - // Count number of PEs per device csv_gpu_manager.pes_per_device = CmiNodeSize(CmiMyNode()) / device_count; // Count number of devices on a physical node - csv_gpu_manager.device_count_on_physical_node = - device_count * (CmiNumNodes() / CmiNumPhysicalNodes()); + csv_gpu_manager.device_count_on_physical_node = visible_device_count; + + // Create a DeviceManager per GPU device + std::vector& device_managers = csv_gpu_manager.device_managers; + if(map_type == Mapping::RoundRobin) { + for (int i = 0; i < device_count; i++) { + device_managers.emplace_back(i, (device_count * CmiMyNodeRankLocal() + i) % visible_device_count); + } + } + else if(map_type == Mapping::Block) + { + for (int i = 0; i < device_count; i++) { + device_managers.emplace_back(i, (CmiMyNodeRankLocal() * visible_device_count + i)/(CmiNumNodes() / CmiNumPhysicalNodes())); + } + } + else + { + CmiAbort("Unsupported mapping type!"); + } } if (CmiMyPe() == 0) { @@ -275,32 +630,35 @@ static void hapiMapping(char** argv) { CmiNodeBarrier(); // Perform mapping and set device representative PE - int my_rank = all_gpus ? 
CmiPhysicalRank(CmiMyPe()) : CmiMyRank(); + int my_rank = CmiMyRank(); int& cpv_my_device = CpvAccess(my_device); + int& cpv_my_device_id = CpvAccess(my_device_id); bool& cpv_device_rep = CpvAccess(device_rep); switch (map_type) { - case Mapping::Block: - cpv_my_device = my_rank / csv_gpu_manager.pes_per_device; - if(cpv_my_device >= csv_gpu_manager.device_count) - cpv_my_device = csv_gpu_manager.device_count - 1; - if (my_rank % csv_gpu_manager.pes_per_device == 0) cpv_device_rep = true; + case Mapping::Block:{ + cpv_my_device_id = (my_rank*csv_gpu_manager.device_count) / CmiNodeSize(CmiMyNode()); + cpv_my_device = csv_gpu_manager.device_managers[cpv_my_device_id].global_index; + if (my_rank < csv_gpu_manager.device_count) cpv_device_rep = true; + firstRankForDevice = cpv_my_device; + } break; - case Mapping::RoundRobin: - cpv_my_device = my_rank % csv_gpu_manager.device_count; + case Mapping::RoundRobin: { + cpv_my_device_id = my_rank % csv_gpu_manager.device_count; + cpv_my_device = csv_gpu_manager.device_managers[cpv_my_device_id].global_index; if (my_rank < csv_gpu_manager.device_count) cpv_device_rep = true; + firstRankForDevice = cpv_my_device; + } break; - default: + default: CmiAbort("Unsupported mapping type!"); } - - // Set device and store PE-device mapping - hapiCheck(cudaSetDevice(cpv_my_device)); + + hapiCheck(hapiSetDevice(cpv_my_device)); #if CMK_SMP CmiLock(csv_gpu_manager.device_mapping_lock); #endif - csv_gpu_manager.device_map.emplace(CmiMyPe(), - &(csv_gpu_manager.device_managers[cpv_my_device])); + csv_gpu_manager.device_map.emplace(CmiMyPe(), &(csv_gpu_manager.device_managers[cpv_my_device_id])); #if CMK_SMP CmiUnlock(csv_gpu_manager.device_mapping_lock); #endif @@ -320,14 +678,17 @@ static void hapiMapping(char** argv) { } if (CmiMyRank() == 0) { - if (use_shm) csv_gpu_manager.use_shm = true; + if (use_shm) { + csv_gpu_manager.use_shm = true; + } + // csv_gpu_manager.test_field = true; } CmiNodeBarrier(); if (csv_gpu_manager.use_shm) { // 
Process device communication buffer parameters (in MB) - int input_comm_buffer_size; + int input_comm_buffer_size = 0; if (CmiGetArgIntDesc(argv, "+gpucommbuffer", &input_comm_buffer_size, "GPU communication buffer size (in MB)")) { if (CmiMyRank() == 0) { @@ -338,10 +699,23 @@ static void hapiMapping(char** argv) { } } + // Process device communication buffer parameters (in MB) + int input_lb_buffer_size = 0; + if (CmiGetArgIntDesc(argv, "+gpulbbuffer", &input_lb_buffer_size, + "GPU load balancing buffer size (in MB)")) { + if (CmiMyRank() == 0) { + csv_gpu_manager.lb_buffer_size = (size_t)input_lb_buffer_size * 1024 * 1024; + } + } + if (CmiMyPe() == 0) { CmiPrintf("HAPI> GPU communication buffer size: %zu MB " "(rounded up to the nearest power of two)\n", csv_gpu_manager.comm_buffer_size / (1024 * 1024)); + + CmiPrintf("HAPI> GPU load balancing buffer size: %zu MB " + "\n", + csv_gpu_manager.lb_buffer_size / (1024 * 1024)); } CmiNodeBarrier(); // Ensure device communication buffer size is set @@ -353,27 +727,27 @@ static void hapiMapping(char** argv) { #if CMK_SMP CmiLock(dm->lock); #endif - dm->create_comm_buffer(csv_gpu_manager.comm_buffer_size); + dm->create_comm_buffer(csv_gpu_manager.comm_buffer_size + csv_gpu_manager.lb_buffer_size, csv_gpu_manager.comm_buffer_size); #if CMK_SMP CmiUnlock(dm->lock); #endif } - // Process custom size for CUDA IPC event pool - int input_cuda_ipc_event_pool_size; - if (!CmiGetArgIntDesc(argv, "+gpuipceventpool", &input_cuda_ipc_event_pool_size, + // Process custom size for hapi IPC event pool + int input_hapi_ipc_event_pool_size; + if (!CmiGetArgIntDesc(argv, "+gpuipceventpool", &input_hapi_ipc_event_pool_size, "GPU IPC event pool size per PE")) { - input_cuda_ipc_event_pool_size = 16; + input_hapi_ipc_event_pool_size = 16; } if (CmiMyRank() == 0) { - csv_gpu_manager.cuda_ipc_event_pool_size_pe = input_cuda_ipc_event_pool_size; - csv_gpu_manager.cuda_ipc_event_pool_size_total = input_cuda_ipc_event_pool_size * 
csv_gpu_manager.pes_per_device; + csv_gpu_manager.hapi_ipc_event_pool_size_pe = input_hapi_ipc_event_pool_size; + csv_gpu_manager.hapi_ipc_event_pool_size_total = input_hapi_ipc_event_pool_size * csv_gpu_manager.pes_per_device; } if (CmiMyPe() == 0) { - CmiPrintf("HAPI> CUDA IPC event pool size - %d per PE, %d per device\n", - csv_gpu_manager.cuda_ipc_event_pool_size_pe, csv_gpu_manager.cuda_ipc_event_pool_size_total); + CmiPrintf("HAPI> hapi IPC event pool size - %d per PE, %d per device\n", + csv_gpu_manager.hapi_ipc_event_pool_size_pe, csv_gpu_manager.hapi_ipc_event_pool_size_total); } } @@ -396,9 +770,9 @@ static void hapiMapping(char** argv) { if (i != cpv_my_device) { int can_access_peer; - hapiCheck(cudaDeviceCanAccessPeer(&can_access_peer, cpv_my_device, i)); + hapiCheck(hapiDeviceCanAccessPeer(&can_access_peer, cpv_my_device, i)); if (can_access_peer) { - cudaDeviceEnablePeerAccess(i, 0); + hapiDeviceEnablePeerAccess(i, 0); } } } @@ -411,13 +785,25 @@ static void hapiMapping(char** argv) { } #ifndef HAPI_CUDA_CALLBACK -void recordEvent(cudaStream_t stream, const CkCallback& cb, void* cb_msg, hapiWorkRequest* wr = NULL) { - // create CUDA event and insert into stream - cudaEvent_t ev; - cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); - cudaEventRecord(ev, stream); +void recordEvent(hapiStream_t stream, const CkCallback& cb, void* cb_msg, hapiWorkRequest* wr = NULL, CkMigratable* obj = NULL, hapiEvent_t start_ev = NULL) { + // if(obj!=NULL) + // CmiAbort("non null without HAPI hapi CALLBACK"); + // create hapi event / get hapi event from the pool and insert into stream + hapiEvent_t ev; + auto& hapi_event_pool_local = CpvAccess(hapi_event_pool); + if(hapi_event_pool_local.size() == 0) { + #if CMK_LBDB_ON + hapiEventCreateWithFlags(&ev, hapiEventDefault); + #else + hapiEventCreateWithFlags(&ev, hapiEventDisableTiming); + #endif + } else { + ev = hapi_event_pool_local.front(); + hapi_event_pool_local.pop(); + } + hapiEventRecord(ev, stream); - hapiEvent 
hev(ev, cb, cb_msg, wr); + hapiEvent hev(ev, cb, cb_msg, wr, obj, start_ev); // push event information in queue CpvAccess(hapi_event_queue).push(hev); @@ -532,15 +918,15 @@ static void hapiRegisterCallbacks() { } #ifdef HAPI_CUDA_CALLBACK -// Callback function invoked by the CUDA runtime certain parts of GPU work are +// Callback function invoked by the hapi runtime certain parts of GPU work are // complete. It sends a converse message to the original PE to free the relevant // device memory and invoke the user's callback. The reason for this method is -// that a thread created by the CUDA runtime does not have access to any of the +// that a thread created by the hapi runtime does not have access to any of the // CpvDeclare'd variables as it is not one of the threads created by the Charm++ // runtime. -static void CUDACallback(void *data) { +static void hapiCallback(void *data) { #ifdef HAPI_NVTX_PROFILE - NVTXTracer nvtx_range("CUDACallback", NVTXColor::Silver); + NVTXTracer nvtx_range("hapiCallback", NVTXColor::Silver); #endif // send message to the original PE @@ -558,7 +944,7 @@ enum CallbackStage { static void addCallback(hapiWorkRequest *wr, CallbackStage stage) { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); - // create converse message to be delivered to this PE after CUDA callback + // create converse message to be delivered to this PE after hapi callback char *conv_msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + sizeof(int) + sizeof(hapiWorkRequest *)); // FIXME memory leak? 
*((int *)(conv_msg + CmiMsgHeaderSizeBytes)) = CmiMyRank(); @@ -581,8 +967,8 @@ static void addCallback(hapiWorkRequest *wr, CallbackStage stage) { } CmiSetHandler(conv_msg, handlerIdx); - // add callback into CUDA stream - hapiCheck(cudaLaunchHostFunc(wr->stream, CUDACallback, (void*)conv_msg)); + // add callback into hapi stream + hapiCheck(hapiLaunchHostFunc(wr->stream, hapiCallback, (void*)conv_msg)); } #endif // HAPI_CUDA_CALLBACK @@ -674,8 +1060,8 @@ hapiWorkRequest::hapiWorkRequest() : chare_index = -1; #endif - // Use CUDA per-thread default stream - stream = cudaStreamPerThread; + // Use hapi per-thread default stream + stream = hapiStreamPerThread; // Charm++ callbacks are not set by default host_to_device_cb = CkCallback(CkCallback::ignore); @@ -694,30 +1080,30 @@ static void shmInit() { if (!CsvAccess(gpu_manager).use_shm) return; if (CmiMyRank() == 0) { - shmSetup(); + if (!CmiInCommThread()) shmSetup(); if (CmiMyNodeRankLocal() == 0) { - shmCreate(); // Create a per-host shared memory region + if (!CmiInCommThread()) shmCreate(); // Create a per-host shared memory region CmiBarrier(); // FIXME: Only needs to be a host-wide barrier } else { CmiBarrier(); - shmOpen(); // Open the shared memory region created by local logical node 0 + if (!CmiInCommThread()) shmOpen(); // Open the shared memory region created by local logical node 0 } - shmMap(); // Map the shared memory file into memory + if (!CmiInCommThread()) shmMap(); // Map the shared memory file into memory } else { CmiBarrier(); } - CmiNodeBarrier(); // Ensure shared memory has been mapped into the logical node + if (!CmiInCommThread()) CmiNodeBarrier(); // Ensure shared memory has been mapped into the logical node - ipcHandleCreate(); // Create CUDA IPC handles + if (!CmiInCommThread()) ipcHandleCreate(); // Create hapi IPC handles - // Ensure CUDA IPC handles are available for all processes + // Ensure hapi IPC handles are available for all processes // Note: Causes a hang when this barrier is 
placed after CPU topology initialization // FIXME: This only needs to be a host-wide synchronization CmiBarrier(); if (CmiMyRank() == 0) { - ipcHandleOpen(); // Open CUDA IPC handles for accessing other processes' device memory + if (!CmiInCommThread()) ipcHandleOpen(); // Open hapi IPC handles for accessing other processes' device memory } } @@ -725,16 +1111,16 @@ static void shmSetup() { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); // Set up shared memory file name - csv_gpu_manager.shm_name.assign("charm-cuda-host"); + csv_gpu_manager.shm_name.assign("charm-hapi-host"); int host_id = CmiPhysicalNodeID(CmiMyPe()); csv_gpu_manager.shm_name.append(std::to_string(host_id)); const char* shm_name = csv_gpu_manager.shm_name.c_str(); // Calculate shared memory region size - csv_gpu_manager.shm_chunk_size = sizeof(cudaIpcMemHandle_t) + - sizeof(cuda_ipc_event_shared) * csv_gpu_manager.cuda_ipc_event_pool_size_total; + csv_gpu_manager.shm_chunk_size = sizeof(hapiIpcMemHandle_t) + + sizeof(hapi_ipc_event_shared) * csv_gpu_manager.hapi_ipc_event_pool_size_total; csv_gpu_manager.shm_size = csv_gpu_manager.shm_chunk_size * - csv_gpu_manager.device_count_on_physical_node; + csv_gpu_manager.device_count * ((CmiNumNodes() / CmiNumPhysicalNodes())); } // Create POSIX shared memory region accessible to all processes on the same host @@ -817,12 +1203,12 @@ static void shmMap() { // Store pointer to my process' portion of the shared memory region csv_gpu_manager.shm_my_ptr = (void*)((char*)csv_gpu_manager.shm_ptr + - csv_gpu_manager.shm_chunk_size * csv_gpu_manager.device_count * - CmiMyNodeRankLocal()); + csv_gpu_manager.shm_chunk_size * (csv_gpu_manager.device_count * + CmiMyNodeRankLocal())); // Allocate memory for local storage - for (int i = 0; i < csv_gpu_manager.device_count_on_physical_node; i++) { - csv_gpu_manager.cuda_ipc_device_infos.emplace_back(); + for (int i = 0; i < csv_gpu_manager.device_count * ((CmiNumNodes() / CmiNumPhysicalNodes())); i++) { + 
csv_gpu_manager.hapi_ipc_device_infos.emplace_back(); } } @@ -851,7 +1237,7 @@ static void shmCleanup() { } } -// Create CUDA IPC handles and populate shared memory region +// Create hapi IPC handles and populate shared memory region // Invoked by all PEs static void ipcHandleCreate() { // Only device reps should continue to perform the following operations @@ -859,36 +1245,44 @@ static void ipcHandleCreate() { if (!CpvAccess(device_rep)) return; GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); - int& cpv_my_device = CpvAccess(my_device); + int& cpv_my_device_id = CpvAccess(my_device_id); - // Create CUDA IPC memory handle in shared memory - DeviceManager& my_dm = csv_gpu_manager.device_managers[cpv_my_device]; + // Create hapi IPC memory handle in shared memory + auto it = csv_gpu_manager.device_map.find(CmiMyPe()); + if (it == csv_gpu_manager.device_map.end()) { + CmiAbort("PE not found in device_map during ipcHandleCreate"); + } + DeviceManager& my_dm = *(it->second); auto comm_buffer = my_dm.get_comm_buffer(); CmiAssert(comm_buffer); - cudaIpcMemHandle_t* shm_mem_handle = (cudaIpcMemHandle_t*)((char*)csv_gpu_manager.shm_my_ptr + - csv_gpu_manager.shm_chunk_size * cpv_my_device); + + // Use local device index (0 to device_count-1) for shm_mem_handle offset + // int local_device_idx = my_dm.local_index; + hapiIpcMemHandle_t* shm_mem_handle = (hapiIpcMemHandle_t*)((char*)csv_gpu_manager.shm_my_ptr + + csv_gpu_manager.shm_chunk_size * cpv_my_device_id); + void* device_ptr = comm_buffer->base_ptr; - hapiCheck(cudaIpcGetMemHandle(shm_mem_handle, device_ptr)); + hapiCheck(hapiIpcGetMemHandle(shm_mem_handle, device_ptr)); - // Create CUDA IPC events and store them locally (in cuda_ipc_device_info), + // Create hapi IPC events and store them locally (in hapi_ipc_device_info), // and create corresponding IPC handles in shared memory - cuda_ipc_device_info& my_device_info = csv_gpu_manager.cuda_ipc_device_infos[my_dm.global_index]; - cuda_ipc_event_shared* 
shm_event_shared = (cuda_ipc_event_shared*)((char*)shm_mem_handle + sizeof(cudaIpcMemHandle_t)); + hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id]; + hapi_ipc_event_shared* shm_event_shared = (hapi_ipc_event_shared*)((char*)shm_mem_handle + sizeof(hapiIpcMemHandle_t)); - for (int i = 0; i < csv_gpu_manager.cuda_ipc_event_pool_size_total; i++) { - cuda_ipc_event_shared* cur_shm_event_shared = shm_event_shared + i; + for (int i = 0; i < csv_gpu_manager.hapi_ipc_event_pool_size_total; i++) { + hapi_ipc_event_shared* cur_shm_event_shared = shm_event_shared + i; my_device_info.event_pool_flags.push_back(0); my_device_info.event_pool_buff_offsets.push_back(0); my_device_info.src_event_pool.emplace_back(); my_device_info.dst_event_pool.emplace_back(); - hapiCheck(cudaEventCreateWithFlags(&my_device_info.src_event_pool[i], - cudaEventDisableTiming | cudaEventInterprocess)); - hapiCheck(cudaEventCreateWithFlags(&my_device_info.dst_event_pool[i], - cudaEventDisableTiming | cudaEventInterprocess)); - hapiCheck(cudaIpcGetEventHandle(&cur_shm_event_shared->src_event_handle, + hapiCheck(hapiEventCreateWithFlags(&my_device_info.src_event_pool[i], + hapiEventDisableTiming | hapiEventInterprocess)); + hapiCheck(hapiEventCreateWithFlags(&my_device_info.dst_event_pool[i], + hapiEventDisableTiming | hapiEventInterprocess)); + hapiCheck(hapiIpcGetEventHandle(&cur_shm_event_shared->src_event_handle, my_device_info.src_event_pool[i])); - hapiCheck(cudaIpcGetEventHandle(&cur_shm_event_shared->dst_event_handle, + hapiCheck(hapiIpcGetEventHandle(&cur_shm_event_shared->dst_event_handle, my_device_info.dst_event_pool[i])); } @@ -896,7 +1290,7 @@ static void ipcHandleCreate() { my_device_info.buffer = device_ptr; } -// Open CUDA IPC handles created by other processes +// Open hapi IPC handles created by other processes // Invoked by PE rank 0 of each process static void ipcHandleOpen() { 
GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); @@ -908,30 +1302,30 @@ static void ipcHandleOpen() { // Loop through GPU devices per process for (int j = 0; j < csv_gpu_manager.device_count; j++) { int device_index = csv_gpu_manager.device_count * i + j; - cuda_ipc_device_info& cur_device_info = csv_gpu_manager.cuda_ipc_device_infos[device_index]; + hapi_ipc_device_info& cur_device_info = csv_gpu_manager.hapi_ipc_device_infos[device_index]; // Open memory handle - cudaIpcMemHandle_t* shm_mem_handle = - (cudaIpcMemHandle_t*)((char*)csv_gpu_manager.shm_ptr + hapiIpcMemHandle_t* shm_mem_handle = + (hapiIpcMemHandle_t*)((char*)csv_gpu_manager.shm_ptr + csv_gpu_manager.shm_chunk_size * device_index); - hapiCheck(cudaIpcOpenMemHandle(&cur_device_info.buffer, *shm_mem_handle, - cudaIpcMemLazyEnablePeerAccess)); + hapiCheck(hapiIpcOpenMemHandle(&cur_device_info.buffer, *shm_mem_handle, + hapiIpcMemLazyEnablePeerAccess)); // Open event handles - cuda_ipc_event_shared* shm_event_shared = - (cuda_ipc_event_shared*)((char*)shm_mem_handle + sizeof(cudaIpcMemHandle_t)); + hapi_ipc_event_shared* shm_event_shared = + (hapi_ipc_event_shared*)((char*)shm_mem_handle + sizeof(hapiIpcMemHandle_t)); cur_device_info.event_pool_flags.clear(); cur_device_info.event_pool_buff_offsets.clear(); - for (int k = 0; k < csv_gpu_manager.cuda_ipc_event_pool_size_total; k++) { - cuda_ipc_event_shared* cur_shm_event_shared = shm_event_shared + k; + for (int k = 0; k < csv_gpu_manager.hapi_ipc_event_pool_size_total; k++) { + hapi_ipc_event_shared* cur_shm_event_shared = shm_event_shared + k; cur_device_info.src_event_pool.emplace_back(); cur_device_info.dst_event_pool.emplace_back(); - hapiCheck(cudaIpcOpenEventHandle(&cur_device_info.src_event_pool[k], + hapiCheck(hapiIpcOpenEventHandle(&cur_device_info.src_event_pool[k], cur_shm_event_shared->src_event_handle)); - hapiCheck(cudaIpcOpenEventHandle(&cur_device_info.dst_event_pool[k], + 
hapiCheck(hapiIpcOpenEventHandle(&cur_device_info.dst_event_pool[k], cur_shm_event_shared->dst_event_handle)); } } @@ -946,7 +1340,7 @@ static inline void gpuEventStart(hapiWorkRequest* wr, int* index, GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); gpuEventTimer* shared_gpu_events_ = csv_gpu_manager.gpu_events_; int shared_time_idx_ = csv_gpu_manager.time_idx_++; - shared_gpu_events_[shared_time_idx_].cmi_start_time = CmiWallTimer(); + // shared_gpu_events_[shared_time_idx_].cmi_start_time = CmiWallTimer(); shared_gpu_events_[shared_time_idx_].event_type = event; shared_gpu_events_[shared_time_idx_].trace_name = wr->trace_name; *index = shared_time_idx_; @@ -963,7 +1357,7 @@ static inline void gpuEventStart(hapiWorkRequest* wr, int* index, static inline void gpuEventEnd(int index) { #ifdef HAPI_TRACE GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); - csv_gpu_manager.gpu_events_[index].cmi_end_time = CmiWallTimer(); + // csv_gpu_manager.gpu_events_[index].cmi_end_time = CmiWallTimer(); traceUserBracketEvent(csv_gpu_manager.gpu_events_[index].stage, csv_gpu_manager.gpu_events_[index].cmi_start_time, csv_gpu_manager.gpu_events_[index].cmi_end_time); @@ -978,7 +1372,7 @@ static inline void gpuEventEnd(int index) { static inline void hapiWorkRequestStartTime(hapiWorkRequest* wr) { #ifdef HAPI_INSTRUMENT_WRS - wr->phase_start_time = CmiWallTimer(); + // wr->phase_start_time = CmiWallTimer(); #endif } @@ -992,7 +1386,7 @@ static inline void profileWorkRequestEvent(hapiWorkRequest* wr, #endif if (csv_gpu_manager.init_instr_) { - double tt = CmiWallTimer() - (wr->phase_start_time); + // double tt = CmiWallTimer() - (wr->phase_start_time); int index = wr->chare_index; char type = wr->comp_type; char phase = wr->comp_phase; @@ -1042,9 +1436,9 @@ static void createPool(int *n_buffers, int n_slots, std::vector &poo } int device; - cudaDeviceProp device_prop; - hapiCheck(cudaGetDevice(&device)); - hapiCheck(cudaGetDeviceProperties(&device_prop, device)); + 
hapiDeviceProp device_prop; + hapiCheck(hapiGetDevice(&device)); + hapiCheck(hapiGetDeviceProperties(&device_prop, device)); // divide by # of PEs on physical node and multiply by # of PEs in logical node size_t available_memory = device_prop.totalGlobalMem / @@ -1078,7 +1472,7 @@ static void createPool(int *n_buffers, int n_slots, std::vector &poo // pin host memory in a contiguous block for a slot void* pinned_chunk; - hapiCheck(cudaMallocHost(&pinned_chunk, buf_size * num_buffers)); + hapiCheck(hapiMallocHost(&pinned_chunk, buf_size * num_buffers)); // initialize header structs for (int j = num_buffers - 1; j >= 0; j--) { @@ -1099,11 +1493,11 @@ static void createPool(int *n_buffers, int n_slots, std::vector &poo static void releasePool(std::vector &pools){ int device; - hapiCheck(cudaGetDevice(&device)); + hapiCheck(hapiGetDevice(&device)); for (int i = 0; i < pools.size(); i++) { void* chunk = pools[i].chunk; if (chunk != NULL) { - hapiCheck(cudaFreeHost(chunk)); + hapiCheck(hapiFreeHost(chunk)); } } pools.clear(); @@ -1120,7 +1514,7 @@ static int findPool(size_t size){ csv_gpu_manager.mempool_boundaries_.push_back(size); BufferPool newpool; - hapiCheck(cudaMallocHost((void**)&newpool.head, size + sizeof(BufferPoolHeader))); + hapiCheck(hapiMallocHost((void**)&newpool.head, size + sizeof(BufferPoolHeader))); if (newpool.head == NULL) { CmiPrintf("[HAPI (%d)] findPool: failed to allocate newpool %d head, size %zu\n", CmiMyPe(), boundary_array_len, size); @@ -1163,7 +1557,7 @@ static void* getBufferFromPool(int pool, size_t size){ } else if (csv_gpu_manager.mempool_free_bufs_[pool].head == NULL) { BufferPoolHeader* hd; - hapiCheck(cudaMallocHost((void**)&hd, sizeof(BufferPoolHeader) + + hapiCheck(hapiMallocHost((void**)&hd, sizeof(BufferPoolHeader) + csv_gpu_manager.mempool_free_bufs_[pool].size)); #ifdef HAPI_MEMPOOL_DEBUG CmiPrintf("[HAPI (%d)] getBufferFromPool, pool: %d, size: %zu expand by 1\n", @@ -1196,7 +1590,7 @@ static void returnBufferToPool(int pool, 
BufferPoolHeader* hd) { #endif } -cudaError_t hapiPoolMalloc(void** ptr, size_t size) { +hapiError_t hapiPoolMalloc(void** ptr, size_t size) { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); #if CMK_SMP @@ -1242,7 +1636,7 @@ cudaError_t hapiPoolMalloc(void** ptr, size_t size) { CmiUnlock(csv_gpu_manager.mempool_lock_); #endif - return cudaErrorMemoryAllocation; + return hapiErrorMemoryAllocation; } *ptr = getBufferFromPool(pool, size); @@ -1255,15 +1649,15 @@ cudaError_t hapiPoolMalloc(void** ptr, size_t size) { CmiUnlock(csv_gpu_manager.mempool_lock_); #endif - return cudaSuccess; + return hapiSuccess; } -cudaError_t hapiPoolFree(void* ptr) { +hapiError_t hapiPoolFree(void* ptr) { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); // Check if mempool was initialized if (!csv_gpu_manager.mempool_initialized_) - return cudaErrorInitializationError; + return hapiErrorInitializationError; BufferPoolHeader* hd = ((BufferPoolHeader*)ptr) - 1; int pool = hd->slot; @@ -1288,7 +1682,7 @@ cudaError_t hapiPoolFree(void* ptr) { csv_gpu_manager.mempool_free_bufs_[pool].num); #endif - return cudaSuccess; + return hapiSuccess; } #ifdef HAPI_INSTRUMENT_WRS @@ -1364,9 +1758,20 @@ void hapiPollEvents(void* param) { std::queue& queue = CpvAccess(hapi_event_queue); while (!queue.empty()) { hapiEvent hev = queue.front(); - if (cudaEventQuery(hev.event) == cudaSuccess) { + if (hapiEventQuery(hev.event) == hapiSuccess) { queue.pop(); // TODO: investigate possible race condition with charm4py futures - temporarily resolved by popping here +#if CMK_LBDB_ON + if (hev.obj) { + // CmiPrintf("should not be printed w/o hapi hapi callback \n"); + float gpu_time; + hapiEventElapsedTime(&gpu_time, hev.start_ev, hev.event); + // hapiEventElapsedTime returns ms, convert to seconds to match wallTime units + double gpu_time_s = gpu_time / 1000.0; + hev.obj->setObjGPUTime(gpu_time_s + hev.obj->getObjGPUTime()); + hapiEventDestroy(hev.start_ev); + } else +#endif // invoke Charm++ callback if 
one was given hev.cb.send(hev.cb_msg); @@ -1374,7 +1779,7 @@ void hapiPollEvents(void* param) { if (hev.wr) { hapiWorkRequestCleanup(hev.wr); } - cudaEventDestroy(hev.event); + CpvAccess(hapi_event_pool).push(hev.event); CpvAccess(n_hapi_events)--; // inform QD that an event was processed @@ -1405,14 +1810,14 @@ int hapiCreateStreams() { return ret; } -cudaStream_t hapiGetStream() { +hapiStream_t hapiGetStream() { GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); #if CMK_SMP CmiLock(csv_gpu_manager.stream_lock_); #endif - cudaStream_t ret = csv_gpu_manager.getNextStream(); + hapiStream_t ret = csv_gpu_manager.getNextStream(); #if CMK_SMP CmiUnlock(csv_gpu_manager.stream_lock_); @@ -1420,11 +1825,67 @@ cudaStream_t hapiGetStream() { return ret; } +#if CMK_LBDB_ON +// Lightweight HAPI, to be invoked after data transfer or kernel execution. +void hapiRecordTime(hapiStream_t stream, hapiEvent_t start) { + Chare* obj = CkActiveObj(); + if (obj && dynamic_cast(obj)) { + + #ifndef HAPI_CUDA_CALLBACK + // record hapi event + recordEvent(stream, CkCallback(), NULL, NULL, dynamic_cast(obj), start); +#else + #error hapi record time with HAPI_CUDA_CALLBACK not supported +#endif + + // while there is an ongoing workrequest, quiescence should not be detected + // even if all PEs seem idle + CmiAssert(hapiQdCreate); + hapiQdCreate(1); + } +} +#endif + +uint64_t hapiCuptiPushObjCorrelation() { + // printf("seeing CsvAccess(gpu_manager).cupti_initialized_ as %d\n", CsvAccess(gpu_manager).cupti_initialized_); + if (!CsvAccess(gpu_manager).cupti_initialized_) return 0; + + // Get the active Charm++ object + Chare* chare = CkActiveObj(); + if (!chare) + CmiAbort("hapiCuptiPushObjCorrelation call without active object is not possible"); + + CkMigratable* mig = dynamic_cast(chare); + // printf("mig %p\n", mig); + if (!mig) return 0; + + // Use the raw element ID as the external correlation ID + // CmiUInt8 is a 64-bit unique object identifier + uint64_t obj_id = 
(uint64_t)mig->ckGetID(); +#if CMK_CUDA + CUPTI_SAFE_CALL(cuptiActivityPushExternalCorrelationId( + CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, obj_id)); +#endif + // printf("pushed corr id\n"); + + return obj_id; +} + +void hapiCuptiPopObjCorrelation() { + if (!CsvAccess(gpu_manager).cupti_initialized_) return; + + // printf("popped corr id\n"); + uint64_t tag; +#if CMK_CUDA + CUPTI_SAFE_CALL(cuptiActivityPopExternalCorrelationId( + CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &tag)); +#endif +} // Lightweight HAPI, to be invoked after data transfer or kernel execution. -void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) { +void hapiAddCallback(hapiStream_t stream, const CkCallback& cb, void* cb_msg) { #ifndef HAPI_CUDA_CALLBACK - // record CUDA event + // record hapi event recordEvent(stream, cb, cb_msg); #else GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); @@ -1435,15 +1896,15 @@ void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) { #endif */ - // create converse message to be delivered to this PE after CUDA callback + // create converse message to be delivered to this PE after hapi callback hapiCallbackMessage* conv_msg = (hapiCallbackMessage*)CmiAlloc(sizeof(hapiCallbackMessage)); // FIXME memory leak? 
conv_msg->rank = CmiMyRank();
   conv_msg->cb = cb;
   conv_msg->cb_msg = cb_msg;
   CmiSetHandler(conv_msg, csv_gpu_manager.light_cb_idx_);
 
-  // push into CUDA stream
-  hapiCheck(cudaLaunchHostFunc(stream, CUDACallback, (void*)conv_msg));
+  // push into hapi stream
+  hapiCheck(hapiLaunchHostFunc(stream, hapiCallback, (void*)conv_msg));
 
   /*
 #if CMK_SMP
@@ -1458,33 +1919,89 @@ void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) {
   hapiQdCreate(1);
 }
 
-void hapiAddCallback(cudaStream_t stream, void* cb, void* cb_msg) {
+void hapiAddCallback(hapiStream_t stream, void* cb, void* cb_msg) {
   hapiAddCallback(stream, *(CkCallback*)cb, cb_msg);
 }
 
-cudaError_t hapiMalloc(void** devPtr, size_t size) {
-  return cudaMalloc(devPtr, size);
-}
+// Send one request of `size` bytes from `msg` to the memory daemon FIFO of this
+// PE's device (path built from SERVER_FIFO_TEMPLATE). Best effort: failures are
+// reported on stderr and the function returns without aborting.
+// NOTE(review): hapi_memory_daemon.cpp defines SERVER_FIFO_TEMPLATE with "%ld";
+// confirm the template used in this file matches the int argument passed below.
+void hapiSendMemoryRequest(char* msg, int size)
+{
+  int cpv_my_device = CpvAccess(my_device);
+
+  char server_fifo[BUFFER_SIZE];
+  // Bounded formatting: never write past server_fifo, and refuse a truncated path.
+  int path_len = snprintf(server_fifo, sizeof(server_fifo), SERVER_FIFO_TEMPLATE, cpv_my_device);
+  if (path_len < 0 || path_len >= (int)sizeof(server_fifo)) {
+    fprintf(stderr, "server FIFO path does not fit in %d bytes\n", (int)sizeof(server_fifo));
+    return;
+  }
+  CmiPrintf("Sending request to %s\n", server_fifo);
+
+  // O_NONBLOCK: open() fails immediately instead of waiting when no reader has the FIFO open.
+  int server_fd = open(server_fifo, O_WRONLY | O_NONBLOCK);
+  if (server_fd == -1) {
+    perror("open server FIFO for writing");
+    return;
+  }
 
-cudaError_t hapiFree(void* devPtr) {
-  return cudaFree(devPtr);
+  ssize_t written = write(server_fd, msg, size);
+  if (written == -1) {
+    perror("write to server FIFO");
+  } else if (written != (ssize_t)size) {
+    // The reader would receive an incomplete request; make that visible.
+    fprintf(stderr, "write to server FIFO: short write (%zd of %d bytes)\n", written, size);
+  }
+
+  close(server_fd);
 }
 
-cudaError_t hapiMallocHost(void** ptr, size_t size) {
-  return cudaMallocHost(ptr, size);
-}
 
-cudaError_t hapiFreeHost(void* ptr) {
-  return cudaFreeHost(ptr);
+// hapiError_t hapiMemcpyAsync(void* dst, const void* src, size_t count, hapiMemcpyKind kind, hapiStream_t stream = 0) {
+//   hapiError_t err;
+// #if CMK_LBDB_ON
+//   hapiEvent_t start;
+
+//   hapiEventCreate(&start);
+//   hapiEventRecord(start, stream);
+// #endif
+
+//   err = hapiMemcpyAsync(dst, src, count, kind, stream);
+// #if CMK_LBDB_ON
+//   hapiRecordTime(stream, start);
+// #endif
+//   return err;
+// }
+
+// hapiError_t hapiMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hapiMemcpyKind kind, hapiStream_t stream = 0) {
+//   hapiError_t err;
+// #if CMK_LBDB_ON
+//   hapiEvent_t start;
+
+//   hapiEventCreate(&start);
+//   hapiEventRecord(start, stream);
+// #endif
+//   err = hapiMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+// #if CMK_LBDB_ON
+//   hapiRecordTime(stream, start);
+// #endif
+//   return err;
+// }
+
+
+void hapiErrorDie(hapiError_t retCode, const char* code, const char* file, int line) {
+  if (retCode != hapiSuccess) {
+    fprintf(stderr, "Fatal hapi Error [%d] %s at %s:%d\n", retCode, hapiGetErrorString(retCode), file, line);
+    CmiAbort("Exit due to hapi error");
+  }
 }
 
-cudaError_t hapiMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) {
-  return cudaMemcpyAsync(dst, src, count, kind, stream);
+uint64_t hapiMyDevice() {  // Device key: physical node id in the high 32 bits, this PE's device index in the low 32
+  int physical_node_id = CmiPhysicalNodeID(CmiMyPe());
+  int my_device = CpvAccess(my_device);
+  return (static_cast<uint64_t>(physical_node_id) << 32) | static_cast<uint32_t>(my_device);  // via uint32_t: a negative index must not sign-extend over the node id
 }
 
-void hapiErrorDie(cudaError_t retCode, const char* code, const char* file, int line) {
-  if (retCode != cudaSuccess) {
-    fprintf(stderr, "Fatal CUDA Error [%d] %s at %s:%d\n", retCode, cudaGetErrorString(retCode), file, line);
-    CmiAbort("Exit due to CUDA error");
-  }
-}
diff --git a/src/arch/cuda/hybridAPI/hapi_impl.h b/src/arch/cuda/hybridAPI/hapi_impl.h
index 63d8074f17..42d057dec7 100644
--- a/src/arch/cuda/hybridAPI/hapi_impl.h
+++ b/src/arch/cuda/hybridAPI/hapi_impl.h
@@ -13,8 +13,12 @@ extern "C" {
 // Scale the amount of memory each node pins.
 #define HAPI_MEMPOOL_SCALE 1.0
 
+
 // HAPI init & exit functions
 void hapiInit(char** argv);
+void hapiStartMemoryDaemon(char** argv);
+int hapiCheckpoint(void* devPtr, int size);
+void hapiRestore(void* devPtr, int size, int alloc_id);
 void hapiExit();
 
 // Polls for GPU work completion.
Does not do anything if HAPI_CUDA_CALLBACK is defined.
diff --git a/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp b/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp
new file mode 100644
index 0000000000..9e751e7162
--- /dev/null
+++ b/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp
@@ -0,0 +1,325 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "hapi_portable.h"
+
+// Log a failed hapi call to stderr. Execution continues after a failure.
+#define HAPI_CHECK(call) do { \
+    hapiError_t err = call; \
+    if (err != hapiSuccess) { \
+        fprintf(stderr, "HAPI> hapi call failed at %s:%d: %s\n", __FILE__, __LINE__, hapiGetErrorString(err)); \
+    } \
+} while(0)
+
+#define SERVER_FIFO_TEMPLATE "/tmp/server_pipe_%ld"
+#define CLIENT_FIFO_TEMPLATE "/tmp/client_pipe_%ld"
+#define BUFFER_SIZE 256
+#define STREAM_BUF_SIZE 1024
+
+// Managing memory state in server: allocation id -> (device pointer, size in bytes)
+std::unordered_map<int, std::pair<void*, int>> hapiMemoryMap;
+int allocId = 0;
+
+// Send a `len`-byte reply to the client FIFO, reporting a failed or short
+// write on stderr instead of dropping it silently.
+static void hapiSendReply(int client_fd, const void* data, size_t len)
+{
+    ssize_t written = write(client_fd, data, len);
+    if (written == -1) {
+        perror("DAEMON: write to client FIFO");
+    } else if ((size_t)written != len) {
+        fprintf(stderr, "DAEMON: short write to client FIFO (%zd of %zu bytes)\n", written, len);
+    }
+}
+
+// Return the length of the first complete message in buf[0..len), or 0 when
+// more data is needed.
+//   CKPT:   "CKPT:<pid>:<pe>:<size>:" followed by sizeof(hapiIpcMemHandle_t) raw bytes
+//   others: a NUL-terminated string; the terminator is counted in the length
+static size_t hapiNextMessageLength(const char* buf, size_t len)
+{
+    // We need at least 4 bytes to identify a command
+    if (len < 4) return 0;
+
+    if (strncmp(buf, "CKPT", 4) != 0) {
+        const char* msg_end = (const char*)memchr(buf, '\0', len);
+        return msg_end ? (size_t)(msg_end - buf) + 1 : 0;
+    }
+
+    // The text header ends just past the 4th colon; the binary handle follows it.
+    int colons = 0;
+    for (size_t i = 0; i < len; ++i) {
+        if (buf[i] == ':' && ++colons == 4) {
+            size_t msg_len = i + 1 + sizeof(hapiIpcMemHandle_t);
+            return len >= msg_len ? msg_len : 0;
+        }
+    }
+    return 0;
+}
+
+// Handle one request taken from the server FIFO. `buf` is NUL-terminated and
+// starts with "<command>:<client pid>:"; the reply goes to the FIFO named by
+// CLIENT_FIFO_TEMPLATE for that pid.
+//   CKPT  copy the client's device buffer into a daemon allocation, reply with its id (int)
+//   GET   reply with an IPC handle for a stored allocation
+//   FREE  release a stored allocation, reply with one zero byte
+//   KILL  reply with one zero byte, remove the server FIFO and exit the process
+// When a request cannot be served the client FIFO is closed without a reply,
+// so the client sees end-of-file.
+void hapiProcessMemoryRequest(int server_fd, int my_device, char* buf)
+{
+    long client_pid;
+    char command[BUFFER_SIZE];
+    // Width 255 == BUFFER_SIZE - 1: an over-long command cannot overflow `command`.
+    if (sscanf(buf, "%255[^:]:", command) != 1) return;
+
+    char* pid_str = strchr(buf, ':');
+    if (pid_str) client_pid = atol(pid_str + 1); else return;
+
+    printf("HAPI> Processing memory request: %s from client %ld\n", command, client_pid);
+
+    char client_fifo_path[BUFFER_SIZE];
+    snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, client_pid);
+    int client_fd = open(client_fifo_path, O_WRONLY);
+    if (client_fd == -1) {
+        perror("DAEMON: open client FIFO for writing");
+        // KILL must still stop the daemon even when the requester cannot be answered.
+        if (strcmp(command, "KILL") != 0) return;
+    }
+
+    if (strcmp(command, "CKPT") == 0)
+    {
+        int client_pe, size;
+        if (sscanf(buf, "CKPT:%ld:%d:%d:", &client_pid, &client_pe, &size) != 3 || size <= 0) {
+            printf("DAEMON: Error parsing CKPT message, bad header fields.\n");
+            close(client_fd);
+            return;
+        }
+
+        // Correctly find the start of the handle by looking for the 4th colon.
+        char* handle_start = buf;
+        for (int i = 0; i < 4; ++i) {
+            handle_start = strchr(handle_start, ':');
+            if (!handle_start) {
+                printf("DAEMON: Error parsing CKPT message, could not find 4 colons.\n");
+                close(client_fd);
+                return;
+            }
+            handle_start++; // Move past the found colon
+        }
+
+        hapiIpcMemHandle_t ipc_handle;
+        memcpy(&ipc_handle, handle_start, sizeof(hapiIpcMemHandle_t));
+
+        // Stop at the first failing step so a bad pointer is never copied from or stored.
+        void* client_ptr = NULL;
+        void* ckpt_ptr = NULL;
+        hapiError_t err = hapiIpcOpenMemHandle(&client_ptr, ipc_handle, hapiIpcMemLazyEnablePeerAccess);
+        if (err == hapiSuccess) {
+            err = hapiMalloc(&ckpt_ptr, size);
+            if (err == hapiSuccess) err = hapiMemcpy(ckpt_ptr, client_ptr, size, hapiMemcpyDeviceToDevice);
+            HAPI_CHECK(hapiIpcCloseMemHandle(client_ptr));
+        }
+        if (err != hapiSuccess) {
+            fprintf(stderr, "HAPI> checkpoint of %d bytes failed: %s\n", size, hapiGetErrorString(err));
+            if (ckpt_ptr) HAPI_CHECK(hapiFree(ckpt_ptr));
+            close(client_fd);
+            return;
+        }
+
+        hapiMemoryMap[allocId] = std::make_pair(ckpt_ptr, size);
+        hapiSendReply(client_fd, &allocId, sizeof(int));
+        allocId++;
+    }
+    else if (strcmp(command, "GET") == 0)
+    {
+        int alloc_id = -1;
+        sscanf(buf, "GET:%ld:%d", &client_pid, &alloc_id);
+
+        // find() rather than operator[]: looking up an unknown id must not insert an entry.
+        auto it = hapiMemoryMap.find(alloc_id);
+        if (it == hapiMemoryMap.end()) {
+            fprintf(stderr, "DAEMON: GET for unknown allocation id %d\n", alloc_id);
+        } else {
+            hapiIpcMemHandle_t ipc_handle;
+            hapiError_t err = hapiIpcGetMemHandle(&ipc_handle, it->second.first);
+            if (err == hapiSuccess) {
+                hapiSendReply(client_fd, &ipc_handle, sizeof(hapiIpcMemHandle_t));
+            } else {
+                fprintf(stderr, "HAPI> hapiIpcGetMemHandle failed: %s\n", hapiGetErrorString(err));
+            }
+        }
+    }
+    else if (strcmp(command, "FREE") == 0)
+    {
+        int alloc_id = -1;
+        sscanf(buf, "FREE:%ld:%d", &client_pid, &alloc_id);
+
+        auto it = hapiMemoryMap.find(alloc_id);
+        if (it != hapiMemoryMap.end()) {
+            HAPI_CHECK(hapiFree(it->second.first));
+            hapiMemoryMap.erase(it);
+        }
+        hapiSendReply(client_fd, "\0", 1);
+    }
+    else if (strcmp(command, "KILL") == 0)
+    {
+        printf("Server: KILL command received from client %ld\n", client_pid);
+        if (client_fd != -1) {
+            hapiSendReply(client_fd, "\0", 1);
+            close(client_fd);
+        }
+        close(server_fd);
+
+        char server_fifo[BUFFER_SIZE];
+        // SERVER_FIFO_TEMPLATE uses "%ld", so widen the int device index explicitly.
+        snprintf(server_fifo, sizeof(server_fifo), SERVER_FIFO_TEMPLATE, (long)my_device);
+        if (remove(server_fifo) == 0) {
+            printf("File '%s' deleted successfully.\n", server_fifo);
+        } else {
+            printf("Error deleting file '%s': %s\n", server_fifo, strerror(errno));
+        }
+        exit(0);
+    }
+
+    close(client_fd);
+}
+
+// Run the memory daemon for device `my_device`: create and open the server
+// FIFO, signal readiness through /tmp/daemon_ready_<device>, then serve
+// requests until a KILL request or a fatal read error. Does not return.
+void hapiStartMemoryDaemon(int my_device) {
+
+    int current_cpu = sched_getcpu();
+    printf("Daemon: Current CPU is %d\n", current_cpu);
+
+    // Child process (daemon)
+    printf("DAEMON: Starting daemon process PID=%d\n", getpid());
+
+    // Set up the daemon's hapi context
+    HAPI_CHECK(hapiSetDevice(my_device));
+
+    // SERVER_FIFO_TEMPLATE uses "%ld", so widen the int device index explicitly.
+    char server_fifo_path[BUFFER_SIZE];
+    snprintf(server_fifo_path, sizeof(server_fifo_path), SERVER_FIFO_TEMPLATE, (long)my_device);
+    // A FIFO left over from an earlier run is reused; any other mkfifo failure is fatal.
+    if (mkfifo(server_fifo_path, 0666) == -1 && errno != EEXIST) {
+        perror("DAEMON: mkfifo server FIFO");
+        exit(1);
+    }
+
+    // Open server FIFO for reading; O_NONBLOCK lets open() return before a writer connects
+    printf("DAEMON: Opening server FIFO %s\n", server_fifo_path);
+    int server_fd = open(server_fifo_path, O_RDONLY | O_NONBLOCK);
+    if (server_fd == -1) {
+        perror("DAEMON: open server FIFO");
+        exit(1);
+    }
+
+    // Make it blocking for actual reads
+    int flags = fcntl(server_fd, F_GETFL);
+    fcntl(server_fd, F_SETFL, flags & ~O_NONBLOCK);
+
+    char ready_fifo_path[BUFFER_SIZE];
+    snprintf(ready_fifo_path, sizeof(ready_fifo_path), "/tmp/daemon_ready_%d", my_device);
+
+    // Signal parent that daemon is ready
+    int ready_fd = open(ready_fifo_path, O_WRONLY);
+    if (ready_fd == -1) {
+        perror("DAEMON: open ready FIFO for writing");
+        exit(1);
+    }
+    write(ready_fd, "1", 1);
+    close(ready_fd);
+
+    printf("DAEMON: Ready signal sent to parent\n");
+
+    // Main daemon loop
+    char stream_buf[STREAM_BUF_SIZE];
+    size_t data_in_stream = 0;
+    ssize_t bytes_read;
+
+    while (1)
+    {
+        // read() will block here until data is available
+        bytes_read = read(server_fd, stream_buf + data_in_stream,
+                          STREAM_BUF_SIZE - data_in_stream);
+
+        if (bytes_read > 0)
+        {
+            printf("DAEMON: Read %zd bytes from server FIFO\n", bytes_read);
+            data_in_stream += bytes_read;
+
+            // Process all complete messages in the buffer
+            while (data_in_stream > 0)
+            {
+                size_t msg_len = hapiNextMessageLength(stream_buf, data_in_stream);
+                if (msg_len == 0) break; // Incomplete message, need more data
+
+                if (msg_len > BUFFER_SIZE) {
+                    printf("DAEMON: Error, received message too long (%zu bytes). Aborting.\n", msg_len);
+                    exit(1);
+                }
+                // One spare byte so the copy handed to the parser is always NUL-terminated.
+                char current_request[BUFFER_SIZE + 1];
+                memcpy(current_request, stream_buf, msg_len);
+                current_request[msg_len] = '\0';
+
+                // Process the request. Note: This may exit on a KILL command.
+                hapiProcessMemoryRequest(server_fd, my_device, current_request);
+
+                // Remove processed message from buffer
+                data_in_stream -= msg_len;
+                memmove(stream_buf, stream_buf + msg_len, data_in_stream);
+            }
+
+            // Overflow only if the buffer is still full once every complete message
+            // has been drained: the next read() would then have no room at all.
+            if (data_in_stream >= STREAM_BUF_SIZE) {
+                printf("DAEMON: Stream buffer overflow\n");
+                exit(1);
+            }
+        }
+        else if (bytes_read == 0)
+        {
+            // No writer currently has the FIFO open, so read() reports end-of-file
+            // straight away. Sleep briefly instead of spinning until one connects.
+            usleep(1000);
+        }
+        else // bytes_read < 0
+        {
+            // An error occurred.
+            if (errno == EINTR) {
+                continue; // Interrupted by a signal, just try again.
+            }
+            perror("DAEMON: read from server FIFO");
+            break; // Exit on fatal error.
+        }
+    }
+
+    close(server_fd);
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s \n", argv[0]);
+        return 1;
+    }
+    const char* local_rank_str = argv[1];
+    int local_rank = atoi(local_rank_str);
+    hapiStartMemoryDaemon(local_rank);
+}
\ No newline at end of file
diff --git a/src/arch/cuda/hybridAPI/hapi_portable.h b/src/arch/cuda/hybridAPI/hapi_portable.h
new file mode 100644
index 0000000000..c772a7ceaf
--- /dev/null
+++ b/src/arch/cuda/hybridAPI/hapi_portable.h
@@ -0,0 +1,205 @@
+#pragma once
+
+// Backend-neutral hapi* aliases for the GPU runtime API. Each name below
+// expands to the corresponding CUDA runtime symbol when CMK_CUDA is defined,
+// or to the corresponding HIP symbol when CMK_HIP is defined.
+// NOTE(review): the two #undef lines discard any CMK_CUDA / CMK_HIP seen before
+// this point and rely on the include below to define them again; confirm that
+// "conv-mach-opt.h" is not include-guarded.
+#undef CMK_CUDA
+#undef CMK_HIP
+
+#include "conv-mach-opt.h"
+
+#ifdef CMK_CUDA
+
+#include
+#include
+
+#define hapiStream_t cudaStream_t
+
+#define hapiEvent_t cudaEvent_t
+
+#define hapiSetDevice(dev) cudaSetDevice(dev)
+
+#define hapiDevAttrClockRate cudaDevAttrClockRate
+#define hapiDeviceGetAttribute(a,b,c) cudaDeviceGetAttribute(a,b,c)
+
+#define hapiPeekAtLastError cudaPeekAtLastError
+#define hapiGetLastError cudaGetLastError
+#define hapiEventDefault cudaEventDefault
+#define hapiEventDisableTiming cudaEventDisableTiming
+
+#define hapiGetDeviceCount(devCount) cudaGetDeviceCount(devCount)
+
+#define hapiDeviceCanAccessPeer(canAccess, dev1, dev2) \
+    cudaDeviceCanAccessPeer(canAccess, dev1, dev2)
+
+#define hapiDeviceEnablePeerAccess(dev, flags) \
+    cudaDeviceEnablePeerAccess(dev, flags)
+
+// Arguments are forwarded positionally: event pointer first, then flags.
+#define hapiEventCreateWithFlags(event, flags) cudaEventCreateWithFlags(event, flags)
+
+#define hapiEventRecord(event, stream) cudaEventRecord(event, stream)
+#define hapiEventQuery(event) cudaEventQuery(event)
+#define hapiEventDestroy(event) cudaEventDestroy(event)
+#define hapiStreamWaitEvent(stream, event, flags) \
+    cudaStreamWaitEvent(stream, event, flags)
+
+#define hapiStreamSynchronize(stream) cudaStreamSynchronize(stream)
+#define hapiDeviceSynchronize cudaDeviceSynchronize
+#define hapiEventElapsedTime(a, b, c) cudaEventElapsedTime(a, b, c)
+#define hapiMemGetInfo(a, b) cudaMemGetInfo(a, b)
+#define hapiStreamCreate(stream) cudaStreamCreate(stream)
+#define hapiStreamDestroy cudaStreamDestroy
+#define hapiStreamDefault cudaStreamDefault
+#define hapiStreamNonBlocking cudaStreamNonBlocking
+#define hapiStreamCreateWithPriority cudaStreamCreateWithPriority
+
+#define hapiLaunchHostFunc(stream, func, args) \
+    cudaLaunchHostFunc(stream, func, args)
+
+#define hapiStreamPerThread cudaStreamPerThread
+
+#define hapiIpcMemHandle_t cudaIpcMemHandle_t
+
+#define hapiIpcEventHandle_t cudaIpcEventHandle_t
+
+#define hapiIpcGetMemHandle(handle, ptr) cudaIpcGetMemHandle(handle, ptr)
+#define hapiIpcCloseMemHandle(ptr) cudaIpcCloseMemHandle(ptr)
+
+#define hapiIpcGetEventHandle(handle, event) cudaIpcGetEventHandle(handle, event)
+
+#define hapiIpcOpenMemHandle(ptr, handle, flags) \
+    cudaIpcOpenMemHandle(ptr, handle, flags)
+
+#define hapiIpcOpenEventHandle(event, handle) \
+    cudaIpcOpenEventHandle(event, handle)
+
+#define hapiDeviceProp cudaDeviceProp
+
+#define hapiGetDeviceProperties(prop, dev) cudaGetDeviceProperties(prop, dev)
+#define hapiGetDevice(dev) cudaGetDevice(dev)
+
+#define hapiMalloc(ptr, size) cudaMalloc(ptr, size)
+#define hapiFree(ptr) cudaFree(ptr)
+#define hapiMallocHost(ptr, size) cudaMallocHost(ptr, size)
+#define hapiFreeHost(ptr) cudaFreeHost(ptr)
+
+#define hapiErrorMemoryAllocation cudaErrorMemoryAllocation
+#define hapiErrorInitializationError cudaErrorInitializationError
+#define hapiSuccess cudaSuccess
+#define hapiError_t cudaError_t
+
+#define hapiMemcpyKind cudaMemcpyKind
+#define hapiMemcpyHostToHost cudaMemcpyHostToHost
+#define hapiMemcpyHostToDevice cudaMemcpyHostToDevice
+#define hapiMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define hapiMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define hapiMemcpy(dst, src, count, kind) cudaMemcpy(dst, src, count, kind)
+#define hapiMemcpy2D(dst, dpitch, src, spitch, width, height, kind) \
+    cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind)
+
+#define hapiGetErrorString(err) cudaGetErrorString(err)
+
+#define hapiEventInterprocess cudaEventInterprocess
+#define hapiIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
+
+#define hapiMemcpyAsync cudaMemcpyAsync
+#define hapiMemcpy2DAsync cudaMemcpy2DAsync
+
+#endif // CMK_CUDA
+
+#ifdef CMK_HIP
+
+#include
+
+#define hapiStream_t hipStream_t
+
+#define hapiEvent_t hipEvent_t
+
+#define hapiSetDevice(dev) hipSetDevice(dev)
+#define hapiGetDeviceCount(devCount) hipGetDeviceCount(devCount)
+#define hapiDevAttrClockRate hipDeviceAttributeClockRate
+#define hapiDeviceGetAttribute(a,b,c) hipDeviceGetAttribute(a,b,c)
+
+#define hapiPeekAtLastError hipPeekAtLastError
+#define hapiGetLastError hipGetLastError
+
+#define hapiDeviceCanAccessPeer(canAccess, dev1, dev2) \
+    hipDeviceCanAccessPeer(canAccess, dev1, dev2)
+#define hapiDeviceEnablePeerAccess(dev, flags) \
+    hipDeviceEnablePeerAccess(dev, flags)
+
+// Arguments are forwarded positionally: event pointer first, then flags.
+#define hapiEventCreateWithFlags(event, flags) hipEventCreateWithFlags(event, flags)
+#define hapiEventRecord(event, stream) hipEventRecord(event, stream)
+#define hapiEventQuery(event) hipEventQuery(event)
+#define hapiEventDestroy(event) hipEventDestroy(event)
+#define hapiStreamWaitEvent(stream, event, flags) \
+    hipStreamWaitEvent(stream, event, flags)
+
+#define hapiStreamSynchronize(stream) hipStreamSynchronize(stream)
+#define hapiDeviceSynchronize hipDeviceSynchronize
+#define hapiEventElapsedTime(a, b, c) hipEventElapsedTime(a, b, c)
+#define hapiMemGetInfo(a, b) hipMemGetInfo(a, b)
+#define hapiLaunchHostFunc(stream, func, args) \
+    hipLaunchHostFunc(stream, func, args)
+
+#define hapiStreamPerThread hipStreamPerThread
+
+#define hapiIpcMemHandle_t hipIpcMemHandle_t
+
+#define hapiIpcEventHandle_t hipIpcEventHandle_t
+
+#define hapiIpcGetMemHandle(handle, ptr) hipIpcGetMemHandle(handle, ptr)
+#define hapiIpcCloseMemHandle(ptr) hipIpcCloseMemHandle(ptr)
+
+#define hapiIpcGetEventHandle(handle, event) hipIpcGetEventHandle(handle, event)
+
+#define hapiIpcOpenMemHandle(ptr, handle, flags) \
+    hipIpcOpenMemHandle(ptr, handle, flags)
+
+#define hapiIpcOpenEventHandle(event, handle) \
+    hipIpcOpenEventHandle(event, handle)
+
+#define hapiDeviceProp hipDeviceProp_t
+
+#define hapiGetDeviceProperties(prop, dev) hipGetDeviceProperties(prop, dev)
+#define hapiGetDevice(dev) hipGetDevice(dev)
+#define hapiStreamCreate(stream) hipStreamCreate(stream)
+#define hapiStreamDestroy hipStreamDestroy
+#define hapiStreamDefault hipStreamDefault
+#define hapiStreamNonBlocking hipStreamNonBlocking
+#define hapiStreamCreateWithPriority hipStreamCreateWithPriority
+
+#define hapiMalloc(ptr, size) hipMalloc(ptr, size)
+#define hapiFree(ptr) hipFree(ptr)
+#define hapiMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#define hapiFreeHost(ptr) hipHostFree(ptr)
+
+#define hapiErrorMemoryAllocation hipErrorMemoryAllocation
+#define hapiErrorInitializationError hipErrorInitializationError
+#define hapiSuccess hipSuccess
+#define hapiError_t hipError_t
+
+#define hapiMemcpyKind hipMemcpyKind
+#define hapiMemcpyHostToHost hipMemcpyHostToHost
+#define hapiMemcpyHostToDevice hipMemcpyHostToDevice
+#define hapiMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define hapiMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define hapiMemcpy(dst, src, count, kind) hipMemcpy(dst, src, count, kind)
+#define hapiMemcpy2D(dst, dpitch, src, spitch, width, height, kind) \
+    hipMemcpy2D(dst, dpitch, src, spitch, width, height, kind)
+#define hapiGetErrorString(err) hipGetErrorString(err)
+
+#define hapiEventDisableTiming hipEventDisableTiming
+#define hapiEventDefault hipEventDefault
+#define hapiEventInterprocess hipEventInterprocess
+#define hapiIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess
+
+#define hapiMemcpyAsync hipMemcpyAsync
+#define hapiMemcpy2DAsync hipMemcpy2DAsync
+
+#endif // CMK_HIP
diff --git 
a/src/arch/mpi/charmrun b/src/arch/mpi/charmrun index 901df49fc4..e8f188ad9e 100755 --- a/src/arch/mpi/charmrun +++ b/src/arch/mpi/charmrun @@ -74,6 +74,11 @@ do args=("$1" "$2" "${args[@]}") shift ;; + ++nodelist) + machinefile="$2" + args=("-machinefile" "$2" "${args[@]}") + shift + ;; ++quiet) QUIET=1 ;; diff --git a/src/arch/mpi/charmrun_elastic b/src/arch/mpi/charmrun_elastic new file mode 100755 index 0000000000..f9e17224c3 --- /dev/null +++ b/src/arch/mpi/charmrun_elastic @@ -0,0 +1,69 @@ +#!/bin/bash + +is_restart=false + +original_args=("$@") + +if [[ "$(uname)" == "Darwin" ]]; then + TMPDIR="/tmp" +else + TMPDIR="/dev/shm" +fi + +pes_file="$TMPDIR/numRestartProcs.txt" + +time { +while true; do + args=() + pes_args="" + restart_arg="" + + temp_args=("${original_args[@]}") + i=0 + while [ $i -lt ${#temp_args[@]} ]; do + arg="${temp_args[$i]}" + case "$arg" in + +p|++p) + i=$((i+1)) + pes_arg="$arg ${temp_args[$i]}" + ;; + +p[0-9]*) + pes_arg="$arg" + ;; + ++p[0-9]*) + pes_arg="$arg" + ;; + *) + args+=("$arg") + ;; + esac + i=$((i+1)) + done + + # 2. Check the flag. If it's a restart, prepare the extra argument. + if [ "$is_restart" = true ]; then + restart_arg="+restart $TMPDIR" + if [ -f "$pes_file" ]; then + num_pes=$(cat "$pes_file") + echo "Charm> Reading pes $num_pes from $pes_file" + pes_arg="+p $num_pes" + fi + fi + + # Pass all script arguments ("$@") to the executable + "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg + + EXIT_CODE=$? + + if [ "$EXIT_CODE" -eq 100 ]; then + is_restart=true + echo "Restart signal (code 100) received. Looping again." + echo "----------------------------------------" + else + echo "Final exit signal (code $EXIT_CODE) received. Exiting loop." + break + fi +done +} + +echo "Control loop finished." 
\ No newline at end of file diff --git a/src/arch/mpi/conv-common.h b/src/arch/mpi/conv-common.h index 06aaafd9e4..e3748b6237 100644 --- a/src/arch/mpi/conv-common.h +++ b/src/arch/mpi/conv-common.h @@ -50,3 +50,6 @@ #define CMK_USE_COMMON_LOCK 1 #define CMK_ONESIDED_IMPL 1 + +/* cuda aware mpi machine layer supports GPU-aware communication */ +#define CMK_GPU_COMM 1 diff --git a/src/arch/mpi/machine.C b/src/arch/mpi/machine.C index 368cb53d7c..7eb70ef7c9 100644 --- a/src/arch/mpi/machine.C +++ b/src/arch/mpi/machine.C @@ -1,16 +1,18 @@ - /** @file * MPI based machine layer * @ingroup Machine */ /*@{*/ +#include #include #include #include "converse.h" #include "cmirdmautils.h" #include #include +#include + #ifdef AMPI # warning "We got the AMPI version of mpi.h, instead of the system version--" @@ -41,6 +43,16 @@ static char* strsignal(int sig) { #include "machine.h" #include "pcqueue.h" +#include "conv-ccs.h" +#include "ccs-server.h" +#include "ckrescale.h" + +#if CMK_SHRINK_EXPAND +CcsDelayedReply shrinkExpandreplyToken; +extern int numProcessAfterRestart; +extern char *_shrinkexpand_basedir; +int mynewpe=0; +#endif /* Msg types to have different actions taken for different message types * REGULAR - Regular Charm++ message @@ -53,7 +65,12 @@ static char* strsignal(int sig) { * */ #define CMI_MSGTYPE(msg) ((CmiMsgHeaderBasic *)msg)->mpiMsgType -enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER_SEND, ONESIDED_BUFFER_RECV, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND}; +enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER_SEND, ONESIDED_BUFFER_RECV, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND, +#if CMK_CUDA + DEVICE_SEND_OP, + DEVICE_RECV_OP +#endif +}; /* =======Beginning of Definitions of Performance-Specific Macros =======*/ /* Whether to use multiple send queue in SMP mode */ @@ -349,6 +366,13 @@ typedef struct msg_list { struct msg_list *next; int size, destpe, mode, type; 
MPI_Request req; +#if CMK_CUDA + void* ptr; + size_t device_size; + DeviceRdmaOp* op; + uint64_t tag; + int dest_mpi_rank; +#endif #if CMK_ONESIDED_IMPL void *ref; // This field can store the pointer to any structure that might have to be accessed. @@ -409,6 +433,38 @@ static int SendMsgBuf(void); static void EnqueueMsg(void *m, int size, int node, int mode, int type, void *ref); #endif +#if CMK_CUDA + +CpvDeclare(int, tag_counter); + +MPI_Win globalDevWin = MPI_WIN_NULL; +void LrtsInitRMA() { + int result = MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &globalDevWin); + + if (result != MPI_SUCCESS) { + CmiAbort("RMA Window Creation Failed!"); + } +} + +void LrtsCleanupRMA() { + if (globalDevWin != MPI_WIN_NULL) { + MPI_Win_free(&globalDevWin); // This will fail if memory is still attached! + } +} + +#if CMK_SMP + +void* deviceRecvCallback(void* arg) { + DeviceRdmaOpMsg_* recv_msg = (DeviceRdmaOpMsg_*)arg; + CmiInvokeRecvHandler(recv_msg->op); + return NULL; +} +int deviceRecvCallbackHandler; + +#endif + +#endif + /* ### End of Machine-running Related Functions ### */ /* ### Beginning of Idle-state Related Functions ### */ @@ -431,6 +487,17 @@ void CmiNotifyIdleForMPI(void); #include "machine-ctrlmsg.C" #endif +void print_nodelist(char* arg_nodelist){ + FILE *f=fopen(arg_nodelist,"r"); + char c; + c = fgetc(f); + while (c != EOF) { + printf ("%c", c); + c = fgetc(f); + } + fclose(f); +} + SMSG_LIST *allocateSmsgList(char *msg, int destNode, int size, int mode, int type, void *ref) { SMSG_LIST *msg_tmp = (SMSG_LIST *) malloc(sizeof(SMSG_LIST)); msg_tmp->msg = msg; @@ -452,14 +519,6 @@ static void EnqueueMsg(void *m, int size, int node, int mode, int type, void *re /*SMSG_LIST *msg_tmp = (SMSG_LIST *) CmiAlloc(sizeof(SMSG_LIST));*/ SMSG_LIST *msg_tmp = allocateSmsgList((char *)m, node, size, mode, type, ref); MACHSTATE1(3,"EnqueueMsg to node %d {{ ", node); - msg_tmp->msg = (char *)m; - msg_tmp->size = size; - msg_tmp->destpe = node; - msg_tmp->next = 0; - 
msg_tmp->mode = mode; -#if CMK_ONESIDED_IMPL - msg_tmp->ref = NULL; -#endif #if MULTI_SENDQUEUE PCQueuePush(procState[CmiMyRank()].postMsgBuf,(char *)msg_tmp); @@ -625,6 +684,7 @@ static void ReleasePostedMessages(void) { MACHSTATE1(2,"ReleasePostedMessages begin on %d {", CmiMyPe()); while (msg_tmp!=0) { + int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); done =0; #if CMK_SMP_TRACE_COMMTHREAD || CMK_TRACE_COMMOVERHEAD double startT = CmiWallTimer(); @@ -678,8 +738,13 @@ static void ReleasePostedMessages(void) { // which is freed in the above code (either ONESIDED_BUFFER_DIRECT_RECV or // ONESIDED_BUFFER_DIRECT_SEND) } - else + #if CMK_CUDA + else if(msg_tmp->type == DEVICE_SEND_OP || msg_tmp->type == DEVICE_RECV_OP) { + // TODO: check if we can remove this + } + #endif #endif + else { CmiFree(msg_tmp->msg); } @@ -815,6 +880,7 @@ static int PumpMsgs(void) { CmiAbort("MPI_Iprobe failed\n"); if (!flg) break; + int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); CONDITIONAL_TRACE_USER_EVENT(70); /* MPI_Iprobe related user event */ recd = 1; @@ -824,7 +890,7 @@ static int PumpMsgs(void) { #if USE_ASYNC_RECV_FUNC if(nbytes >= IRECV_MSG_THRESHOLD) doSyncRecv = 0; #endif - if(doSyncRecv){ + if(doSyncRecv) { START_EVENT(); if (MPI_SUCCESS != MPI_Recv(msg,nbytes,MPI_BYTE,sts.MPI_SOURCE,sts.MPI_TAG, charmComm,&sts)) CmiAbort("PumpMsgs: MPI_Recv failed!\n"); @@ -846,7 +912,7 @@ static int PumpMsgs(void) { #endif /*end of !MPI_POST_RECV and !USE_MPI_CTRLMSG_SCHEME*/ - if(doSyncRecv){ + if (doSyncRecv) { MACHSTATE2(3,"PumpMsgs recv one from node:%d to rank:%d", sts.MPI_SOURCE, CMI_DEST_RANK(msg)); CMI_CHECK_CHECKSUM(msg, nbytes); #if CMK_ERROR_CHECKING @@ -1048,6 +1114,13 @@ static void PumpMsgsBlocking(void) { handleOneRecvedMsg(nbytes, msg); } +#if CMK_CUDA + #include + + std::vector> rdma_requests; + // a map to tell how many rdma requests are gone to some rank + std::map access_epochs; +#endif #if CMK_SMP @@ -1055,32 +1128,36 @@ static void PumpMsgsBlocking(void) { static int 
SendMsgBuf(void) { SMSG_LIST *msg_tmp; char *msg; - int node, rank, size; + // int node, rank, size; + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); int i; int sent = 0; -#if CMI_EXERT_SEND_CAP || CMI_DYNAMIC_EXERT_CAP - int sentCnt = 0; -#endif - -#if CMI_DYNAMIC_EXERT_CAP - dynamicSendCap = CMI_DYNAMIC_MAXCAPSIZE; -#endif - MACHSTATE(2,"SendMsgBuf begin {"); -#if MULTI_SENDQUEUE - for (i=0; i<_Cmi_mynodesize+1; i++) { /* subtle: including comm thread */ - if (!PCQueueEmpty(procState[i].postMsgBuf)) { - msg_tmp = (SMSG_LIST *)PCQueuePop(procState[i].postMsgBuf); -#else /* single message sending queue */ /* CmiLock(postMsgBufLock); */ msg_tmp = (SMSG_LIST *)PCQueuePop(postMsgBuf); /* CmiUnlock(postMsgBufLock); */ while (NULL != msg_tmp) { -#endif - #if CMK_ONESIDED_IMPL +#if CMK_CUDA + if (msg_tmp->type == DEVICE_SEND_OP) { + if (MPI_Win_attach(globalDevWin, msg_tmp->ptr, msg_tmp->device_size) != MPI_SUCCESS) + CmiAbort("MPI_Win_attach failed\n"); + } else if(msg_tmp->type == DEVICE_RECV_OP) { + if (access_epochs[msg_tmp->op->src_mpi_rank] == 0) + MPI_Win_lock(MPI_LOCK_SHARED, msg_tmp->op->src_mpi_rank, 0, globalDevWin); + access_epochs[msg_tmp->op->src_mpi_rank]++; + MPI_Request req; + int result = MPI_Rget((void*)msg_tmp->op->dest_ptr, msg_tmp->op->size, MPI_BYTE, + msg_tmp->op->src_mpi_rank, (MPI_Aint)(msg_tmp->op->tag), msg_tmp->op->size, + MPI_BYTE, globalDevWin, &req); + if (result != MPI_SUCCESS) + CmiAbort("LrtsRecvDevice: MPI_Get failed!\n"); + rdma_requests.push_back({req, msg_tmp->op}); + } else +#endif if(msg_tmp->type == ONESIDED_BUFFER_DIRECT_RECV || msg_tmp->type == ONESIDED_BUFFER_DIRECT_SEND) { NcpyOperationInfo *ncpyOpInfo = (NcpyOperationInfo *)(msg_tmp->ref); MPISendOrRecvOneBuffer(msg_tmp, ncpyOpInfo->tag); @@ -1091,24 +1168,8 @@ static int SendMsgBuf(void) { MPISendOneMsg(msg_tmp); } sent=1; - -#if CMI_EXERT_SEND_CAP - if (++sentCnt == SEND_CAP) break; -#elif CMI_DYNAMIC_EXERT_CAP - if (++sentCnt >= dynamicSendCap) break; - if 
(CpvAccess(MsgQueueLen) > CMI_DYNAMIC_OUTGOING_THRESHOLD) - dynamicSendCap = CMI_DYNAMIC_SEND_CAPSIZE; -#endif - -#if ! MULTI_SENDQUEUE - /* CmiLock(postMsgBufLock); */ msg_tmp = (SMSG_LIST *)PCQueuePop(postMsgBuf); - /* CmiUnlock(postMsgBufLock); */ -#endif } -#if MULTI_SENDQUEUE - } -#endif MACHSTATE(2,"}SendMsgBuf end "); return sent; } @@ -1144,6 +1205,120 @@ static double sendtime = 0.0; #endif //end of CMK_SMP +#if CMK_CUDA + +#if CMK_SMP + +void processRdmaRequests() { + int n = rdma_requests.size(); + if (n == 0) return; + + static std::vector requests(10); + static std::vector indices(10); + static std::vector statuses(10); + + if(n > requests.size()) { + requests.resize(n); indices.resize(n); statuses.resize(n); + } + + for (int i = 0; i < n; i++) + requests[i] = rdma_requests[i].first; + + int outcount; + MPI_Testsome(n, + requests.data(), + &outcount, + indices.data(), + statuses.data()); + + if (outcount == MPI_UNDEFINED || outcount == 0) + return; + + for (int i = 0; i < outcount; i++) { + int idx = indices[i]; + + auto &entry = rdma_requests[idx]; + DeviceRdmaOp *op = entry.second; + + DeviceRdmaOpMsg_* conv_msg = + (DeviceRdmaOpMsg_*)CmiAlloc(sizeof(DeviceRdmaOpMsg_)); + + conv_msg->op = op; + + access_epochs[op->src_mpi_rank]--; + if (access_epochs[op->src_mpi_rank] == 0) + MPI_Win_unlock(op->src_mpi_rank, globalDevWin); + + CmiSetHandler(conv_msg, deviceRecvCallbackHandler); + CmiPushPE(CmiRankOf(op->dest_pe), conv_msg); + + rdma_requests[idx].first = MPI_REQUEST_NULL; + } + + rdma_requests.erase( + std::remove_if(rdma_requests.begin(), + rdma_requests.end(), + [](const std::pair &e) { + return e.first == MPI_REQUEST_NULL; + }), + rdma_requests.end()); +} + +#else + +void processRdmaRequests() { + int n = rdma_requests.size(); + if (n == 0) return; + + static std::vector requests(10); + static std::vector indices(10); + static std::vector statuses(10); + + if(n > requests.size()) { + requests.resize(n); indices.resize(n); statuses.resize(n); + } + 
+ for (int i = 0; i < n; i++) + requests[i] = rdma_requests[i].first; + + int outcount; + MPI_Testsome(n, + requests.data(), + &outcount, + indices.data(), + statuses.data()); + + if (outcount == MPI_UNDEFINED || outcount == 0) + return; + + for (int i = 0; i < outcount; i++) { + int idx = indices[i]; + + auto &entry = rdma_requests[idx]; + DeviceRdmaOp *op = entry.second; + + access_epochs[op->src_mpi_rank]--; + if (access_epochs[op->src_mpi_rank] == 0) + MPI_Win_unlock(op->src_mpi_rank, globalDevWin); + + CmiInvokeRecvHandler(op); + + rdma_requests[idx].first = MPI_REQUEST_NULL; + } + + rdma_requests.erase( + std::remove_if(rdma_requests.begin(), + rdma_requests.end(), + [](const std::pair &e) { + return e.first == MPI_REQUEST_NULL; + }), + rdma_requests.end()); +} + +#endif + +#endif + void LrtsAdvanceCommunication(int whenidle) { #if REPORT_COMM_METRICS double t1, t2, t3, t4; @@ -1158,10 +1333,10 @@ void LrtsAdvanceCommunication(int whenidle) { #endif ReleasePostedMessages(); + #if REPORT_COMM_METRICS t3 = CmiWallTimer(); #endif - SendMsgBuf(); #if REPORT_COMM_METRICS @@ -1171,6 +1346,10 @@ void LrtsAdvanceCommunication(int whenidle) { sendtime += (t4-t3); #endif +#if CMK_CUDA + processRdmaRequests(); +#endif + #else /* non-SMP case */ ReleasePostedMessages(); @@ -1185,6 +1364,10 @@ void LrtsAdvanceCommunication(int whenidle) { releasetime += (t2-t1); #endif +#if CMK_CUDA + processRdmaRequests(); +#endif + #endif /* end of #if CMK_SMP */ } /* ######End of functions related with communication progress ###### */ @@ -1322,6 +1505,9 @@ void LrtsExit(int exitcode) { sigaction(SIGINT, &signal_int, NULL); #else signal(SIGINT, signal_int); +#endif +#if CMK_CUDA + LrtsCleanupRMA(); #endif MPI_Finalize(); #endif @@ -1388,6 +1574,12 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) { char** largv=*argv; int tagUbGetResult; void *tagUbVal; + char* arg_nodelist; + + /*if (CmiGetArgStringDesc(argv, "++nodelist", &arg_nodelist, "nodelist")) + { + 
print_nodelist(arg_nodelist); + }*/ if (CmiGetArgFlag(largv, "+comm_thread_only_recv")) { #if CMK_SMP @@ -1518,6 +1710,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) { if (newpe == -1) { MPI_Barrier(charmComm); //MPI_Barrier(charmComm); + LrtsCleanupRMA(); MPI_Finalize(); exit(0); } @@ -1681,6 +1874,12 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) { rdmaTagLock = CmiCreateLock(); #endif #endif + +#if CMK_CUDA + LrtsInitRMA(); + CpvInitialize(int, tag_counter); + CpvAccess(tag_counter) = 0; +#endif } INLINE_KEYWORD void LrtsNotifyIdle(void) {} @@ -1818,6 +2017,7 @@ void LrtsPostCommonInit(int everReturn) { /* ######End of functions related with starting programs###### */ /*********************************************************************** + * * * Abort function: * @@ -1989,6 +2189,56 @@ int CmiBarrierZero(void) { } +#if CMK_SHRINK_EXPAND +void ConverseCleanup(void) +{ + MACHSTATE(2,"ConverseCleanup {"); + + #if (CMK_SMP && !CMK_SMP_NO_COMMTHD) + CmiAbort(" ConverseCleanup called in SMP. CmiBarrier needs to be called on comm thread as well! Right now, this hangs. Remove this abort when SMP support implemented.\n"); + #endif + CmiBarrier(); // TODO: for smp, this must also be called on comm thread. 
otherwise, hangs + +#if CMK_USE_SYSVSHM + CmiExitSysvshm(); +#elif CMK_USE_PXSHM + CmiExitPxshm(); +#endif + ConverseCommonExit(); /* should be called by every rank */ + CmiNodeBarrier(); /* single node SMP, make sure every rank is done */ + //if (CmiMyRank()==0) CmiStdoutFlush(); + + if (get_shrinkexpand_exit() && CmiMyPe() == 0) { + // launch charmrun here + + std::string path = std::string(_shrinkexpand_basedir) + "/numRestartProcs.txt"; + FILE *fp = fopen(path.c_str(), "w"); + if (fp != NULL) { + CmiPrintf("Charm> Writing numProcessAfterRestart %i to %s\n", numProcessAfterRestart, path.c_str()); + fprintf(fp, "%d", numProcessAfterRestart); + fclose(fp); + } else { + perror("Error opening file"); + } + + // Use the new synchronous reply function. This blocks until the reply is + // sent and acknowledged by charmrun, robustly fixing the race condition. + CcsSendDelayedReplyAndTerm(shrinkExpandreplyToken, 0, 0); + + CmiBarrier(); + ConverseExit(100); + + } else { + // kill all other processes + CmiBarrier(); + //printf("Exiting PE %d\n", CmiMyPe()); + //fflush(stdout); + ConverseExit(); + } +} +#endif + + #if CMK_MEM_CHECKPOINT || CMK_MESSAGE_LOGGING void mpi_restart_crashed(int pe, int rank) @@ -2057,6 +2307,7 @@ void CkDieNow(void) } MPI_Barrier(charmComm); // MPI_Barrier(charmComm); + LrtsCleanupRMA(); MPI_Finalize(); exit(0); #endif @@ -2357,6 +2608,63 @@ void CmiSetupMachineRecvBuffersUser(void) } /*=======End of Msg Histogram or Dynamic Post-Recv Related Funcs======*/ +#if CMK_CUDA + +#include +std::map, uint64_t> cache_window; + +void LrtsSendDevice(int dest_mpi_rank, int src_mpi_rank, const void*& ptr, size_t size, uint64_t& tag) { + if(cache_window.find(std::make_pair((void*)ptr, size)) != cache_window.end()) { + tag = cache_window[std::make_pair((void*)ptr, size)]; + } else { +#if CMK_SMP + SMSG_LIST *msg_tmp = (SMSG_LIST *) malloc(sizeof(SMSG_LIST)); + msg_tmp->ptr = (void*)(ptr); + msg_tmp->device_size = size; + msg_tmp->type = DEVICE_SEND_OP; + 
msg_tmp->dest_mpi_rank = dest_mpi_rank; + msg_tmp->tag = (uint64_t)(void*)(ptr); + PCQueuePush(postMsgBuf,(char *)msg_tmp); +#else + if (MPI_Win_attach(globalDevWin, (void*)ptr, size) != MPI_SUCCESS) CmiAbort("MPI_Win_attach failed\n"); +#endif + // tag is the virtual address of the buffer + tag = (uint64_t)(void*)(ptr); + cache_window[{(void*)ptr, size}] = tag; + } +} + +std::map handler_registered; + +void LrtsRecvDevice(DeviceRdmaOp* op, DeviceRecvType type) +{ +#if CMK_SMP + if(handler_registered[CmiMyPe()] == false) { + deviceRecvCallbackHandler = CmiRegisterHandler((CmiHandler) deviceRecvCallback); + handler_registered[CmiMyPe()] = true; + } + SMSG_LIST *msg_tmp = (SMSG_LIST *) malloc(sizeof(SMSG_LIST)); + msg_tmp->op = op; + msg_tmp->type = DEVICE_RECV_OP; + PCQueuePush(postMsgBuf,(char *)msg_tmp); +#else + if (access_epochs[op->src_mpi_rank] == 0) { + MPI_Win_lock(MPI_LOCK_SHARED, op->src_mpi_rank, 0, globalDevWin); + } + access_epochs[op->src_mpi_rank]++; + MPI_Request req; + int result = MPI_Rget((void*)op->dest_ptr, op->size, MPI_BYTE, + op->src_mpi_rank, (MPI_Aint)(op->tag), op->size, + MPI_BYTE, globalDevWin, &req); + if (result != MPI_SUCCESS) { + CmiAbort("LrtsRecvDevice: MPI_Get failed!\n"); + } + rdma_requests.push_back({req, op}); +#endif +} + +#endif // CMK_CUDA + /*@}*/ diff --git a/src/arch/netlrts/machine-eth.C b/src/arch/netlrts/machine-eth.C index 3f89d2c9c2..4d68d01c2f 100644 --- a/src/arch/netlrts/machine-eth.C +++ b/src/arch/netlrts/machine-eth.C @@ -92,12 +92,14 @@ int CheckSocketsReady(int withDelayMs) } CmiStdoutCheck(CMK_PIPE_SUB); - if (Cmi_charmrun_fd!=-1) + if (Cmi_charmrun_fd!=-1) { ctrlskt_ready_read = CMK_PIPE_CHECKREAD(Cmi_charmrun_fd); + } if (dataskt!=-1) { - dataskt_ready_read = CMK_PIPE_CHECKREAD(dataskt); - if (dataWrite) - dataskt_ready_write = CMK_PIPE_CHECKWRITE(dataskt); + dataskt_ready_read = CMK_PIPE_CHECKREAD(dataskt); + if (dataWrite) { + dataskt_ready_write = CMK_PIPE_CHECKWRITE(dataskt); + } } return nreadable; } 
diff --git a/src/arch/netlrts/machine.C b/src/arch/netlrts/machine.C index fdb0d72403..1ebda70353 100644 --- a/src/arch/netlrts/machine.C +++ b/src/arch/netlrts/machine.C @@ -251,7 +251,7 @@ int _kq = -1; #if CMK_SHRINK_EXPAND extern void resumeAfterRealloc(void); extern char willContinue; -extern int mynewpe; +int mynewpe=0; extern int numProcessAfterRestart; CcsDelayedReply shrinkExpandreplyToken; extern char *_shrinkexpand_basedir; @@ -598,7 +598,6 @@ int Cmi_isOldProcess = 0; // means this process was already there static int Cmi_mynewpe = 0; static int Cmi_oldpe = 0; static int Cmi_newnumnodes = 0; -int Cmi_myoldpe = 0; static int Cmi_charmrun_assigned_pe; #endif @@ -653,11 +652,17 @@ static void parse_netstart(void) int nread; int port; ns = getenv("NETSTART"); + int dummy; +#if CMK_SHRINK_EXPAND + int* ptr = Cmi_isOldProcess == 1 ? &dummy : &Lrts_myNode; +#else + int* ptr = &Lrts_myNode; +#endif if (ns!=0) {/*Read values set by Charmrun*/ char Cmi_charmrun_name[1024]; nread = sscanf(ns, "%d%s%d%d%d", - &Lrts_myNode, + ptr, Cmi_charmrun_name, &Cmi_charmrun_port, &Cmi_charmrun_pid, &port); Cmi_charmrun_IP=skt_lookup_ip(Cmi_charmrun_name); @@ -666,11 +671,9 @@ static void parse_netstart(void) fprintf(stderr,"Error parsing NETSTART '%s'\n",ns); exit(1); } + #if CMK_SHRINK_EXPAND Cmi_charmrun_assigned_pe = Lrts_myNode; - if (Cmi_isOldProcess) { - Cmi_myoldpe = Lrts_myNode; - } #endif if (getenv("CmiLocal") != NULL) { /* ++local */ /* CmiMyLocalRank is used for setting default cpu affinity */ @@ -1395,8 +1398,10 @@ static void open_charmrun_socket(void) dataskt=skt_datagram(&dataport, Cmi_os_buffer_size); #endif MACHSTATE2(5,"skt_connect at dataskt:%d Cmi_charmrun_port:%d",dataskt, Cmi_charmrun_port); + //printf("skt_connect at dataskt:%d Cmi_charmrun_port:%d",dataskt, Cmi_charmrun_port); Cmi_charmrun_fd = skt_connect(Cmi_charmrun_IP, Cmi_charmrun_port, 1800); MACHSTATE2(5,"Opened connection to charmrun at socket %d, dataport=%d", Cmi_charmrun_fd, dataport); + 
//printf("Opened connection to charmrun at socket %d, dataport=%d", Cmi_charmrun_fd, dataport); skt_tcp_no_nagle(Cmi_charmrun_fd); } @@ -1434,7 +1439,9 @@ static void send_singlenodeinfo(void) memset(&me, 0, sizeof(me)); #if CMK_SHRINK_EXPAND - me.nodeNo = ChMessageInt_new(Cmi_charmrun_assigned_pe); + me.nodeNo = ChMessageInt_new(Lrts_myNode); + //if (Cmi_isOldProcess && ChMessageInt(me.nodeNo) == 3) + // exit(1); #else me.nodeNo = ChMessageInt_new(Lrts_myNode); #endif @@ -1454,6 +1461,8 @@ static void send_singlenodeinfo(void) use non-locking version */ ctrl_sendone_nolock("initnode", (const char *)&me, sizeof(me), NULL, 0); MACHSTATE1(5, "send initnode - dataport:%d", dataport); + //fprintf(stderr, "send initnode - dataport:%d", dataport); + //fflush(stderr); MACHSTATE(3, "initnode sent"); } @@ -1798,7 +1807,7 @@ void LrtsPostCommonInit(int everReturn) CmiAbort("Charm++ Fatal Error: interrupt mode does not work with default system memory allocator. Run with +netpoll to disable the interrupt."); } #endif - } + } #if MEMORYUSAGE_OUTPUT memoryusage_counter = 0; @@ -1904,7 +1913,7 @@ void ConverseCleanup(void) if (CmiMyPe() == 0) { if (willContinue) { - CcsSendDelayedReply(shrinkExpandreplyToken, 0, 0); //reply to CCS client + //CcsSendDelayedReply(shrinkExpandreplyToken, 0, 0); //reply to CCS client // wait for this message to receive, hack // TODO: figure out why this is important usleep(500); @@ -2146,14 +2155,16 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) #if CMK_SHRINK_EXPAND if (Cmi_isOldProcess == 1) { Lrts_myNode = Cmi_mynewpe; - Cmi_myoldpe = Cmi_oldpe; + Cmi_charmrun_assigned_pe = Lrts_myNode; + //if (Cmi_isOldProcess && Lrts_myNode == 3) + // exit(1); Lrts_numNodes = Cmi_newnumnodes; } #endif /* NOTE: can not acutally call timer before timerInit ! 
GZ */ #if CMK_SHRINK_EXPAND - MACHSTATE3(2,"After reorg %d %d %d \n", Cmi_oldpe, Lrts_myNode, Lrts_numNodes); + CmiPrintf("After reorg %d %d %d \n", Cmi_oldpe, Lrts_myNode, Lrts_numNodes); #endif MACHSTATE2(5,"Init: (netpoll=%d), (idlepoll=%d)",Cmi_netpoll,Cmi_idlepoll); diff --git a/src/arch/ucx/charmrun b/src/arch/ucx/charmrun index 80a1fc23ee..fde570824f 100755 --- a/src/arch/ucx/charmrun +++ b/src/arch/ucx/charmrun @@ -70,6 +70,10 @@ do args+=("$1" "$2") shift ;; + ++nodelist) + machinefile="$2" + shift + ;; ++quiet) QUIET=1 ;; @@ -114,6 +118,15 @@ done args+=("${charm_args[@]}") +# Set machinefile options for mpirun and srun if ++nodelist was specified +if [[ -n "$machinefile" ]]; then + mpirun_machinefile_opt=(-machinefile "$machinefile") + srun_nodelist_opt=(--nodelist="$machinefile") +else + mpirun_machinefile_opt=() + srun_nodelist_opt=() +fi + if [[ "$DEBUG" = '1' || "$DEBUG_NO_PAUSE" = '1' ]] then if [[ -z "$DEBUGGER" ]] @@ -167,7 +180,7 @@ then else #someday this should be pmix, but our pmix launcher needs some work # on machines like NCSA Delta this is the most robust solution known to me - runCmd srun --mpi=pmi2 -n "$nodes" "${args[@]}" + runCmd srun --mpi=pmi2 -n "$nodes" --exact "${args[@]}" fi elif [[ -n "$PBS_NODEFILE" ]] then @@ -386,16 +399,13 @@ then mpirun_cmd="$(command -v mpirun 2>/dev/null)" if [[ -n "$mpirun_cmd" ]] then - [[ -n "$MPI_MACHINEFILE" ]] && args=(-machinefile "$MPI_MACHINEFILE" "${args[@]}") setarch_cmd="$(command -v setarch 2>/dev/null)" if [[ -n "$setarch_cmd" && -x "$setarch_cmd" ]] then - # Disables randomization of the virtual address space (turns on - # ADDR_NO_RANDOMIZE). 
cur_arch="$(uname -m)" - runCmd "$setarch_cmd" "$cur_arch" -R mpirun -np "$nodes" "${args[@]}" + runCmd "$setarch_cmd" "$cur_arch" -R mpirun "${mpirun_machinefile_opt[@]}" -np "$nodes" "${args[@]}" else - runCmd mpirun -np "$nodes" "${args[@]}" + runCmd mpirun "${mpirun_machinefile_opt[@]}" -np "$nodes" "${args[@]}" fi else mpiexec_cmd="$(command -v mpiexec 2>/dev/null)" @@ -412,7 +422,7 @@ then #mpirun is checked before srun to support the Bridges supercomputer at PSC #as srun has a known issue and fails to successfully launch the parallel job. #This is required to run the nightly ofi autobuild. - runCmd srun -n "$nodes" -c $(( ppn + 1 )) "${args[@]}" + runCmd srun "${srun_nodelist_opt[@]}" -n "$nodes" -c $(( ppn + 1 )) "${args[@]}" else echo "No job launcher found! (tried aprun, mpirun and srun)" exit 1 diff --git a/src/arch/ucx/machine.C b/src/arch/ucx/machine.C index c8bc798a34..fbbe775a83 100644 --- a/src/arch/ucx/machine.C +++ b/src/arch/ucx/machine.C @@ -10,6 +10,9 @@ #include #include "converse.h" +#include "conv-ccs.h" +#include "ccs-server.h" +#include "ckrescale.h" #include "cmirdmautils.h" #include "machine.h" #include "pcqueue.h" @@ -30,6 +33,13 @@ #include "runtime-pmix.C" #endif +#if CMK_SHRINK_EXPAND +CcsDelayedReply shrinkExpandreplyToken; +extern int numProcessAfterRestart; +extern char *_shrinkexpand_basedir; +int mynewpe=0; +#endif + #define CmiSetMsgSize(msg, sz) ((((CmiMsgHeaderBasic *)msg)->size) = (sz)) #define UCX_MSG_PROBE_THRESH 32768 @@ -279,7 +289,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) ucp_worker_params_t wParams; ucs_status_t status; int ret; - + ret = runtime_init(myNodeID, numNodes); UCX_CHECK_PMI_RET(ret, "runtime_init"); @@ -768,6 +778,42 @@ void LrtsExit(int exitcode) } } +void LrtsCleanup() +{ + int ret; + int i; + UcxRequest *req; + ucs_status_t status; + + UCX_LOG(4, "LrtsExit"); + + LrtsAdvanceCommunication(0); + + for (i = 0; i < ucxCtx.numRxReqs; ++i) { + req = ucxCtx.rxReqs[i]; + 
CmiFree(req->msgBuf); + ucp_request_cancel(ucxCtx.worker, req); + ucp_request_free(req); + } + + ucp_worker_destroy(ucxCtx.worker); + ucp_cleanup(ucxCtx.context); + + CmiFree(ucxCtx.eps); + CmiFree(ucxCtx.rxReqs); +#if CMK_SMP + PCQueueDestroy(ucxCtx.txQueue); +#endif + + if(!CharmLibInterOperate || userDrivenMode) { + ret = runtime_barrier(); + UCX_CHECK_PMI_RET(ret, "runtime_barrier"); + + ret = runtime_fini(); + UCX_CHECK_PMI_RET(ret, "runtime_fini"); + } +} + #if CMK_MACHINE_PROGRESS_DEFINED void CmiMachineProgressImpl() { @@ -777,6 +823,45 @@ void CmiMachineProgressImpl() } #endif + +#if CMK_SHRINK_EXPAND +void ConverseCleanup(void) +{ + MACHSTATE(2,"ConverseCleanup {"); + + CmiBarrier(); + +#if CMK_USE_SYSVSHM + CmiExitSysvshm(); +#elif CMK_USE_PXSHM + CmiExitPxshm(); +#endif + ConverseCommonExit(); /* should be called by every rank */ + CmiNodeBarrier(); /* single node SMP, make sure every rank is done */ + //if (CmiMyRank()==0) CmiStdoutFlush(); + + if (get_shrinkexpand_exit() && CmiMyPe() == 0) { + // launch charmrun here + + std::string path = std::string(_shrinkexpand_basedir) + "/numRestartProcs.txt"; + FILE *fp = fopen(path.c_str(), "w"); + if (fp != NULL) { + fprintf(fp, "%d", numProcessAfterRestart); + fclose(fp); + } + + CmiBarrier(); + ConverseExit(100); + } else { + // kill all other processes + CmiBarrier(); + //printf("Exiting PE %d\n", CmiMyPe()); + //fflush(stdout); + ConverseExit(); + } +} +#endif + // In CMK_SMP, this is called by worker thread void LrtsPostNonLocal() { @@ -840,7 +925,7 @@ void UcxRecvDeviceCompleted(void* request, ucs_status_t status, } } -void LrtsSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag) { +void LrtsSendDevice(int dest_pe, int src_pe, const void*& ptr, size_t size, uint64_t& tag) { // FIXME: Is this tag generation OK? 
tag = ((uint64_t)CpvAccess(tag_counter)++ << (UCX_TAG_PE_BITS + UCX_TAG_MSG_BITS)) | (CmiMyPe() << UCX_TAG_MSG_BITS) | UCX_MSG_TAG_DEVICE; #if CMK_SMP diff --git a/src/arch/util/machine-broadcast.C b/src/arch/util/machine-broadcast.C index bcca7da608..49473a44e0 100644 --- a/src/arch/util/machine-broadcast.C +++ b/src/arch/util/machine-broadcast.C @@ -295,10 +295,10 @@ void CmiSyncBroadcastFn1(int size, char *msg) { } #endif - for ( i=mype+1; i<_Cmi_numpes; i++ ) + for ( int i=mype+1; i<_Cmi_numpes; i++ ) CmiSyncSendFn(i, size, msg) ; - for ( i=0; iobj); CkCallstackPush(((CkChareThreadListener *)l)->obj); } @@ -470,7 +471,7 @@ void CkSectionID::pup(PUP::er &p) { /**** Tiny random API routines */ -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP void CUDACallbackManager(void *fn, void *msg) { if (fn) { ((CkCallback*)fn)->send(msg); @@ -574,6 +575,7 @@ int CkGetArgc(void) { } Chare *CkActiveObj(void) { + // printf("[PE %d] getting active: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size()); auto &objs = *(&CkpvAccess(runningObjs)); if (objs.empty()) { return nullptr; @@ -584,10 +586,12 @@ Chare *CkActiveObj(void) { inline void _pushObj(Chare *obj) { CkpvAccess(runningObjs).emplace_back(obj); + // printf("[PE %d] pushObj: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size()); } inline Chare *_popObj(void) { auto &objs = *(&CkpvAccess(runningObjs)); + // printf("[PE %d] popobj: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size()); if (objs.empty()) { return nullptr; } else { @@ -620,10 +624,16 @@ void CkCallstackPush(Chare *obj) { // removes all instances of ( obj ) from the stack void CkCallstackUnwind(Chare *obj) { + // printf("[%d] removing all instances of obj %p\n",CkMyPe(), obj); + CkAssertMsg(obj != nullptr, "expected a valid object!"); auto &objs = *(&CkpvAccess(runningObjs)); auto start = std::begin(objs); auto end = std::end(objs); + // for(auto it=start;it!=end;it++) + // { + // printf("objects still in the stack are %p\n", *it); + 
// } // ensures that all copies of the object are null'd while (end != (start = std::find(start, end, obj))) { *start = nullptr; @@ -2639,14 +2649,14 @@ void CkArrayExtSend_multi(int aid, int *idx, int ndims, int epIdx, int num_bufs, // HAPI support -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP #include "hapi.h" #endif void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid) { - #if CMK_CUDA - cudaStream_t stream_ptr = (cudaStream_t)stream; + #if CMK_CUDA || CMK_HIP + hapiStream_t stream_ptr = (hapiStream_t)stream; CkCallback callback(cb, (void *) fid); hapiAddCallback(stream_ptr, callback, NULL); diff --git a/src/ck-core/ckcallback.C b/src/ck-core/ckcallback.C index fc5aea7758..337a9255c1 100644 --- a/src/ck-core/ckcallback.C +++ b/src/ck-core/ckcallback.C @@ -401,7 +401,7 @@ void CkCallback::send(void *msg,int opts) const break; case sendArray: //Send message to an array element if (!msg) msg=CkAllocSysMsg(); - if (d.array.hasRefnum) CkSetRefNum(msg, d.array.refnum); + if (d.array.hasRefnum) CkSetRefNum(msg, d.array.refnum); CkSetMsgArrayIfNotThere(msg, policy); CkSendMsgArray(d.array.ep, msg, d.array.id, d.array.idx.asChild(), opts); break; diff --git a/src/ck-core/ckcheckpoint.C b/src/ck-core/ckcheckpoint.C index 06f2b9cbdf..ca9c1f8de2 100644 --- a/src/ck-core/ckcheckpoint.C +++ b/src/ck-core/ckcheckpoint.C @@ -16,8 +16,11 @@ More documentation goes here... #include using std::ostringstream; #include +#include +#include #include "charm++.h" #include "ck.h" +#include "ckrescale.h" #include "ckcheckpoint.h" #include "CkCheckpoint.decl.h" #include @@ -36,23 +39,6 @@ void noopit(const char*, ...) 
CkGroupID _sysChkptWriteMgr; CkGroupID _sysChkptMgr; -struct GroupInfo -{ - CkGroupID gID; - int MigCtor; - std::string name; - bool present; - - void pup(PUP::er& p) - { - p | gID; - p | MigCtor; - p | name; - p | present; - } -}; - -bool _inrestart = false; bool _restarted = false; int _oldNumPes = 0; bool _chareRestored = false; @@ -60,10 +46,10 @@ double chkptStartTimer = 0; #if CMK_SHRINK_EXPAND int originalnumGroups = -1; extern int Cmi_isOldProcess; -extern int Cmi_myoldpe; extern char *_shrinkexpand_basedir; #endif + // Required for broadcasting RO Data after recovering from failure #if CMK_SMP extern std::atomic numZerocopyROops; @@ -99,14 +85,15 @@ private: public: ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){}; void addLocation(CkLocation &loc) { - CkArrayIndex idx=loc.getIndex(); - CkGroupID gID = locMgr->ckGetGroupID(); - CmiUInt8 id = loc.getID(); - p|gID; // store loc mgr's GID as well for easier restore - p|idx; - p|id; - p|loc; - //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print(); + CkArrayIndex idx=loc.getIndex(); + //CkPrintf("[%d] Packing index dim = %i, %s\n", CkMyPe(), idx.dimension, idx2str(idx)); + CkGroupID gID = locMgr->ckGetGroupID(); + CmiUInt8 id = loc.getID(); + p|gID; // store loc mgr's GID as well for easier restore + p|idx; + p|id; + p|loc; + //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print(); } }; @@ -148,7 +135,7 @@ static void bdcastROGroupData(void){ CkPupROData(ps); int ROSize = ps.size(); - CkPupGroupData(ps1); + //CkPupGroupData(ps1); int GroupSize = ps1.size(); char *msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + 2*sizeof(int) + ps.size() + ps1.size()); @@ -164,7 +151,7 @@ static void bdcastROGroupData(void){ PUP::toMem pp((char *)payloadOffset, PUP::er::IS_CHECKPOINT); CkPupROData(pp); - CkPupGroupData(pp); + //CkPupGroupData(pp); CmiSetHandler(msg, _ROGroupRestartHandlerIdx); CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes + 2*sizeof(int) + pp.size(), msg); @@ -258,6 +245,36 @@ public: 
CProxy_CkCheckpointMgr(_sysChkptMgr)[index].Checkpoint(dirname, cb, requestStatus); } + void RescaleCheckpoint(const char* dirname, CkCallback cb, std::vector avail, + bool requestStatus = false, int writersPerNode = 0) + { + // If currently checkpointing, drop new requests + if (inProgress) return; + inProgress = true; + numComplete = 0; + + set_shrinkexpand_exit(true); // Set this flag to indicate that we are in the process of shrinking/expanding + + if (writersPerNode > 0) numWriters = std::min(writersPerNode, nodeSize); + + // Save params for future invocations and kick off the first numWriters PEs to start + // checkpointing + this->dirname = dirname; + this->cb = cb; + this->requestStatus = requestStatus; + +#if CMK_SHRINK_EXPAND + if (CkMyPe() != 0) + { + se_avail_vector = (char*) malloc(CkNumPes() * sizeof(char)); + memcpy(se_avail_vector, avail.data(), CkNumPes() * sizeof(char)); + } +#endif + + for (index = firstPE; index < firstPE + numWriters; index++) + CProxy_CkCheckpointMgr(_sysChkptMgr)[index].Checkpoint(dirname, cb, requestStatus); + } + void FinishedCheckpoint() { numComplete++; @@ -295,109 +312,135 @@ public: // broadcast void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback cb, bool _requestStatus){ +#if CMK_SHRINK_EXPAND + std::vector avail(se_avail_vector, se_avail_vector + CkNumPes()); + int chckPtId = CmiPhysicalRank(CmiMyPe()); +#else + int chckPtId = CmiPhysicalRank(CmiMyPe()); +#endif chkptStartTimer = CmiWallTimer(); - requestStatus = _requestStatus; - // make dir on all PEs in case it is a local directory - CmiMkdir(dirname); - - // Create partition directories (if applicable) - ostringstream dirPath; - dirPath << dirname; - if (CmiNumPartitions() > 1) { - addPartitionDirectory(dirPath); - CmiMkdir(dirPath.str().c_str()); - } - - // Due to file system issues we have observed, divide checkpoints - // into subdirectories to avoid having too many files in a single directory. 
- // Nodegroups should be checked separately since they could go into - // different subdirectory. - - // Save current path for later use with nodegroups - ostringstream dirPathNode; - dirPathNode << dirPath.str(); - - // Create subdirectories - int mySubDir = CkMyPe() / SUBDIR_SIZE; - dirPath << "/sub" << mySubDir; - CmiMkdir(dirPath.str().c_str()); - - // Create Nodegroup subdirectory if needed - if (CkMyRank() == 0) { - int mySubDirNode = CkMyNode() / SUBDIR_SIZE; - if (mySubDirNode != mySubDir) { - dirPathNode << "/sub" << mySubDirNode; - CmiMkdir(dirPathNode.str().c_str()); - } - } - - bool success = true; - if (CkMyPe() == 0) { + #if CMK_SHRINK_EXPAND - if (pending_realloc_state == REALLOC_IN_PROGRESS) { - // After restarting from this AtSync checkpoint, resume execution along the - // normal path (i.e. whatever the user defined as ResumeFromSync.) - CkCallback resumeFromSyncCB(CkIndex_LBManager::ResumeClients(), _lbmgr); - success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus); - } else + if (avail[CkMyPe()]) #endif - { - success &= checkpointOne(dirname, cb, requestStatus); + { + requestStatus = _requestStatus; + // make dir on all PEs in case it is a local directory + CmiMkdir(dirname); + + // Create partition directories (if applicable) + ostringstream dirPath; + dirPath << dirname; + if (CmiNumPartitions() > 1) { + addPartitionDirectory(dirPath); + CmiMkdir(dirPath.str().c_str()); } - } -#ifndef CMK_CHARE_USE_PTR - // only create chare checkpoint file if this PE actually has data - if (CkpvAccess(chare_objs).size() > 0 || CkpvAccess(vidblocks).size() > 0) - { - // save plain singleton chares into Chares.dat - FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", CkMyPe()); - PUP::toDisk pChares(fChares, PUP::er::IS_CHECKPOINT); - CkPupChareData(pChares); - if (pChares.checkError()) success = false; - if (CmiFclose(fChares) != 0) success = false; - } -#endif + // Due to file system issues we have observed, divide checkpoints + // into 
subdirectories to avoid having too many files in a single directory. + // Nodegroups should be checked separately since they could go into + // different subdirectory. + + // Save current path for later use with nodegroups + ostringstream dirPathNode; + dirPathNode << dirPath.str(); + + // Create subdirectories + int mySubDir = chckPtId / SUBDIR_SIZE; + dirPath << "/sub" << mySubDir; + CmiMkdir(dirPath.str().c_str()); + + // Create Nodegroup subdirectory if needed + if (CkMyRank() == 0) { + int mySubDirNode = CkMyNode() / SUBDIR_SIZE; + if (mySubDirNode != mySubDir) { + dirPathNode << "/sub" << mySubDirNode; + CmiMkdir(dirPathNode.str().c_str()); + } + } - // save groups into Groups.dat - // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), - // groups(PUP'ed) - FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", CkMyPe()); - PUP::toDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT); - CkPupGroupData(pGroups); - if (pGroups.checkError()) success = false; - if (CmiFclose(fGroups) != 0) success = false; - - // save nodegroups into NodeGroups.dat - // content of the file: numNodeGroups, GroupInfo[numNodeGroups], - // _nodeGroupTable(PUP'ed), nodegroups(PUP'ed) - if (CkMyRank() == 0) - { - FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", CkMyNode()); - PUP::toDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT); - CkPupNodeGroupData(pNodeGroups); - if (pNodeGroups.checkError()) success = false; - if (CmiFclose(fNodeGroups) != 0) success = false; - } + bool success = true; + if (CkMyPe() == 0) { + + #if CMK_SHRINK_EXPAND + if (pending_realloc_state == SHRINK_IN_PROGRESS) { + CkPrintf("Shrink in progress on PE%i\n", CkMyPe()); + // After restarting from this AtSync checkpoint, resume execution along the + // normal path (i.e. whatever the user defined as ResumeFromSync.) 
+ CkCallback resumeFromSyncCB(CkIndex_LBManager::ResumeClients(), _lbmgr); + success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus); + } else if (pending_realloc_state == EXPAND_IN_PROGRESS) { + CkPrintf("Expand in progress on PE%i\n", CkMyPe()); + CkCallback resumeFromSyncCB(CkIndex_LBManager::StartLB(), CProxy_LBManager(_lbmgr)[0]); + success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus); + } else + #endif + { + success &= checkpointOne(dirname, cb, requestStatus); + } + } + + #if CMK_SHRINK_EXPAND + pending_realloc_state = NO_REALLOC; + #endif + + #ifndef CMK_CHARE_USE_PTR + // only create chare checkpoint file if this PE actually has data + if (CkpvAccess(chare_objs).size() > 0 || CkpvAccess(vidblocks).size() > 0) + { + // save plain singleton chares into Chares.dat + FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", chckPtId); + PUP::toDisk pChares(fChares, PUP::er::IS_CHECKPOINT); + CkPupChareData(pChares); + if (pChares.checkError()) success = false; + if (CmiFclose(fChares) != 0) success = false; + } + #endif - // DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname); - FILE* datFile = openCheckpointFile(dirname, "arr", "wb", CkMyPe()); - PUP::toDisk p(datFile, PUP::er::IS_CHECKPOINT); - CkPupArrayElementsData(p); - if (p.checkError()) success = false; - if (CmiFclose(datFile) != 0) success = false; - -#if ! 
CMK_DISABLE_SYNC -#if CMK_HAS_SYNC_FUNC - sync(); -#elif CMK_HAS_SYNC - system("sync"); -#endif -#endif - chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE; - restartCB = cb; - DEBCHK("[%d]restartCB installed\n",CkMyPe()); + // save groups into Groups.dat + // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), + // groups(PUP'ed) + FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", chckPtId); + PUP::toDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT); + CkPupGroupData(pGroups); + if (pGroups.checkError()) success = false; + if (CmiFclose(fGroups) != 0) success = false; + // save nodegroups into NodeGroups.dat + // content of the file: numNodeGroups, GroupInfo[numNodeGroups], + // _nodeGroupTable(PUP'ed), nodegroups(PUP'ed) + if (CkMyRank() == 0) + { + FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", 0); + PUP::toDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT); + CkPupNodeGroupData(pNodeGroups); + if (pNodeGroups.checkError()) success = false; + if (CmiFclose(fNodeGroups) != 0) success = false; + } + //std::vector avail_vector; + //get_avail_vector(avail_vector); + //if (pending_realloc_state == REALLOC_IN_PROGRESS && static_cast(avail_vector[CkMyPe()])) + //{ + //printf("[%d] Writing array checkpoint\n", CkMyPe()); + + FILE* datFile = openCheckpointFile(dirname, "arr", "wb", chckPtId); + PUP::toDisk p(datFile, PUP::er::IS_CHECKPOINT); + CkPupArrayElementsData(p); + if (p.checkError()) success = false; + if (CmiFclose(datFile) != 0) success = false; + //} + + #if ! CMK_DISABLE_SYNC + #if CMK_HAS_SYNC_FUNC + sync(); + #elif CMK_HAS_SYNC + system("sync"); + #endif + #endif + chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE; + restartCB = cb; + DEBCHK("[%d]restartCB installed\n",CkMyPe()); + } // Use barrier instead of contribute here: // barrier is stateless and multiple calls to it do not overlap. 
barrier(CkCallback(CkReductionTarget(CkCheckpointMgr, SendRestartCB), 0, thisgroup)); @@ -441,7 +484,7 @@ void CkPupROData(PUP::er &p) void CkPupMainChareData(PUP::er &p, CkArgMsg *args) { int nMains=_mainTable.size(); - DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains); + //CkPrintf("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains); for(int i=0;ichareIdx; ChareInfo *entry = _chareTable[chareIdx]; @@ -449,11 +492,14 @@ void CkPupMainChareData(PUP::er &p, CkArgMsg *args) if(entryMigCtor!=-1) { Chare* obj; if (p.isUnpacking()) { - DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, entry->size); + //CkPrintf("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, entry->size); obj = CkAllocateChare(chareIdx); + //CkPrintf("Allocated mainchare %s\n", entry->name); _mainTable[i]->setObj(obj); + //CkPrintf("Set mainchare %s\n", entry->name); //void *m = CkAllocSysMsg(); CkInvokeEP(obj, entryMigCtor, args); + //CkPrintf("Invoked migration constructor for mainchare %s\n", entry->name); } else obj = (Chare *)_mainTable[i]->getObj(); @@ -548,6 +594,8 @@ void CkPupChareData(PUP::er &p) typedef void GroupCreationFn(CkGroupID groupID, int constructorIdx, envelope *env); + + static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *objectTable, unsigned int &numObjects, int constructionMsgType, GroupCreationFn creationFn @@ -559,7 +607,7 @@ static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *obj numGroups = idTable->size(); } p|numGroups; - DEBCHK("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups); + CkPrintf("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups); std::vector tmpInfo(numGroups); if (!p.isUnpacking()) { @@ -618,21 +666,20 @@ static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *obj } } -void CkPupGroupData(PUP::er &p - ) +void 
CkPupGroupData(PUP::er &p) { - CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable), - CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup - ); + CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable), + CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup + ); } void CkPupNodeGroupData(PUP::er &p ) { CkPupPerPlaceData(p, &CksvAccess(_nodeGroupIDTable), - CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups), - NodeBocInitMsg, &CkCreateLocalNodeGroup - ); + CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups), + NodeBocInitMsg, &CkCreateLocalNodeGroup + ); } // handle chare array elements for this processor @@ -640,7 +687,7 @@ void CkPupArrayElementsData(PUP::er &p, int notifyListeners) { int i; // safe in both packing/unpacking at this stage - int numGroups = CkpvAccess(_groupIDTable)->size(); + int numGroups = CkpvAccess(_groupIDTable)->size(); // number of array elements on this processor int numElements = 0; @@ -656,25 +703,25 @@ void CkPupArrayElementsData(PUP::er &p, int notifyListeners) if (!p.isUnpacking()) { // let CkLocMgr iterate over and store every array element - CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk);); - } + CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk);); + } else { // loop and create all array elements ourselves //CkPrintf("total chare array cnts: %d\n", numElements); for (int i=0; ifind(gID).getObj(); - if (notifyListeners){ - mgr->resume(idx, id, p, true); - } - else{ - mgr->restore(idx, id, p); - } + CkGroupID gID; + CkArrayIndex idx; + CmiUInt8 id; + p|gID; + p|idx; + p|id; + //CkPrintf("[%d] Unpacked dim = %i: %s\n", CkMyPe(), idx.dimension, idx2str(idx)); + CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj(); + if (notifyListeners){ + mgr->resume(idx, id, p, true); + } else{ + mgr->restore(idx, id, p); + } } } // finish up @@ -713,7 +760,7 @@ void CkPupProcessorData(PUP::er &p) CkPupChareData(p); // save groups - CkPupGroupData(p); 
+ //CkPupGroupData(p); // save nodegroups if(CkMyRank()==0) { @@ -812,109 +859,119 @@ void CkStartCheckpoint(const char* dirname, const CkCallback& cb, bool requestSt .Checkpoint(dirname, cb, requestStatus, writersPerNode); } +void CkStartRescaleCheckpoint(const char* dirname, const CkCallback& cb, + std::vector avail, bool requestStatus, int writersPerNode) +{ +#if CMK_SHRINK_EXPAND + if (CkMyPe() != 0) + { + CkPrintf("[%d] se_avail_vector copied\n", CkMyPe()); + se_avail_vector = (char*) malloc(CkNumPes() * sizeof(char)); + memcpy(se_avail_vector, avail.data(), CkNumPes() * sizeof(char)); + } + + if (cb.isInvalid()) + CkAbort("callback after checkpoint is not set properly"); + + if (cb.containsPointer()) + CkAbort("Cannot restart from a callback based on a pointer"); + + CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname); + + // hand over to checkpoint managers for per-processor checkpointing + CProxy_CkCheckpointWriteMgr(_sysChkptWriteMgr) + .RescaleCheckpoint(dirname, cb, avail, requestStatus, writersPerNode); +#endif +} + /** * Restart: There's no such object as restart manager is created * because a group cannot restore itself anyway. * The mechanism exists as converse code and get invoked by * broadcast message. 
**/ - CkCallback globalCb; -void CkRestartMain(const char* dirname, CkArgMsg *args){ - int i; - - if (CmiMyRank() == 0) { - _inrestart = true; - _restarted = true; - CkMemCheckPT::inRestarting = true; - } +void CkRecvGroupROData(char* msg) +{ + char* origMsg = msg; + msg = msg + CmiMsgHeaderSizeBytes; + int dirSize = *reinterpret_cast(msg); + msg += sizeof(int); + std::string dirname(msg, dirSize); + msg += dirSize; + int ROsize = *reinterpret_cast(msg); + msg += sizeof(int); + + //CkPrintf("dirname = %s, groupsize = %i\n", dirname.c_str(), groupSize); + PUP::fromMem bRO(msg, PUP::er::IS_CHECKPOINT); - // restore readonlys - FILE* fRO = openCheckpointFile(dirname, "RO", "rb", -1); - int _numPes = -1; - PUP::fromDisk pRO(fRO, PUP::er::IS_CHECKPOINT); - pRO|_numPes; + int _numPes = -1; + bRO|_numPes; int _numNodes = -1; - pRO|_numNodes; - pRO|globalCb; - if (CmiMyRank() == 0) CkPupROData(pRO); + bRO|_numNodes; + bRO|globalCb; + /*if (CmiMyRank() == 0)*/ CkPupROData(bRO); bool requestStatus = false; - pRO|requestStatus; - CmiFclose(fRO); - DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe()); - _oldNumPes = _numPes; - - CmiNodeBarrier(); - - // Restore mainchares on PE 0 - if (CkMyPe() == 0) - { - FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb"); - if (fMain) - { - PUP::fromDisk pMain(fMain, PUP::er::IS_CHECKPOINT); - CkPupMainChareData(pMain, args); - CmiFclose(fMain); - DEBCHK("[%d]CkRestartMain: mainchares restored\n", CkMyPe()); - } - } + bRO|requestStatus; + + CkPrintf("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes()); + + msg += ROsize; + + if (CkMyPe() >= _numPes) { + PUP::fromMem bGroups(msg, PUP::er::IS_CHECKPOINT); + CkPupGroupData(bGroups); + } #ifndef CMK_CHARE_USE_PTR - // restore chares only when number of pes is the same - if (CkNumPes() == _numPes) - { - // A chare checkpoint file only exists when the PE actually contained singleton - // chares at checkpoint time, so check to see if the file exists before trying - // to 
restore - std::string filename = getCheckpointFileName(dirname, "Chares", CkMyPe()); - FILE* fChares = CmiFopen(filename.c_str(), "rb"); - if (fChares) - { - PUP::fromDisk pChares(fChares, PUP::er::IS_CHECKPOINT); - CkPupChareData(pChares); - CmiFclose(fChares); - _chareRestored = true; - } - } + // restore chares only when number of pes is the same + if (CkNumPes() == _numPes) + { + // A chare checkpoint file only exists when the PE actually contained singleton + // chares at checkpoint time, so check to see if the file exists before trying + // to restore + std::string filename = getCheckpointFileName(dirname.c_str(), "Chares", CkMyPe()); + FILE* fChares = CmiFopen(filename.c_str(), "rb"); + if (fChares) + { + PUP::fromDisk pChares(fChares, PUP::er::IS_CHECKPOINT); + CkPupChareData(pChares); + CmiFclose(fChares); + _chareRestored = true; + } + } #endif - - // restore groups - // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed) - // restore from PE0's copy if shrink/expand - FILE* fGroups = openCheckpointFile(dirname, "Groups", "rb", - (CkNumPes() == _numPes) ? CkMyPe() : 0); - PUP::fromDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT); - CkPupGroupData(pGroups); - CmiFclose(fGroups); - - // restore nodegroups - // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed) - if(CkMyRank()==0){ - FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "rb", - (CkNumNodes() == _numNodes) ? 
CkMyNode() : 0); - PUP::fromDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT); - CkPupNodeGroupData(pNodeGroups); - CmiFclose(fNodeGroups); - } + CmiFree(origMsg); // for each location, restore arrays //DEBCHK("[%d]Trying to find location manager\n",CkMyPe()); - DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes()); - if(CkMyPe() < _numPes) // in normal range: restore, otherwise, do nothing - for (i=0; i<_numPes;i++) { - if (i%CkNumPes() == CkMyPe()) { - FILE *datFile = openCheckpointFile(dirname, "arr", "rb", i); - PUP::fromDisk p(datFile, PUP::er::IS_CHECKPOINT); - CkPupArrayElementsData(p); - CmiFclose(datFile); - } - } + + if(CkMyPe() < _numPes) { // in normal range: restore, otherwise, do nothing + int rank = CmiPhysicalRank(CmiMyPe()); + CkPrintf("[%d]CkRestartMain: restoring array elements from physical rank %d\n", CkMyPe(), rank); + + FILE* groupFile = openCheckpointFile(dirname.c_str(), "Groups", "rb", rank); + PUP::fromDisk bGroups(groupFile, PUP::er::IS_CHECKPOINT); + CkPupGroupData(bGroups); + CmiFclose(groupFile); + + if(CmiMyRank()==0) { + FILE* nodeGroupFile = openCheckpointFile(dirname.c_str(), "NodeGroups", "rb", 0); + PUP::fromDisk bNodeGroups(nodeGroupFile, PUP::er::IS_CHECKPOINT); + CkPupNodeGroupData(bNodeGroups); + CmiFclose(nodeGroupFile); + } + + FILE *datFile = openCheckpointFile(dirname.c_str(), "arr", "rb", rank); + PUP::fromDisk p(datFile, PUP::er::IS_CHECKPOINT); + CkPupArrayElementsData(p); + CmiFclose(datFile); + } - _inrestart = false; + set_in_restart(false); + + if (CmiMyRank()==0) _initDone(); // this rank will trigger other ranks - if (CmiMyRank()==0) _initDone(); // this rank will trigger other ranks - //_initDone(); - CkMemCheckPT::inRestarting = false; if(CkMyPe()==0) { CmiPrintf("[%d]CkRestartMain done. 
sending out callback.\n",CkMyPe()); if(requestStatus) @@ -927,49 +984,117 @@ void CkRestartMain(const char* dirname, CkArgMsg *args){ globalCb.send(); } } + + if (CmiMyRank() == 0) CkMemCheckPT::inRestarting = false; + + if (CmiMyPe() == 0) { + CkPrintf("Restore from disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer); + } } +void CkRestartMain(const char* dirname, CkArgMsg *args){ #if CMK_SHRINK_EXPAND -// after resume and getting message -void CkResumeRestartMain(char * msg) { - int i; - char filename[1024]; - const char * dirname = ""; - _inrestart = true; - _restarted = true; - CkMemCheckPT::inRestarting = true; - CmiPrintf("[%d]CkResumeRestartMain: Inside Resume Restart\n",CkMyPe()); - CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1); + chkptStartTimer = CmiWallTimer(); + int i; + + if (CmiMyRank() == 0) { + set_in_restart(true); + _restarted = true; + CkMemCheckPT::inRestarting = true; + } - int _numPes = -1; - if(CkMyPe()!=0) { - PUP::fromMem pRO((char *)(msg+CmiMsgHeaderSizeBytes+2*sizeof(int)), PUP::er::IS_CHECKPOINT); + // Restore mainchares on PE 0 + if (CkMyPe() == 0) + { + FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb"); + if (fMain) + { + PUP::fromDisk pMain(fMain, PUP::er::IS_CHECKPOINT); + CkPupMainChareData(pMain, args); + CmiFclose(fMain); + DEBCHK("[%d]CkRestartMain: mainchares restored\n", CkMyPe()); + } + } - CkPupROData(pRO); - CmiPrintf("[%d]CkRestartMain: readonlys restored\n",CkMyPe()); + if (CkMyPe() == 0) + { + std::string dirnameStr(dirname); + int strLen = dirnameStr.size(); + + std::string ROFileName = getCheckpointFileName(dirname, "RO", -1); + std::ifstream ROFile(ROFileName, std::ios::binary | std::ios::ate); + std::streamsize ROSize = ROFile.tellg(); + ROFile.seekg(0, std::ios::beg); + + // Check for and exclude EOF character if present + if (ROSize > 0) { + ROFile.seekg(-1, std::ios::end); + char lastChar; + ROFile.get(lastChar); + if (lastChar 
== EOF || lastChar == '\0') { + ROSize--; + } + ROFile.seekg(0, std::ios::beg); + } - CkPupGroupData(pRO); - CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1); - } + //CkPrintf("GroupMetadataSize = %lld\n", (long long)GroupMetadataSize); - CmiFree(msg); - CmiNodeBarrier(); - if(Cmi_isOldProcess) { - /* CmiPrintf("[%d] For shrinkexpand newpe=%d, oldpe=%d \n",Cmi_myoldpe, CkMyPe(), Cmi_myoldpe); */ - // non-shrink files would be empty since LB would take care - FILE *datFile = openCheckpointFile(dirname, "arr", "rb", Cmi_myoldpe); - PUP::fromDisk p(datFile, PUP::er::IS_CHECKPOINT); - CkPupArrayElementsData(p); - CmiFclose(datFile); + std::string GroupFilename = getCheckpointFileName(dirname, "Groups", 0); + std::ifstream GroupFile(GroupFilename, std::ios::binary | std::ios::ate); + std::streamsize GroupSize = GroupFile.tellg(); + GroupFile.seekg(0, std::ios::beg); + + // Check for and exclude EOF character if present + if (GroupSize > 0) { + GroupFile.seekg(-1, std::ios::end); + char lastChar; + GroupFile.get(lastChar); + if (lastChar == EOF || lastChar == '\0') { + GroupSize--; + } + GroupFile.seekg(0, std::ios::beg); + } + + char* msg = (char*) CmiAlloc(ROSize + GroupSize + 2 * sizeof(int) + strLen + CmiMsgHeaderSizeBytes); + char* buffer = msg + CmiMsgHeaderSizeBytes; + std::memcpy(buffer, &strLen, sizeof(int)); + buffer += sizeof(int); + std::memcpy(buffer, dirname, strLen); + buffer += strLen; + std::memcpy(buffer, &ROSize, sizeof(int)); + buffer += sizeof(int); + + ROFile.read(buffer, ROSize); + buffer += ROSize; + + GroupFile.read(buffer, GroupSize); + buffer += GroupSize; + + CmiSetHandler(msg, _shrinkExpandRestartHandlerIdx); + + CmiSyncBroadcastAllAndFree(ROSize + GroupSize + 2 * sizeof(int) + strLen + CmiMsgHeaderSizeBytes, msg); + + //CkPrintf("PE %i at barrier\n", CkMyPe()); + //CmiBarrier(); } - _initDone(); - _inrestart = false; - CkMemCheckPT::inRestarting = false; - if(CkMyPe()==0) { - 
CmiPrintf("[%d]CkResumeRestartMain done. sending out callback.\n",CkMyPe()); - CkPrintf("Restart from shared memory finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer); - globalCb.send(); + + //_initDone(); +#endif +} + +#if CMK_SHRINK_EXPAND +// NOTE - This function doesn't appear to be called anywhere +// after resume and getting message +void CkResumeRestartMain(char * msg) { +} + +int GetNewPeNumber(std::vector avail){ + int mype = CkMyPe(); + int count =0; + for (int i =0; i avail, bool requestStatus, int writersPerNode); + entry void FinishedCheckpoint(void); }; group [migratable] CkCheckpointMgr { entry CkCheckpointMgr(void); diff --git a/src/ck-core/ckcheckpoint.h b/src/ck-core/ckcheckpoint.h index 463c672865..c5cceb9c14 100644 --- a/src/ck-core/ckcheckpoint.h +++ b/src/ck-core/ckcheckpoint.h @@ -42,6 +42,29 @@ restarting of Charm++ programs. ... } \ } + #if CMK_SHRINK_EXPAND + extern char* se_avail_vector; + #endif + +//int _shrinkExpandRestartHandlerIdx; + + +struct GroupInfo +{ + CkGroupID gID; + int MigCtor; + std::string name; + bool present; + + void pup(PUP::er& p) + { + p | gID; + p | MigCtor; + p | name; + p | present; + } +}; + // utility functions to pup system global tables void CkPupROData(PUP::er &p); void CkPupMainChareData(PUP::er &p, CkArgMsg *args); @@ -51,22 +74,36 @@ void CkPupNodeGroupData(PUP::er &p); void CkPupArrayElementsData(PUP::er &p, int notifyListeners=1); void CkPupProcessorData(PUP::er &p); void CkRemoveArrayElements(); +void CkRecvGroupROData(char* msg); //void CkTestArrayElements(); // If writersPerNode <= 0 the number of writers is unchanged, if > 0, then set to // min(writersPerNode, CkMyNodeSize()) void CkStartCheckpoint(const char* dirname, const CkCallback& cb, bool requestStatus = false, int writersPerNode = 0); +void CkStartRescaleCheckpoint(const char* dirname, const CkCallback& cb, + std::vector avail, bool requestStatus = false, int writersPerNode = 0); void CkRestartMain(const char* 
dirname, CkArgMsg* args); + #if CMK_SHRINK_EXPAND +int GetNewPeNumber(std::vector avail); void CkResumeRestartMain(char *msg); #endif + #if __FAULT__ int CkCountArrayElements(); #endif #if CMK_SHRINK_EXPAND -enum realloc_state : uint8_t { NO_REALLOC=0, REALLOC_MSG_RECEIVED=1, REALLOC_IN_PROGRESS=2 }; +enum realloc_state : uint8_t +{ + NO_REALLOC=0, + SHRINK_MSG_RECEIVED=1 << 0, + EXPAND_MSG_RECEIVED=1 << 1, + SHRINK_IN_PROGRESS=1 << 2, + EXPAND_IN_PROGRESS=1 << 3 +}; + extern realloc_state pending_realloc_state; extern CkGroupID _lbmgr; #endif diff --git a/src/ck-core/cklocation.C b/src/ck-core/cklocation.C index 83ffe65615..2bb8efecb7 100644 --- a/src/ck-core/cklocation.C +++ b/src/ck-core/cklocation.C @@ -23,6 +23,15 @@ #include #include +#if CMK_CUDA || CMK_HIP + +#include "hapi.h" +#include "gpumanager.h" + +CsvExtern(GPUManager, gpu_manager); + +#endif // CMK_CUDA || CMK_HIP + #if CMK_LBDB_ON # include "LBManager.h" # include "MetaBalancer.h" @@ -83,15 +92,22 @@ int _messageBufferingThreshold; # if CMK_GLOBAL_LOCATION_UPDATE void UpdateLocation(MigrateInfo& migData) { + // CmiPrintf("calls update location\n"); CkGroupID locMgrGid = ck::ObjID(migData.obj.id).getCollectionID(); - if (locMgrGid.idx == 0) - { - return; - } - CkLocMgr* localLocMgr = (CkLocMgr*)CkLocalBranch(locMgrGid); - // CkLocMgr only uses element IDs, so extract just that part from the ObjID - localLocMgr->updateLocation(ck::ObjID(migData.obj.id).getElementID(), migData.to_pe); + CkLocCache *cache = (CkLocCache *)CkLocalBranch(localLocMgr->getLocationCache()); + + CmiUInt8 elementID = ck::ObjID(migData.obj.id).getElementID(); + CkArrayIndex idx = localLocMgr->lookupIdx(elementID); + + CkLocEntry entry; + entry.id = elementID; + entry.pe = migData.to_pe; + entry.epoch = cache->getEpoch(elementID) + 1; + + // CkPrintf("[%d] UpdateLocation: obj id=%llu from_pe=%d to_pe=%d epoch=%d\n", + // CkMyPe(), entry.id, migData.from_pe, entry.pe, entry.epoch); + localLocMgr->updateLocation(idx, entry); } # 
endif @@ -1824,9 +1840,15 @@ void CkMigratable::UserSetLBLoad() #if CMK_LBDB_ON // For load balancing: // user can call this helper function to set obj load (for model-based lb) -void CkMigratable::setObjTime(double cputime) { myRec->setObjTime(cputime); } +void CkMigratable::setObjTime(double cputime) { + myRec->setObjTime(cputime); } double CkMigratable::getObjTime() { return myRec->getObjTime(); } +void CkMigratable::setObjGPUTime(double gputime) { + myRec->setObjGPUTime(gputime); +} +double CkMigratable::getObjGPUTime() { return myRec->getObjGPUTime(); } + # if CMK_LB_USER_DATA /** * Use this method to set user specified data to the lbdatabase. @@ -1936,12 +1958,21 @@ void CkMigratable::AtSync(int waitForMigration) if (usesAutoMeasure == false) UserSetLBLoad(); + #if CMK_CUDA || CMK_HIP + PUP::sizer ps(PUP::er::IS_MIGRATION); + this->virtual_pup(ps); + // printf("[%d] gpu pup size %ld\n",CkMyPe(), ps.gpu_size() ); + setGPUPupSize(ps.gpu_size()); + #endif if (_lb_psizer_on || _lb_args.metaLbOn()) { PUP::sizer ps(PUP::er::IS_MIGRATION); this->virtual_pup(ps); if (_lb_psizer_on) + { setPupSize(ps.size()); + } + //TODO: check if this is correct after gpuPUP size if (_lb_args.metaLbOn()) myRec->getMetaBalancer()->SetCharePupSize(ps.size()); } @@ -2040,6 +2071,9 @@ void CkMigratable::setMigratable(int migratable) { myRec->setMigratable(migratab void CkMigratable::setPupSize(size_t obj_pup_size) { myRec->setPupSize(obj_pup_size); } +void CkMigratable::setGPUPupSize(size_t obj_gpu_pup_size) { myRec->setGPUPupSize(obj_gpu_pup_size); } + + void CkMigratable::CkAddThreadListeners(CthThread tid, void* msg) { Chare::CkAddThreadListeners(tid, msg); // for trace @@ -2049,6 +2083,8 @@ void CkMigratable::CkAddThreadListeners(CthThread tid, void* msg) #else void CkMigratable::setObjTime(double cputime) {} double CkMigratable::getObjTime() { return 0.0; } +void CkMigratable::setObjGPUTime(double gputime) {} +double CkMigratable::getObjGPUTime() { return 0.0; } # if 
CMK_LB_USER_DATA void* CkMigratable::getObjUserData(int idx) { return NULL; } @@ -2129,13 +2165,23 @@ void CkLocRec::stopTiming(int ignore_running) if (!ignore_running) running = false; } -void CkLocRec::setObjTime(double cputime) { lbmgr->EstObjLoad(ldHandle, cputime); } +void CkLocRec::setObjTime(double cputime) { + lbmgr->EstObjLoad(ldHandle, cputime); } double CkLocRec::getObjTime() { LBRealType walltime, cputime; lbmgr->GetObjLoad(ldHandle, walltime, cputime); return walltime; } +void CkLocRec::setObjGPUTime(double gputime) { + lbmgr->EstObjGPULoad(ldHandle, gputime); +} +double CkLocRec::getObjGPUTime() +{ + LBRealType gputime; + lbmgr->GetObjGPULoad(ldHandle, gputime); + return gputime; +} # if CMK_LB_USER_DATA void* CkLocRec::getObjUserData(int idx) { return lbmgr->GetDBObjUserData(ldHandle, idx); } # endif @@ -2273,6 +2319,11 @@ void CkLocRec::setPupSize(size_t obj_pup_size) lbmgr->setPupSize(ldHandle, obj_pup_size); } +void CkLocRec::setGPUPupSize(size_t obj_gpu_pup_size) +{ + lbmgr->setGPUPupSize(ldHandle, obj_gpu_pup_size); +} + #endif // Call ckDestroy for each record, which deletes the record, and ~CkLocRec() @@ -2363,8 +2414,10 @@ void CkLocCache::requestLocation(CmiUInt8 id, const int peToTell) void CkLocCache::updateLocation(const CkLocEntry& newEntry) { + // printf("[%d] updateLocation: id=%llu pe=%d epoch=%d\n", CmiMyPe(), newEntry.id, newEntry.pe, newEntry.epoch); CkAssert(newEntry.pe != -1); CkLocEntry& oldEntry = locMap[newEntry.id]; + // printf("[%d] updateLocation: oldEntry.epoch=%d\n", CmiMyPe(), oldEntry.epoch); if (newEntry.epoch > oldEntry.epoch) { oldEntry = newEntry; @@ -2376,6 +2429,8 @@ void CkLocCache::recordEmigration(CmiUInt8 id, int pe) { LocationMap::iterator itr = locMap.find(id); + // printf("[%d] recordEmigration: id=%llu itr->second.pe=%d pe=%d\n", CmiMyPe(), id, itr->second.pe, pe); + CkAssert(itr != locMap.end()); CkAssert(itr->second.pe == CkMyPe()); @@ -2947,6 +3002,44 @@ void CkLocMgr::migratableList(CkLocRec* rec, 
std::vector& list) } } +bool did_inter_node_gpudirect_rdma(int srcPe, int dstPe) { + CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes())); + CmiEnforce((dstPe >= 0) && (dstPe <= CmiNumPes())); + + if (CmiNodeOf(srcPe) == CmiNodeOf(dstPe)) { + return false; + } else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) { + return false; + } else { + return true; + } +} + +#if CMK_CUDA || CMK_HIP +void CkLocMgr::sendGPUMsg(CmiUInt8 id) +{ + auto gpuData = sendGPUBuffers[id]; + sendGPUBuffers.erase(id); + thisProxy[gpuData.toPe].immigrateGPU(id, gpuData.size, CkDeviceBuffer(gpuData.data, gpuData.size, + CkCallbackResumeThread())); + + if(did_inter_node_gpudirect_rdma(CkMyPe(), gpuData.toPe)) { + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + if(csv_gpu_manager.use_shm) { + DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; + #if CMK_SMP + CmiLock(dm->lock); + #endif + dm->free_comm_buffer((size_t)((char*)gpuData.data - (char*)dm->comm_buffer->base_ptr)); + #if CMK_SMP + CmiUnlock(dm->lock); + #endif + } + } + //CkPrintf("PE %d sent GPU msg of size %zu for id %llu\n", CkMyPe(), gpuData.size, id); +} +#endif + /// Migrate this local element away to another processor. 
void CkLocMgr::emigrate(CkLocRec* rec, int toPe) { @@ -2972,12 +3065,16 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe) callMethod(rec, &CkMigratable::ckAboutToMigrate); // First pass: find size of migration message - size_t bufSize; - { - PUP::sizer p(PUP::er::IS_MIGRATION); - pupElementsFor(p, rec, CkElementCreation_migrate); - bufSize = p.size(); - } + size_t bufSize, gpuBufSize; + PUP::sizer p(PUP::er::IS_MIGRATION); + pupElementsFor(p, rec, CkElementCreation_migrate); + bufSize = p.size(); + + gpuBufSize = 0; +#if CMK_CUDA || CMK_HIP + gpuBufSize = p.gpu_size(); +#endif + #if CMK_ERROR_CHECKING if (bufSize > std::numeric_limits::max()) { @@ -2986,6 +3083,8 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe) } #endif + + void* gpuMsg = nullptr; // Allocate and pack into message CkArrayElementMigrateMessage* msg = new (bufSize, 0) CkArrayElementMigrateMessage(idx, id, @@ -2995,10 +3094,30 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe) false, #endif bufSize, managers.size(), - cache->getEpoch(id) + 1); + cache->getEpoch(id) + 1, + gpuBufSize > 0); { - PUP::toMem p(msg->packData, PUP::er::IS_MIGRATION); +#if CMK_CUDA || CMK_HIP + if (gpuBufSize > 0) { + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + if(csv_gpu_manager.use_shm) { + DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; +#if CMK_SMP + CmiLock(dm->lock); +#endif + gpuMsg = dm->alloc_comm_buffer(gpuBufSize, false); + if (gpuMsg == nullptr) { + CkAbort("PE %d, device %d: Not enough memory on device Load balance buffer (%zu free)", + CkMyPe(), dm->global_index, dm->get_lb_buffer_free_size()); + } +#if CMK_SMP + CmiUnlock(dm->lock); +#endif + } + } +#endif + PUP::toMem p(msg->packData, gpuMsg, PUP::er::IS_MIGRATION); p.becomeDeleting(); pupElementsFor(p, rec, CkElementCreation_migrate); if (p.size() != bufSize) @@ -3013,6 +3132,13 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe) DEBM((AA "Migrated index size %s to %d \n" AB, idx2str(idx), toPe)); +#if CMK_CUDA || CMK_HIP + // 
Ensure all device-to-device copies from PUP packing are complete before + // destroying elements, since cudaMemcpy(D2D) can be async in CUDA 12.x. + if (gpuBufSize > 0) + hapiDeviceSynchronize(); +#endif + thisProxy[toPe].immigrate(msg); duringMigration = true; @@ -3024,9 +3150,16 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe) cache->recordEmigration(id, toPe); informHome(idx, toPe); +#if CMK_CUDA || CMK_HIP + if (gpuBufSize > 0) + { + sendGPUBuffers[id] = GPUMigrateData(toPe, gpuBufSize, gpuMsg); + thisProxy[CkMyPe()].sendGPUMsg(id); + } +#endif #if !CMK_LBDB_ON && CMK_GLOBAL_LOCATION_UPDATE - DEBM((AA "Global location update. idx %s " + CmiPrintf((AA "Global location update. idx %s " "assigned to %d \n" AB, idx2str(idx), toPe)); thisProxy.updateLocation(id, toPe); @@ -3047,14 +3180,65 @@ void CkLocMgr::metaLBCallLB(CkLocRec* rec) } #endif +#if CMK_CUDA || CMK_HIP +void CkLocMgr::immigrateGPU(CmiUInt8& id, int& size, char* &data, CkDeviceBufferPost* post) +{ + //CkPrintf("PE %d allocating GPU memory size %d for id %llu\n", CkMyPe(), size, id); + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + if(csv_gpu_manager.use_shm) { + DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; +#if CMK_SMP + CmiLock(dm->lock); +#endif + data = (char*)(dm->alloc_comm_buffer(size, false)); + if (data == nullptr) { + CkAbort("PE %d, device %d: Not enough memory on device Load balance buffer (%zu free)", + CkMyPe(), dm->global_index, dm->get_lb_buffer_free_size()); + } +#if CMK_SMP + CmiUnlock(dm->lock); +#endif + } + hapiDeviceSynchronize(); + receivedDeviceMsgs[id] = data; + post[0].hapi_stream = (hapiStream_t) 0; +} + +void CkLocMgr::immigrateGPU(CmiUInt8 id, int size, char* data) +{ + void* dataPtr = receivedDeviceMsgs[id]; + receivedDeviceMsgs.erase(id); + bufferedDeviceMigrateMsgs[id] = dataPtr; + if (bufferedHostMigrateMsgs.find(id) != bufferedHostMigrateMsgs.end()) + { + immigrate(bufferedHostMigrateMsgs[id]); + bufferedHostMigrateMsgs.erase(id); + } +} 
+#endif + /** Migrating array element is arriving on this processor. */ void CkLocMgr::immigrate(CkArrayElementMigrateMessage* msg) { + void* gpuMsg = nullptr; + if (msg->hasGPUMsg) + { + auto it = bufferedDeviceMigrateMsgs.find(msg->id); + + if (it == bufferedDeviceMigrateMsgs.end()) + { + bufferedHostMigrateMsgs[msg->id] = msg; + return; + } + + gpuMsg = it->second; + } + const CkArrayIndex& idx = msg->idx; - PUP::fromMem p(msg->packData, PUP::er::IS_MIGRATION); + PUP::fromMem p(msg->packData, gpuMsg, PUP::er::IS_MIGRATION); if (msg->nManagers < managers.size()) CkAbort("Array element arrived from location with fewer managers!\n"); @@ -3067,15 +3251,35 @@ void CkLocMgr::immigrate(CkArrayElementMigrateMessage* msg) return; } + if (msg->hasGPUMsg) + bufferedDeviceMigrateMsgs.erase(msg->id); + insertID(idx, msg->id); // Create a record for this element - CkLocRec* rec = - createLocal(idx, true, msg->ignoreArrival, false /* home told on departure */, msg->epoch); + CkLocRec* rec = elementNrec(msg->id); + if (rec == nullptr) + rec = createLocal(idx, true, msg->ignoreArrival, false /* home told on departure */, msg->epoch); CmiAssert(CpvAccess(newZCPupGets).empty()); // Ensure that vector is empty // Create the new elements as we unpack the message pupElementsFor(p, rec, CkElementCreation_migrate); + hapiDeviceSynchronize(); + +#if CMK_CUDA || CMK_HIP + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + if(csv_gpu_manager.use_shm) { + DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; +#if CMK_SMP + CmiLock(dm->lock); +#endif + dm->free_comm_buffer((size_t)((char*)gpuMsg - (char*)dm->comm_buffer->base_ptr)); +#if CMK_SMP + CmiUnlock(dm->lock); +#endif + } +#endif + bool zcRgetsActive = !CpvAccess(newZCPupGets).empty(); if (zcRgetsActive) { diff --git a/src/ck-core/cklocation.ci b/src/ck-core/cklocation.ci index 30d1d158c6..5e51ee93bc 100644 --- a/src/ck-core/cklocation.ci +++ b/src/ck-core/cklocation.ci @@ -1,3 +1,5 @@ +#include "conv-mach-opt.h" + module 
CkLocation { extern module CkMarshall; @@ -14,6 +16,10 @@ module CkLocation { group [migratable] CkLocMgr { entry CkLocMgr(CkArrayOptions opts); entry [expedited] void immigrate(CkArrayElementMigrateMessage *msg); +#if CMK_CUDA || CMK_HIP + entry [expedited, threaded] void sendGPUMsg(CmiUInt8 id); + entry [expedited] void immigrateGPU(CmiUInt8 id, int size, nocopydevice char data[size]); +#endif entry [expedited] void requestLocation(const CkArrayIndex& idx, int peToTell); entry [expedited] void updateLocation(const CkArrayIndex& idx, const CkLocEntry& e); entry void reclaimRemote(const CkArrayIndex& idx, int deletedOnPe); diff --git a/src/ck-core/cklocation.h b/src/ck-core/cklocation.h index 380db7570a..b636f23d91 100644 --- a/src/ck-core/cklocation.h +++ b/src/ck-core/cklocation.h @@ -92,12 +92,13 @@ class CkArrayElementMigrateMessage : public CMessage_CkArrayElementMigrateMessag { public: CkArrayElementMigrateMessage(CkArrayIndex idx_, CmiUInt8 id_, bool ignoreArrival_, - int length_, int nManagers_, int epoch_) + int length_, int nManagers_, int epoch_, bool hasGPUMsg_ = false) : idx(idx_), id(id_), ignoreArrival(ignoreArrival_), length(length_), nManagers(nManagers_), + hasGPUMsg(hasGPUMsg_), epoch(epoch_) { } @@ -105,6 +106,7 @@ class CkArrayElementMigrateMessage : public CMessage_CkArrayElementMigrateMessag CkArrayIndex idx; // Array index that is migrating CmiUInt8 id; // ID of the elements with this index in this collection bool ignoreArrival; // if to inform LB of arrival + bool hasGPUMsg; int length; // Size in bytes of the packed data int nManagers; // Number of associated array managers int epoch; @@ -220,6 +222,17 @@ CkpvExtern(int, CkSaveRestorePrefetch); #include "ckmigratable.h" +class GPUMigrateData +{ +public: + int toPe; + int size; + void* data; + + GPUMigrateData() : toPe(-1), size(0), data(nullptr) {} + GPUMigrateData(int toPe_, int size_, void* data_) : toPe(toPe_), size(size_), data(data_) {} +}; + /********************** CkLocMgr 
********************/ /// A tiny class for detecting heap corruption class CkMagicNumber_impl @@ -418,6 +431,13 @@ class CkLocMgr : public IrrGroup // Immigration messages which are waiting for all array managers to be ready std::list pendingImmigrate; + std::unordered_map sendGPUBuffers; + std::unordered_map bufferedHostMigrateMsgs; + std::unordered_map bufferedDeviceMigrateMsgs; + std::unordered_map sentDeviceMsgs; + + std::unordered_map receivedDeviceMsgs; + // The mapping of index to ID is either done via compression or an explicit map, // depending on if the bounds of this array are compressible into a 64bit ID. CkArrayIndex bounds; @@ -524,7 +544,7 @@ class CkLocMgr : public IrrGroup CmiUInt8 lookupID(const CkArrayIndex& idx) const { - CkAssert(checkInBounds(idx)); + //CkAssert(checkInBounds(idx)); if (compressor) { const CmiUInt8 home = homePe(idx); @@ -550,7 +570,7 @@ class CkLocMgr : public IrrGroup // TODO: This should be better bool lookupID(const CkArrayIndex& idx, CmiUInt8& id) const { - CkAssert(checkInBounds(idx)); + //CkAssert(checkInBounds(idx)); if (compressor) { const CmiUInt8 home = homePe(idx); @@ -691,6 +711,11 @@ class CkLocMgr : public IrrGroup // Communication: void immigrate(CkArrayElementMigrateMessage* msg); +#if CMK_CUDA || CMK_HIP + void sendGPUMsg(CmiUInt8 id); + void immigrateGPU(CmiUInt8& id, int& size, char* &data, CkDeviceBufferPost* post); + void immigrateGPU(CmiUInt8 id, int size, char* data); +#endif void requestLocation(CmiUInt8 id); void requestLocation(const CkArrayIndex& idx); bool requestLocation(const CkArrayIndex& idx, int peToTell); diff --git a/src/ck-core/cklocrec.h b/src/ck-core/cklocrec.h index 8528aafcd2..df3271d9fe 100644 --- a/src/ck-core/cklocrec.h +++ b/src/ck-core/cklocrec.h @@ -49,6 +49,10 @@ class CkLocRec { void stopTiming(int ignore_running=0); void setObjTime(double cputime); double getObjTime(); + + void setObjGPUTime(double gputime); + double getObjGPUTime(); + void *getObjUserData(int idx); #else 
inline void startTiming(int ignore_running=0) { } @@ -70,6 +74,7 @@ class CkLocRec { void recvMigrate(int dest); void setMigratable(int migratable); /// set migratable void setPupSize(size_t obj_pup_size); + void setGPUPupSize(size_t obj_gpu_pup_size); void AsyncMigrate(bool use); bool isAsyncMigrate() { return asyncMigrate; } void ReadyMigrate(bool ready) { readyMigrate = ready; } ///called from user diff --git a/src/ck-core/ckmemcheckpoint.C b/src/ck-core/ckmemcheckpoint.C index b691fca2e3..1232b47946 100644 --- a/src/ck-core/ckmemcheckpoint.C +++ b/src/ck-core/ckmemcheckpoint.C @@ -678,7 +678,8 @@ static inline void _handleProcData(PUP::er &p) #endif // save groups into Groups.dat - CkPupGroupData(p); + //std::vector groupMetadata = CkPupGroupMetadata(p); + //CkPupGroupData(p, groupMetadata.size(), groupMetadata); // save nodegroups into NodeGroups.dat if(CkMyRank()==0) CkPupNodeGroupData(p); diff --git a/src/ck-core/ckmigratable.h b/src/ck-core/ckmigratable.h index d0e9a96eeb..7cec2884ad 100644 --- a/src/ck-core/ckmigratable.h +++ b/src/ck-core/ckmigratable.h @@ -80,6 +80,8 @@ class CkMigratable : public Chare { virtual void UserSetLBLoad(void); /// user define this when setLBLoad is true void setObjTime(double cputime); double getObjTime(); + void setObjGPUTime(double cputime); + double getObjGPUTime(); #if CMK_LB_USER_DATA void *getObjUserData(int idx); #endif @@ -96,6 +98,7 @@ class CkMigratable : public Chare { void ckFinishConstruction(int epoch = -1); void setMigratable(int migratable); void setPupSize(size_t obj_pup_size); + void setGPUPupSize(size_t obj_gpu_pup_size); #else void AtSync(int waitForMigration=1) { ResumeFromSync();} void setMigratable(int migratable) { } diff --git a/src/ck-core/ckrdmadevice.C b/src/ck-core/ckrdmadevice.C index 806ef289e0..4b9ea79967 100644 --- a/src/ck-core/ckrdmadevice.C +++ b/src/ck-core/ckrdmadevice.C @@ -27,7 +27,7 @@ * ordering between these data transfers. 
Because multiple PEs can be mapped * to the same GPU and hence concurrently request allocations from the same * device communication buffer, a thread-safe allocator using the buddy - * allocation algorithm was implemented. The allocator first calls cudaMalloc + * allocation algorithm was implemented. The allocator first calls hapiMalloc * to obtain a relatively large chunk of memory and then services allocation * and deallocation requests from PEs that are mapped to its GPU device. * The buddy algorithm was used to minimize the external fragmentation that @@ -50,25 +50,123 @@ #include "ck.h" #include "ckrdmadevice.h" -#if CMK_CUDA +#define CMK_GPU_COMM 1 + +#if CMK_CUDA || CMK_HIP + +CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) { + CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes())); + CmiEnforce((dstPe >= 0) && (dstPe <= CmiNumPes())); + + if (CmiNodeOf(srcPe) == CmiNodeOf(dstPe)) { + // Same logical node + return CmiNcpyModeDevice::MEMCPY; + } else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) { + // Different logical nodes, same physical node + return CmiNcpyModeDevice::IPC; + } else { + // Different physical nodes, requires GPUDirect RDMA + return CmiNcpyModeDevice::RDMA; + } +} #include "hapi.h" #include "gpumanager.h" CsvExtern(GPUManager, gpu_manager); +CpvExtern(int, my_device_id); + +// void CkRdmaDeviceRecvHandler(void* data) +// { +// DeviceRdmaOp* op = (DeviceRdmaOp*)data; +// DeviceRdmaInfo* info = op->info; + +// // Invoke source callbacks +// if (op->src_cb) { +// int rank; +// CkCallback* cb = (CkCallback*)op->src_cb; +// cb->send(); +// delete cb; +// } + +// // Update counter (there may be multiple buffers in transit) +// info->counter++; + +// // Check if all buffers have been received +// // If so, invoke regular entry method +// if (info->counter == info->n_ops) { +// QdCreate(1); + +// enqueueNcpyMessage(op->dest_pe, info->msg); + +// // Free RDMA metadata +// CmiFree(info); +// } +// } + +struct LoopBackMsg { + char 
header[CmiMsgHeaderSizeBytes]; + void* msg; +}; + +extern "C" { + void* loopback_bridge(void* arg) { + QdProcess(1); + LoopBackMsg* recv_msg = (LoopBackMsg*)arg; + CkRdmaDeviceRecvHandler(recv_msg->msg); + CmiFree(recv_msg); + return NULL; + } + + int loopback_handler; +} -// Invoked when a GPU buffer arrives on the receiver -#if !CMK_GPU_COMM -void CkRdmaDeviceRecvHandler(void* data, void* msg) -#else void CkRdmaDeviceRecvHandler(void* data) -#endif { -#if CMK_GPU_COMM - // Process QD to mark completion of buffer transfer + NcpyOperationInfo *ncpy_op_info = (NcpyOperationInfo *)data; + DeviceRdmaOp* op = (DeviceRdmaOp*)(ncpy_op_info->deviceRdmaOpInfo); + + if(op->dest_pe != CmiMyPe()) { + int infoSize = ncpy_op_info->ncpyOpInfoSize; + NcpyOperationInfo* copy = (NcpyOperationInfo*)CmiAlloc(infoSize); + memcpy(copy, ncpy_op_info, infoSize); + + LoopBackMsg* conv_msg = (LoopBackMsg*)CmiAlloc(sizeof(LoopBackMsg)); + conv_msg->msg = copy; + + QdCreate(1); + CmiSetHandler(conv_msg, loopback_handler); + CmiPushPE(CmiRankOf(op->dest_pe), conv_msg); + return; + } + QdProcess(1); -#endif + DeviceRdmaInfo* info = op->info; + + // Invoke source callbacks + if (op->src_cb) { + CkCallback* cb = (CkCallback*)op->src_cb; + cb->send(); + delete cb; + } + // Update counter (there may be multiple buffers in transit) + info->counter++; + + // Check if all buffers have been received + // If so, invoke regular entry method + if (info->counter == info->n_ops) { + QdCreate(1); + + enqueueNcpyMessage(op->dest_pe, info->msg); + + // Free RDMA metadata + // CmiFree(info); + } +} +// Invoked when a GPU buffer arrives on the receiver +void CkRdmaDeviceRecvHandler(void* data, void* msg) +{ DeviceRdmaOp* op = (DeviceRdmaOp*)data; DeviceRdmaInfo* info = op->info; @@ -105,12 +203,12 @@ void CkDevicePersistent::init() { void CkDevicePersistent::open() { // Create a CUDA IPC handle for inter-process communication - hapiCheck(cudaIpcGetMemHandle(&cuda_ipc_handle, (void*)ptr)); + 
hapiCheck(hapiIpcGetMemHandle(&hapi_ipc_handle, (void*)ptr)); } void CkDevicePersistent::close() { // Close the CUDA IPC handle if it was opened - hapiCheck(cudaIpcCloseMemHandle(ipc_ptr)); + hapiCheck(hapiIpcCloseMemHandle(ipc_ptr)); } void CkDevicePersistent::set_msg(void* msg) { @@ -122,7 +220,7 @@ void CkDevicePersistent::pup(PUP::er& p) { p|cnt; p|pe; p|cb; - p((char*)&cuda_ipc_handle, sizeof(cuda_ipc_handle)); + p((char*)&hapi_ipc_handle, sizeof(hapi_ipc_handle)); } CkDeviceStatus CkDevicePersistent::get(CkDevicePersistent& src) { @@ -135,24 +233,24 @@ CkDeviceStatus CkDevicePersistent::get(CkDevicePersistent& src) { // Perform get if (mode == CkNcpyModeDevice::MEMCPY) { - cudaMemcpyAsync((void*)ptr, src.ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream); + hapiMemcpyAsync((void*)ptr, src.ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream); } else if (mode == CkNcpyModeDevice::IPC) { if (!src.ipc_open) { - hapiCheck(cudaIpcOpenMemHandle(&src.ipc_ptr, src.cuda_ipc_handle, - cudaIpcMemLazyEnablePeerAccess)); + hapiCheck(hapiIpcOpenMemHandle(&src.ipc_ptr, src.hapi_ipc_handle, + hapiIpcMemLazyEnablePeerAccess)); src.ipc_open = true; } - cudaMemcpyAsync((void*)ptr, src.ipc_ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream); + hapiMemcpyAsync((void*)ptr, src.ipc_ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream); } else { CkAbort("Persistant GPU messaging is currently not supported for inter-node messages"); } // Set callbacks to be invoked once get is complete if (src.cb.type != CkCallback::ignore) { - hapiAddCallback(cuda_stream, src.cb, src.cb_msg); + hapiAddCallback(hapi_stream, src.cb, src.cb_msg); } if (cb.type != CkCallback::ignore) { - hapiAddCallback(cuda_stream, cb, cb_msg); + hapiAddCallback(hapi_stream, cb, cb_msg); } return CkDeviceStatus::incomplete; @@ -168,24 +266,24 @@ CkDeviceStatus CkDevicePersistent::put(CkDevicePersistent& dst) { // Perform put if (mode == CkNcpyModeDevice::MEMCPY) { - cudaMemcpyAsync((void*)dst.ptr, ptr, cnt, cudaMemcpyDeviceToDevice, 
cuda_stream); + hapiMemcpyAsync((void*)dst.ptr, ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream); } else if (mode == CkNcpyModeDevice::IPC) { if (!dst.ipc_open) { - hapiCheck(cudaIpcOpenMemHandle(&dst.ipc_ptr, dst.cuda_ipc_handle, - cudaIpcMemLazyEnablePeerAccess)); + hapiCheck(hapiIpcOpenMemHandle(&dst.ipc_ptr, dst.hapi_ipc_handle, + hapiIpcMemLazyEnablePeerAccess)); dst.ipc_open = true; } - cudaMemcpyAsync(dst.ipc_ptr, ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream); + hapiMemcpyAsync(dst.ipc_ptr, ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream); } else { CkAbort("Persistant GPU messaging is not yet supported for inter-node messages"); } // Set callbacks to be invoked once get is complete if (cb.type != CkCallback::ignore) { - hapiAddCallback(cuda_stream, cb, cb_msg); + hapiAddCallback(hapi_stream, cb, cb_msg); } if (dst.cb.type != CkCallback::ignore) { - hapiAddCallback(cuda_stream, dst.cb, dst.cb_msg); + hapiAddCallback(hapi_stream, dst.cb, dst.cb_msg); } return CkDeviceStatus::incomplete; @@ -193,6 +291,17 @@ CkDeviceStatus CkDevicePersistent::put(CkDevicePersistent& dst) { /****************************** Recv Entry Method API ******************************/ +// Returns the local rank of the logical node (process) that the given PE belongs to +static inline int CmiNodeRankLocal(int pe) { + // Logical node index % Number of logical nodes per physical node + return CmiNodeOf(pe) % (CmiNumNodes() / CmiNumPhysicalNodes()); +} + +// Returns the local rank of the logical node that I belong to +static inline int CmiMyNodeRankLocal() { + return CmiNodeRankLocal(CmiMyPe()); +} + // Invoked after post entry method void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs) { // Change message header to invoke regular entry method @@ -211,13 +320,12 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS CkDeviceBuffer source; -#if !CMK_GPU_COMM // Machine layer does not support 
GPU-aware communication GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); // Find which mode of transfer should be used + // CmiPrintf("[%d] CkRdmaDeviceOnSender: src_pe=%d, dst_pe=%d\n", CkMyPe(), env->getSrcPe(), CkMyPe()); CkNcpyModeDevice mode = findTransferModeDevice(env->getSrcPe(), CkMyPe()); -#endif // Allocate and fill in metadata for this zerocopy operation void* rdma_data = CmiAlloc(sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * numops); @@ -238,18 +346,20 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS // Store information about this buffer DeviceRdmaOp& save_op = *(DeviceRdmaOp*)((char*)rdma_data + sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * i); - save_op.dest_pe = CkMyPe(); + save_op.dest_pe = source.dest_pe; save_op.dest_ptr = arrPtrs[i]; save_op.size = (size_t)arrSizes[i]; save_op.info = rdma_info; save_op.src_cb = (source.cb.type != CkCallback::ignore) ? new CkCallback(source.cb) : nullptr; save_op.dst_cb = nullptr; -#if !CMK_GPU_COMM // Machine layer does not support GPU-aware communication // Check if destination PE is correct // TODO: Handle this case instead of aborting + // Chare* obj = CkActiveObj(); + // CmiUInt8 id = obj->id; if (source.dest_pe != CkMyPe()) { + CmiPrintf("Current PE %d does not match the destination PE %d and sender determined to be %d\n", CkMyPe(), source.dest_pe, env->getSrcPe()); CkAbort("Current PE does not match the destination PE determined by the sender. 
" "Please enable CMK_GLOBAL_LOCATION_UPDATE."); } @@ -261,64 +371,58 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS if (mode == CkNcpyModeDevice::MEMCPY) { // Source and destination PEs are in the same process (logical node) // Directly invoke memcpy from source buffer to destination buffer - hapiCheck(cudaMemcpyAsync((void*)dest.ptr, source.ptr, dest.cnt, - cudaMemcpyDeviceToDevice, postStructs[i].cuda_stream)); + hapiCheck(hapiMemcpyAsync((void*)dest.ptr, source.ptr, dest.cnt, + hapiMemcpyDeviceToDevice, postStructs[i].hapi_stream)); } else if (mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) { // Inter-process using shared memory optimizations // Use optimiziations with POSIX shared memory - cuda_ipc_device_info& device_info = - csv_gpu_manager.cuda_ipc_device_infos[source.device_idx]; + hapi_ipc_device_info& device_info = + csv_gpu_manager.hapi_ipc_device_infos[source.device_idx]; - // 1. Make user-provided stream wait for IPC event using cudaStreamWaitEvent + // 1. Make user-provided stream wait for IPC event using hapiStreamWaitEvent // (source buffer to device comm buffer on source) - hapiCheck(cudaStreamWaitEvent(postStructs[i].cuda_stream, + hapiCheck(hapiStreamWaitEvent(postStructs[i].hapi_stream, device_info.src_event_pool[source.event_idx], 0)); - // 2. Invoke cudaMemcpyAsync (from source device comm buffer to destination buffer) - hapiCheck(cudaMemcpyAsync((void*)dest.ptr, + // 2. Invoke hapiMemcpyAsync (from source device comm buffer to destination buffer) + hapiCheck(hapiMemcpyAsync((void*)dest.ptr, (void*)((char*)device_info.buffer + source.comm_offset), - dest.cnt, cudaMemcpyDeviceToDevice, postStructs[i].cuda_stream)); + dest.cnt, hapiMemcpyDeviceToDevice, postStructs[i].hapi_stream)); // 3. 
Record IPC event so that the sender can query it for freeing // device comm buffer and corresponding pair of CUDA IPC events - hapiCheck(cudaEventRecord(device_info.dst_event_pool[source.event_idx], - postStructs[i].cuda_stream)); + hapiCheck(hapiEventRecord(device_info.dst_event_pool[source.event_idx], + postStructs[i].hapi_stream)); // 4. Set flag in shared memory so that the sender can start querying // completion of the IPC event - cuda_ipc_event_shared* shm_event_shared = - (cuda_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr + hapi_ipc_event_shared* shm_event_shared = + (hapi_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr + csv_gpu_manager.shm_chunk_size * source.device_idx - + sizeof(cudaIpcMemHandle_t)) + source.event_idx; - pthread_mutex_lock(&shm_event_shared->lock); - shm_event_shared->dst_flag = true; - pthread_mutex_unlock(&shm_event_shared->lock); + + sizeof(hapiIpcMemHandle_t)) + source.event_idx; + __atomic_store_n(&shm_event_shared->dst_flag, 1, __ATOMIC_RELEASE); } else { + // CmiPrintf("it should never be called during intra node\n"); +#if CMK_GPU_COMM + // Machine layer supports GPU-aware communication + QdCreate(1); + CmiSetDirectNcpyAckHandler(CkRdmaDeviceRecvHandler); + CmiNcpyBuffer lci_dest_ncpy_buffer(arrPtrs[i], (size_t)arrSizes[i], (void*)(&save_op)); + lci_dest_ncpy_buffer.rdmaGet(source.lci_ncpy_buffer, 0, nullptr, nullptr); + continue; +#else // Handle all other cases (basic inter-process and inter-node) // Transfer the received/unpacked data on host to the destination device buffer // FIXME: Print warning that this is slow? 
CkAssert(source.data_stored); - hapiCheck(cudaMemcpyAsync((void*)dest.ptr, source.data, dest.cnt, - cudaMemcpyHostToDevice, postStructs[i].cuda_stream)); + hapiCheck(hapiMemcpyAsync((void*)dest.ptr, source.data, dest.cnt, + hapiMemcpyHostToDevice, postStructs[i].hapi_stream)); +#endif } // Add source callback for polling, so that it can be invoked once the transfer is complete - hapiAddCallback(postStructs[i].cuda_stream, CkCallback(CkRdmaDeviceRecvHandler, &save_op)); -#else - // Machine layer supports GPU-aware communication - save_op.tag = source.tag; -#endif // CMK_GPU_COMM + hapiAddCallback(postStructs[i].hapi_stream, CkCallback(CkRdmaDeviceRecvHandler, &save_op)); } - -#if CMK_GPU_COMM - // Post ucp_tag_recv_nb's to receive GPU data - for (int i = 0; i < numops; i++) { - DeviceRdmaOp* save_op = (DeviceRdmaOp*)((char*)rdma_data - + sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * i); - QdCreate(1); - CmiRecvDevice(save_op, DEVICE_RECV_TYPE_CHARM); - } -#endif } // Unused, left for future reference @@ -340,48 +444,43 @@ int CkRdmaGetDestPEChare(int dest_pe, void* obj_ptr) { } */ -static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset) { - int pool_size = CsvAccess(gpu_manager).cuda_ipc_event_pool_size_pe; +static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset, int cpv_my_device_id) { + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + int pool_size = csv_gpu_manager.hapi_ipc_event_pool_size_pe; int pool_start = CkMyRank() * pool_size; - int device_index = dm->global_index; - cuda_ipc_device_info& my_device_info = CsvAccess(gpu_manager).cuda_ipc_device_infos[device_index]; + hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id]; // Free IPC events that are complete // TODO: Don't do this every time but only when the event pool is somewhat empty for (int i = pool_start; i < pool_start + pool_size; i++) { int& event_flag = 
my_device_info.event_pool_flags[i]; - cudaEvent_t& ev = my_device_info.dst_event_pool[i]; + hapiEvent_t& ev = my_device_info.dst_event_pool[i]; size_t& buff_offset = my_device_info.event_pool_buff_offsets[i]; // For a used event, check if it's complete and mark as free if so if (event_flag != 0) { // Check in shared memory if receiver has invoked the memcpy from // the device comm buffer on sender to destination buffer - cuda_ipc_event_shared* shm_event_shared = - (cuda_ipc_event_shared*)((char*)CsvAccess(gpu_manager).shm_ptr - + CsvAccess(gpu_manager).shm_chunk_size * device_index - + sizeof(cudaIpcMemHandle_t)) + i; - bool can_query = false; - pthread_mutex_lock(&shm_event_shared->lock); - if (shm_event_shared->dst_flag == true) { - shm_event_shared->dst_flag = false; - can_query = true; - } - pthread_mutex_unlock(&shm_event_shared->lock); + hapi_ipc_event_shared* shm_event_shared = + (hapi_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr + + csv_gpu_manager.shm_chunk_size * (csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id) + + sizeof(hapiIpcMemHandle_t)) + i; + bool can_query = __atomic_load_n(&shm_event_shared->dst_flag, __ATOMIC_ACQUIRE); // If the receiver has invoked the memcpy, // the sender can query the event for completion if (can_query) { - if (cudaEventQuery(ev) == cudaSuccess) { + if (hapiEventQuery(ev) == hapiSuccess) { // Event completion means that the transfer from source device comm buffer // to dest buffer is complete, so free the allocated block if (event_flag == 1) { dm->free_comm_buffer(buff_offset); } else { - CkAbort("Retrieved cudaSuccess for a free IPC event"); + CkAbort("Retrieved hapiSuccess for a free IPC event"); } // Mark event as free event_flag = 0; + __atomic_store_n(&shm_event_shared->dst_flag, 0, __ATOMIC_RELEASE); } } } @@ -389,11 +488,11 @@ static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset) { // Allocate CUDA IPC events from the pool // Two events are used per message: - // 1) 
Recorded by the sender after 'source buffer -> device comm buffer' cudaMemcpy. + // 1) Recorded by the sender after 'source buffer -> device comm buffer' hapiMemcpy. // Can be used by the sender to determine if the sender buffer is free for reuse. - // It is also used by the receiver to create a dependency for the second cudaMemcpy + // It is also used by the receiver to create a dependency for the second hapiMemcpy // ('device comm buffer -> dest buffer') - // 2) Recorded by the receiver after 'device comm buffer -> dest buffer' cudaMemcpy. + // 2) Recorded by the receiver after 'device comm buffer -> dest buffer' hapiMemcpy. // It is used by the sender to determine when the allocated block on // device comm buffer and IPC events can be freed. for (int i = pool_start; i < pool_start + pool_size; i++) { @@ -414,38 +513,51 @@ void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers) { // TODO: Need to handle the case where the destination PE could be wrong // (due to migration, etc.). Currently the code relies on a global // location update after migration (with CMK_GLOBAL_LOCATION_UPDATE). -#if !CMK_GPU_COMM - GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); - - // Determine transfer mode (intra-process, inter-process, inter-node) + // CmiPrintf("[%d] CkRdmaDeviceOnSender: src_pe=%d, dst_pe=%d\n", CkMyPe(), CkMyPe(), dest_pe); CkNcpyModeDevice transfer_mode = findTransferModeDevice(CkMyPe(), dest_pe); // Store destination PE in the metadata message // FIXME: Not necessary? 
save_op.dest_pe is set to CkMyPe() on the receiver for (int i = 0; i < numops; i++) { buffers[i]->dest_pe = dest_pe; + buffers[i]->dest_mpi_rank = CmiNodeOf(dest_pe); + buffers[i]->src_pe = CmiMyPe(); + buffers[i]->src_mpi_rank = CmiNodeOf(CmiMyPe()); } - - if (transfer_mode == CkNcpyModeDevice::MEMCPY) { - // Don't need to do anything for intra-process + if(transfer_mode == CkNcpyModeDevice::MEMCPY) + { + for (int i = 0; i < numops; i++) + hapiStreamSynchronize(buffers[i]->hapi_stream); return; - } else if (transfer_mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) { + } + + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + //int cpv_my_device_id = CmiMyRank() % csv_gpu_manager.device_count; + int cpv_my_device_id = CpvAccess(my_device_id); + + if(transfer_mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) { // Use optimizations with POSIX shaerd memory // Allocate blocks on device comm buffer DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; for (int i = 0; i < numops; i++) { + bool is_lb_buffer = ( (size_t)((char*)(buffers[i]->ptr) - (char*)(dm->comm_buffer->base_ptr)) < dm->comm_buffer->total_size ); #if CMK_SMP CmiLock(dm->lock); #endif - void* alloc_comm_buffer = dm->alloc_comm_buffer(buffers[i]->cnt); - if (alloc_comm_buffer == nullptr) { - CkAbort("PE %d, device %d: Not enough memory on device communication buffer (%zu free)", - CkMyPe(), dm->global_index, dm->get_comm_buffer_free_size()); + void* alloc_comm_buffer; + if(is_lb_buffer) { + alloc_comm_buffer = const_cast(buffers[i]->ptr); + } else { + alloc_comm_buffer = dm->alloc_comm_buffer(buffers[i]->cnt); + if (alloc_comm_buffer == nullptr) { + CkAbort("PE %d, device %d: Not enough memory on device communication buffer (%zu free)", + CkMyPe(), dm->global_index, dm->get_comm_buffer_free_size()); + } } buffers[i]->comm_offset = (char*)alloc_comm_buffer - (char*)dm->comm_buffer->base_ptr; - buffers[i]->device_idx = dm->global_index; - buffers[i]->event_idx = findFreeIpcEvent(dm, 
buffers[i]->comm_offset); + buffers[i]->device_idx = (csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id); + buffers[i]->event_idx = findFreeIpcEvent(dm, buffers[i]->comm_offset, cpv_my_device_id); // Abort if no free IPC event was found // FIXME: Instead of aborting, we can maybe create IPC events on demand // (although they probably cannot be shared through the shared memory @@ -458,34 +570,36 @@ void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers) { #endif // Initiate transfer from source buffer to device comm buffer - hapiCheck(cudaMemcpyAsync(alloc_comm_buffer, buffers[i]->ptr, buffers[i]->cnt, - cudaMemcpyDeviceToDevice, buffers[i]->cuda_stream)); + if(!is_lb_buffer) { + hapiCheck(hapiMemcpyAsync(alloc_comm_buffer, buffers[i]->ptr, buffers[i]->cnt, + hapiMemcpyDeviceToDevice, buffers[i]->hapi_stream)); + } // Record event - cuda_ipc_device_info& my_device_info = csv_gpu_manager.cuda_ipc_device_infos[dm->global_index]; - hapiCheck(cudaEventRecord(my_device_info.src_event_pool[buffers[i]->event_idx], buffers[i]->cuda_stream)); + hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[(csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id)]; + hapiCheck(hapiEventRecord(my_device_info.src_event_pool[buffers[i]->event_idx], buffers[i]->hapi_stream)); } } else { +#if !CMK_GPU_COMM // Use a naive host-staged mechanism // Allocate temporary host buffers and copy source buffers for (int i = 0; i < numops; i++) { buffers[i]->data_stored = true; - hapiCheck(cudaMallocHost(&buffers[i]->data, buffers[i]->cnt)); - hapiCheck(cudaMemcpyAsync(buffers[i]->data, buffers[i]->ptr, buffers[i]->cnt, - cudaMemcpyDeviceToHost, buffers[i]->cuda_stream)); + hapiCheck(hapiMallocHost(&buffers[i]->data, buffers[i]->cnt)); + hapiCheck(hapiMemcpyAsync(buffers[i]->data, buffers[i]->ptr, buffers[i]->cnt, + hapiMemcpyDeviceToHost, buffers[i]->hapi_stream)); } // Wait for the copies to finish for (int i = 0; i < 
numops; i++) { - hapiCheck(cudaStreamSynchronize(buffers[i]->cuda_stream)); + hapiCheck(hapiStreamSynchronize(buffers[i]->hapi_stream)); } - } #else - // Post ucp_tag_send_nb's to send GPU data. When receiver receives the metadata, - // it should post ucp_tag_recv_nb's to receive the GPU data. for (int i = 0; i < numops; i++) { - CmiSendDevice(dest_pe, buffers[i]->ptr, buffers[i]->cnt, buffers[i]->tag); + hapiStreamSynchronize(buffers[i]->hapi_stream); + buffers[i]->lci_ncpy_buffer = CmiNcpyBuffer(buffers[i]->ptr, buffers[i]->cnt); + } +#endif } -#endif // CMK_GPU_COMM } #endif // CMK_CUDA diff --git a/src/ck-core/ckrdmadevice.h b/src/ck-core/ckrdmadevice.h index c9f97c4a72..b2beea5b35 100644 --- a/src/ck-core/ckrdmadevice.h +++ b/src/ck-core/ckrdmadevice.h @@ -4,8 +4,8 @@ #include "ckcallback.h" #include "conv-rdmadevice.h" -#if CMK_CUDA -#include +#if CMK_CUDA || CMK_HIP +#include "hapi_portable.h" #define CkNcpyModeDevice CmiNcpyModeDevice #define CkDeviceStatus CmiDeviceStatus @@ -15,9 +15,9 @@ struct CkDevicePersistent { size_t cnt; CkCallback cb; void* cb_msg; - cudaStream_t cuda_stream; + hapiStream_t hapi_stream; int pe; - cudaIpcMemHandle_t cuda_ipc_handle; + hapiIpcMemHandle_t hapi_ipc_handle; void* ipc_ptr; bool ipc_open; // Used only by the remote chare @@ -34,15 +34,15 @@ struct CkDevicePersistent { init(); } - explicit CkDevicePersistent(const void* ptr_, size_t cnt_, cudaStream_t cuda_stream_) + explicit CkDevicePersistent(const void* ptr_, size_t cnt_, hapiStream_t hapi_stream_) : ptr(ptr_), cnt(cnt_), cb(CkCallback(CkCallback::ignore)), - cuda_stream(cuda_stream_) { + hapi_stream(hapi_stream_) { init(); } explicit CkDevicePersistent(const void* ptr_, size_t cnt_, const CkCallback& cb_, - cudaStream_t cuda_stream_) - : ptr(ptr_), cnt(cnt_), cb(cb_), cuda_stream(cuda_stream_) { + hapiStream_t hapi_stream_) + : ptr(ptr_), cnt(cnt_), cb(cb_), hapi_stream(hapi_stream_) { init(); } @@ -62,10 +62,10 @@ struct CkDevicePersistent { struct CkDeviceBufferPost 
{ // CUDA stream for device transfers - cudaStream_t cuda_stream; + hapiStream_t hapi_stream; // Use per-thread stream by default - CkDeviceBufferPost() : cuda_stream(cudaStreamPerThread) {} + CkDeviceBufferPost() : hapi_stream(hapiStreamPerThread) {} }; class CkDeviceBuffer : public CmiDeviceBuffer { @@ -85,14 +85,14 @@ class CkDeviceBuffer : public CmiDeviceBuffer { cb = cb_; } - explicit CkDeviceBuffer(const void* ptr_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, 0) { + explicit CkDeviceBuffer(const void* ptr_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, 0) { cb = CkCallback(CkCallback::ignore); - cuda_stream = cuda_stream_; + hapi_stream = hapi_stream_; } - explicit CkDeviceBuffer(const void* ptr_, const CkCallback& cb_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, 0) { + explicit CkDeviceBuffer(const void* ptr_, const CkCallback& cb_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, 0) { cb = cb_; - cuda_stream = cuda_stream_; + hapi_stream = hapi_stream_; } explicit CkDeviceBuffer(const void* ptr_, size_t cnt_) : CmiDeviceBuffer(ptr_, cnt_) { @@ -103,14 +103,14 @@ class CkDeviceBuffer : public CmiDeviceBuffer { cb = cb_; } - explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, cnt_) { + explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, cnt_) { cb = CkCallback(CkCallback::ignore); - cuda_stream = cuda_stream_; + hapi_stream = hapi_stream_; } - explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, const CkCallback& cb_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, cnt_) { + explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, const CkCallback& cb_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, cnt_) { cb = cb_; - cuda_stream = cuda_stream_; + hapi_stream = hapi_stream_; } void pup(PUP::er &p) { @@ -121,14 +121,16 @@ class CkDeviceBuffer : public CmiDeviceBuffer { friend void CkRdmaDeviceIssueRgets(envelope *env, int 
numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs); }; -#if !CMK_GPU_COMM -void CkRdmaDeviceRecvHandler(void* data, void* msg); -#else void CkRdmaDeviceRecvHandler(void* data); -#endif +void CkRdmaDeviceRecvHandler(void* data, void* msg); void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs); void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers); +extern "C" { + void* loopback_bridge(void* arg); + extern int loopback_handler; +} + #endif // CMK_CUDA #endif // _CKRDMADEVICE_H_ diff --git a/src/ck-core/ckreduction.C b/src/ck-core/ckreduction.C index c4f6e33f56..ead38b3de1 100644 --- a/src/ck-core/ckreduction.C +++ b/src/ck-core/ckreduction.C @@ -51,6 +51,7 @@ waits for the migrant contributions to straggle in. #include "charm++.h" #include "ck.h" +#include "ckrescale.h" #include "pathHistory.h" @@ -83,7 +84,6 @@ waits for the migrant contributions to straggle in. #define INT_MAX 2147483647 #endif -extern bool _inrestart; #if CMK_CHARM4PY //define a global instance of CkReductionTypesExt for external access CkReductionTypesExt charm_reducers; @@ -92,7 +92,7 @@ extern int (*PyReductionExt)(char**, int*, int, char**); Group::Group():thisIndex(CkMyPe()) { - if (_inrestart) CmiAbort("A Group object did not call the migratable constructor of its base class!"); + if (get_in_restart()) CmiAbort("A Group object did not call the migratable constructor of its base class!"); creatingContributors(); contributorStamped(&reductionInfo); diff --git a/src/ck-core/init.C b/src/ck-core/init.C index 3190ea818c..b6ea5e6f37 100644 --- a/src/ck-core/init.C +++ b/src/ck-core/init.C @@ -65,6 +65,7 @@ never be excluded... #include "ckcheckpoint.h" #include "ck.h" +#include "ckrescale.h" #include "trace.h" #include "ckrdma.h" #include "CkCheckpoint.decl.h" @@ -78,7 +79,8 @@ never be excluded... 
#include "TreeLB.h" #endif -#if CMK_CUDA +#define CMK_GPU_COMM 1 +#if CMK_CUDA || CMK_HIP #include "hapi_impl.h" #include "ckrdmadevice.h" @@ -155,10 +157,12 @@ int _infoIdx; int _charmHandlerIdx; int _initHandlerIdx; int _roRestartHandlerIdx; +int _shrinkExpandRestartHandlerIdx; int _bocHandlerIdx; int _qdHandlerIdx; int _qdCommHandlerIdx; int _triggerHandlerIdx; + bool _mainDone = false; CksvDeclare(bool, _triggersSent); @@ -683,15 +687,16 @@ static void _exitHandler(envelope *env) } else CmiFree(env); -#if CMK_SHRINK_EXPAND - ConverseCleanup(); -#endif -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP // Clean up HAPI hapiExit(); #endif +#if CMK_SHRINK_EXPAND + ConverseCleanup(); +#endif + //everyone exits here - there may be issues with leftover messages in the queue #if !CMK_WITH_STATS && !CMK_WITH_WARNINGS DEBUGF(("[%d] Calling converse exit from ReqStatMsg \n",CkMyPe())); @@ -773,7 +778,7 @@ static inline void _processBufferedBocInits(void) envelope *env = inits[i]; if(env==0) { #if CMK_SHRINK_EXPAND - if(_inrestart){ + if(get_in_restart()){ CkPrintf("_processBufferedBocInits: empty message in restart, ignoring\n"); break; } @@ -1445,6 +1450,7 @@ void _initCharm(int unused_argc, char **argv) #if CMK_SHRINK_EXPAND // for shrink expand cleanup CmiAssignOnce(&_ROGroupRestartHandlerIdx, CkRegisterHandler(_ROGroupRestartHandler)); + CmiAssignOnce(&_shrinkExpandRestartHandlerIdx, CkRegisterHandler(CkRecvGroupROData)); #endif _infoIdx = CldRegisterInfoFn((CldInfoFn)_infoFn); @@ -1481,8 +1487,8 @@ void _initCharm(int unused_argc, char **argv) // Set the ack handler function used for the direct nocopy api CmiSetDirectNcpyAckHandler(CkRdmaDirectAckHandler); -#if CMK_CUDA && CMK_GPU_COMM - CmiRdmaDeviceRecvInit(CkRdmaDeviceRecvHandler); +#if (CMK_CUDA || CMK_HIP) && CMK_GPU_COMM + loopback_handler = CmiRegisterHandler((CmiHandler) loopback_bridge); #endif #if CMK_USE_SHMEM @@ -1691,9 +1697,10 @@ void _initCharm(int unused_argc, char **argv) } } -#if CMK_CUDA +#if CMK_CUDA || 
CMK_HIP // Perform HAPI initialization for GPU support hapiInit(argv); + //hapiStartMemoryDaemon(); // Initialize Charm++ layer functions hapiInvokeCallback = CUDACallbackManager; @@ -1806,6 +1813,7 @@ void _initCharm(int unused_argc, char **argv) // NOTE: this assumes commthreads will not block from this point on } + DEBUGF(("[%d,%d%.6lf] inCommThread %d\n",CmiMyPe(),CmiMyRank(),CmiWallTimer(),inCommThread)); // when I am a communication thread, I don't participate initDone. if (inCommThread) { @@ -1829,7 +1837,6 @@ void _initCharm(int unused_argc, char **argv) readKillFile(); } #endif - } int charm_main(int argc, char **argv) diff --git a/src/ck-core/init.h b/src/ck-core/init.h index cf2a383b2b..12b73a3abd 100644 --- a/src/ck-core/init.h +++ b/src/ck-core/init.h @@ -137,6 +137,7 @@ extern int _charmHandlerIdx; extern int _roRestartHandlerIdx; /* for checkpoint/restart */ #if CMK_SHRINK_EXPAND extern int _ROGroupRestartHandlerIdx; /* for checkpoint/restart */ +extern int _shrinkExpandRestartHandlerIdx; #endif extern int _bocHandlerIdx; extern int _qdHandlerIdx; diff --git a/src/ck-ldb/BaseLB.h b/src/ck-ldb/BaseLB.h index e7b6683d7f..2c057bac3b 100644 --- a/src/ck-ldb/BaseLB.h +++ b/src/ck-ldb/BaseLB.h @@ -53,9 +53,17 @@ class BaseLB: public CBase_BaseLB // double utilization; int pe; // processor id bool available; +#if CMK_CUDA || CMK_HIP + size_t gpu_mem_remaining; + size_t pool_buff_mem_remaining; + uint64_t gpu_device_id; // GPU device this PE is mapped to (-1 = no GPU) +#endif ProcStats(): n_objs(0), pe_speed(1), total_walltime(0.0), idletime(0.0), #if CMK_LB_CPUTIMER total_cputime(0.0), bg_cputime(0.0), +#endif +#if CMK_CUDA || CMK_HIP + gpu_device_id(-1), gpu_mem_remaining(0), pool_buff_mem_remaining(0), #endif bg_walltime(0.0), pe(-1), available(true) {} inline void clearBgLoad() { @@ -78,7 +86,12 @@ class BaseLB: public CBase_BaseLB double dummy; p|dummy; // for old format with utilization } p|available; p|n_objs; - if (_lb_args.lbversion()>=2) p|pe; + if 
(_lb_args.lbversion()>=2) p|pe; +#if CMK_CUDA || CMK_HIP + p|gpu_mem_remaining; + p|pool_buff_mem_remaining; + p|gpu_device_id; +#endif } }; diff --git a/src/ck-ldb/CentralLB.C b/src/ck-ldb/CentralLB.C index dbc3e6a60d..768957e75d 100644 --- a/src/ck-ldb/CentralLB.C +++ b/src/ck-ldb/CentralLB.C @@ -10,6 +10,17 @@ #include "envelope.h" #include "CentralLB.h" #include "LBSimulation.h" +#if CMK_CUDA || CMK_HIP +#if CMK_CUDA +#include +#endif +#include "gpumanager.h" +// extern void hapiProcessCuptiBuffers(); +// extern void hapiClearCuptiData(); +CsvExtern(GPUManager, gpu_manager); +CkpvExtern(int, _lb_obj_index); +#include "hapi.h" +#endif #define DEBUGF(x) // CmiPrintf x; #define DEBUG(x) // x; @@ -37,14 +48,13 @@ extern "C" void charmrun_realloc(char *s); extern char willContinue; extern realloc_state pending_realloc_state; extern char * se_avail_vector; -extern "C" int mynewpe; +extern int mynewpe; extern char *_shrinkexpand_basedir; -int numProcessAfterRestart; -int mynewpe=0; +extern int numProcessAfterRestart; #endif CkGroupID loadbalancer; int * lb_ptr; -bool load_balancer_created; +extern bool load_balancer_created; static void lbinit() { @@ -71,6 +81,10 @@ void CentralLB::initLB(const CkLBOptions &opt) if (opt.getSeqNo() > 0 || (_lb_args.metaLbOn() && _lb_args.metaLbModelDir() != nullptr)) turnOff(); + #if CMK_CUDA || CMK_HIP && CMK_LB_USER_DATA + CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(size_t));//gpu allocation size + #endif + stats_msg_count = 0; statsMsgsList = NULL; statsData = NULL; @@ -130,11 +144,11 @@ int CentralLB::GetPESpeed() return myspeed; } -void CentralLB::InvokeLB() +void CentralLB::CallLB() { -#if CMK_LBDB_ON + #if CMK_LBDB_ON DEBUGF(("[%d] CentralLB AtSync step %d!!!!!\n",CkMyPe(),step())); -#if CMK_MEM_CHECKPOINT +#if CMK_MEM_CHECKPOINT CkSetInLdb(); #endif @@ -143,12 +157,42 @@ void CentralLB::InvokeLB() MigrationDone(0); return; } + +#if CMK_CUDA || CMK_HIP +#if CMK_SMP + CmiNodeBarrier(); // ensure rank 0 finishes 
buffer processing before other ranks read the map +#endif +if (CmiMyRank() == 0) +{ +#if CMK_CUDA + double start = CkWallTimer(); + cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);//sync flush cupti records which are finished, does not wait for partial records + hapiProcessCuptiBuffers(); +#endif +} +#if CMK_SMP + CmiNodeBarrier(); // ensure rank 0 finishes buffer processing before other ranks read the map +#endif + // Every PE matches its own objects against the shared per-process CUPTI map + lbmgr->SetObjGPULoad(CsvAccess(gpu_manager).cupti_obj_gpu_times_); +#endif + { thisProxy [CkMyPe()].ProcessAtSync(); } #endif } +void CentralLB::InvokeLB() +{ + lbmgr->lb_in_progress = true; +#if CMK_SHRINK_EXPAND + contribute(CkCallback(CkReductionTarget(CentralLB, CheckForLB), thisProxy[0])); +#else + CallLB(); +#endif +} + void CentralLB::ProcessAtSync() { #if CMK_LBDB_ON @@ -306,16 +350,35 @@ void CentralLB::BuildStatsMsg() msg->pe_speed = myspeed; #endif - DEBUGF(("Processor %d Total time (wall,cpu) = %f %f Idle = %f Bg = %f %f\n", CkMyPe(),msg->total_walltime,msg->total_cputime,msg->idletime,msg->bg_walltime,msg->bg_cputime)); +#if CMK_CUDA || CMK_HIP + // printf("CMK_CUDA setting device is %ld\n", hapiMyDevice()); + msg->gpu_device_id = hapiMyDevice(); + size_t freeMem, totalMem; + hapiMemGetInfo(&freeMem, &totalMem); + msg->gpu_mem_remaining = freeMem; + GPUManager& csv_gpu_manager = CsvAccess(gpu_manager); + if(csv_gpu_manager.use_shm) { + DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()]; + msg->pool_buff_mem_remaining = dm->get_lb_buffer_free_size(); + // printf("PE %d: GPU %ld free mem: %ld, pool buffer free mem: %ld\n", CkMyPe(), msg->gpu_device_id, msg->gpu_mem_remaining, msg->pool_buff_mem_remaining); + } else + { + msg->pool_buff_mem_remaining = 0;//// should not run + } + // printf("msg->gpu_device_id is %ld\n", msg->gpu_device_id); +#endif + + DEBUGF(("Processor %d Total time (wall,cpu) = %f Idle = %f Bg = %f\n", 
CkMyPe(),msg->total_walltime,msg->idletime,msg->bg_walltime)); msg->objData.resize(osz); lbmgr->GetObjData(msg->objData.data()); msg->commData.resize(csz); lbmgr->GetCommData(msg->commData.data()); // lbmgr->ClearLoads(); - DEBUGF(("PE %d BuildStatsMsg %d objs, %d comm\n",CkMyPe(),msg->n_objs,msg->n_comm)); + DEBUGF(("PE %d BuildStatsMsg %d objs, %d comm\n",CkMyPe(),msg->objData.size(),msg->commData.size())); if(CkMyPe() == cur_ld_balancer) { + int count_avail = 0; lbmgr->get_avail_vector(msg->avail_vector); msg->next_lb = LBManagerObj()->new_lbbalancer(); } @@ -435,6 +498,11 @@ void CentralLB::depositData(CLBStatsMsg *m) procStat.bg_cputime = m->bg_cputime; #endif procStat.pe_speed = m->pe_speed; +#if CMK_CUDA || CMK_HIP + procStat.gpu_device_id = m->gpu_device_id; + procStat.gpu_mem_remaining = m->gpu_mem_remaining; + procStat.pool_buff_mem_remaining = m->pool_buff_mem_remaining; +#endif //procStat.utilization = 1.0; procStat.available = true; @@ -510,6 +578,11 @@ void CentralLB::ReceiveStats(CkMarshalledCLBStatsMessage &&msg) procStat.bg_cputime = m->bg_cputime; #endif procStat.pe_speed = m->pe_speed; +#if CMK_CUDA || CMK_HIP + procStat.gpu_device_id = m->gpu_device_id; + procStat.gpu_mem_remaining = m->gpu_mem_remaining; + procStat.pool_buff_mem_remaining = m->pool_buff_mem_remaining; +#endif //procStat.utilization = 1.0; procStat.available = true; procStat.n_objs = msg_n_objs; @@ -985,6 +1058,7 @@ void CentralLB::ProcessMigrationDecision() { void CentralLB::ProcessReceiveMigration() { + // CmiPrintf("[%d] ProcessReceiveMigration\n", CkMyPe()); #if CMK_LBDB_ON int i; LBMigrateMsg *m = storedMigrateMsg; @@ -997,6 +1071,7 @@ void CentralLB::ProcessReceiveMigration() CmiAssert(migrates_expected <= 0 || migrates_completed == migrates_expected); migrates_expected = 0; future_migrates_expected = 0; + // CmiPrintf("[%d] ProcessReceiveMigration: n_moves=%d\n", CkMyPe(), m->n_moves); for(i=0; i < m->n_moves; i++) { MigrateInfo& move = m->moves[i]; const int me = 
CkMyPe(); @@ -1021,11 +1096,11 @@ void CentralLB::ProcessReceiveMigration() else future_migrates_expected++; } else { -#if CMK_GLOBAL_LOCATION_UPDATE - UpdateLocation(move); -#endif + #if CMK_GLOBAL_LOCATION_UPDATE + // CmiPrintf("[%d] Updating location for obj id=%llu from %d to %d\n", CkMyPe(), move.obj.id, move.from_pe, move.to_pe); + UpdateLocation(move); + #endif } - } DEBUGF(("[%d] in ReceiveMigration %d moves expected: %d future expected: %d\n",CkMyPe(),m->n_moves, migrates_expected, future_migrates_expected)); @@ -1049,52 +1124,58 @@ void CentralLB::ProcessReceiveMigration() #endif } +void CentralLB::CheckForLB() { + //sleep(5); +#if CMK_SHRINK_EXPAND + if (pending_realloc_state == EXPAND_MSG_RECEIVED) + CheckForRealloc(); + //else if (pending_realloc_state == NO_REALLOC) + // thisProxy.ResumeClients(0); + else + thisProxy.CallLB(); +#else + // if we are not in shrink/expand mode, just call LB + thisProxy.CallLB(); +#endif + //else + // thisProxy.ResumeClients(0); +} + // We assume that bit vector would have been aptly set async by either scheduler or charmrun. void CentralLB::CheckForRealloc(){ #if CMK_SHRINK_EXPAND - if(pending_realloc_state == REALLOC_MSG_RECEIVED) { - pending_realloc_state = REALLOC_IN_PROGRESS; //in progress - CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe()); - double end_lb_time = CkWallTimer(); - CkPrintf("CharmLB> %s: PE [%d] step %d finished at %f duration %f s\n\n", - lbname, cur_ld_balancer, step()-1, end_lb_time, end_lb_time-start_lb_time); - // do checkpoint - CkCallback cb(CkIndex_CentralLB::ResumeFromReallocCheckpoint(), thisProxy[0]); - CkStartCheckpoint(_shrinkexpand_basedir, cb); - } - else{ - thisProxy.MigrationDoneImpl(1); - } + if(pending_realloc_state != NO_REALLOC) { + pending_realloc_state = (pending_realloc_state == SHRINK_MSG_RECEIVED) ? 
SHRINK_IN_PROGRESS : EXPAND_IN_PROGRESS; //in progress + CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe()); + double end_lb_time = CkWallTimer(); + CkPrintf("CharmLB> %s: PE [%d] step %d finished at %f duration %f s\n\n", + lbname, cur_ld_balancer, step()-1, end_lb_time, end_lb_time-start_lb_time); + // do checkpoint + CkCallback cb(CkIndex_CentralLB::ResumeFromReallocCheckpoint(), thisProxy[0]); + CkStartRescaleCheckpoint(_shrinkexpand_basedir, cb, + std::vector(se_avail_vector, se_avail_vector + CkNumPes())); + } else { + thisProxy.MigrationDoneImpl(1); + } #endif } void CentralLB::ResumeFromReallocCheckpoint(){ #if CMK_SHRINK_EXPAND - const int count = CkNumPes(); - std::vector avail(se_avail_vector, se_avail_vector + count); - memset(se_avail_vector, 0, sizeof(char) * count); + CkPrintf("Resumed from realloc\n"); + std::vector avail(se_avail_vector, se_avail_vector + CkNumPes()); + //free(se_avail_vector); thisProxy.WillIbekilled(avail, numProcessAfterRestart); #endif } - - -#if CMK_SHRINK_EXPAND -int GetNewPeNumber(std::vector avail){ - int mype = CkMyPe(); - int count =0; - for (int i =0; i avail, int newnumProcessAfterRestart){ #if CMK_SHRINK_EXPAND numProcessAfterRestart = newnumProcessAfterRestart; mynewpe = GetNewPeNumber(avail); + //CkPrintf("[%d] -> new pe %d\n", CkMyPe(), mynewpe); willContinue = avail[CkMyPe()]; + //CkPrintf("PE%i> Sending start cleanup reduction\n", CkMyPe()); CkCallback cb(CkIndex_CentralLB::StartCleanup(), thisProxy[0]); contribute(cb); #endif @@ -1102,9 +1183,12 @@ void CentralLB::WillIbekilled(std::vector avail, int newnumProcessAfterRes void CentralLB::StartCleanup(){ #if CMK_SHRINK_EXPAND - CkCleanup(); + //CkAbort("FLAG\n"); + //CkPrintf("Starting cleanup\n"); + CkCleanup(); #endif } + void CentralLB::MigrationDone(int balancing) { #if CMK_SHRINK_EXPAND @@ -1116,6 +1200,7 @@ void CentralLB::MigrationDone(int balancing) MigrationDoneImpl(balancing); #endif } + void 
CentralLB::MigrationDoneImpl (int balancing) { @@ -1124,6 +1209,10 @@ void CentralLB::MigrationDoneImpl (int balancing) migrates_expected = -1; // clear load stats if (balancing) lbmgr->ClearLoads(); +#if CMK_CUDA || CMK_HIP + if (CmiMyRank() == 0) + hapiClearCuptiData(); +#endif // Increment to next step lbmgr->incStep(); DEBUGF(("[%d] Incrementing Step %d \n",CkMyPe(),step())); @@ -1158,7 +1247,7 @@ void CentralLB::ResumeClients() void CentralLB::ResumeClients(int balancing) { #if CMK_LBDB_ON - DEBUGF(("[%d] Resuming clients. balancing:%d.\n",CkMyPe(),balancing)); + //CkPrintf("[%d] Resuming clients. balancing:%d.\n",CkMyPe(),balancing); lbmgr->ResumeClients(); if (balancing) { @@ -1169,6 +1258,10 @@ void CentralLB::ResumeClients(int balancing) CheckMigrationComplete(); } } + lbmgr->lb_in_progress = false; + + if (CkMyPe() == 0) + lbmgr->callRealloc(); #endif } @@ -1652,6 +1745,11 @@ CLBStatsMsg::~CLBStatsMsg() { void CLBStatsMsg::pup(PUP::er &p) { p|from_pe; p|pe_speed; +#if CMK_CUDA || CMK_HIP + p|gpu_device_id; + p|gpu_mem_remaining; + p|pool_buff_mem_remaining; +#endif p|total_walltime; p|idletime; #if defined(TEMP_LDB) diff --git a/src/ck-ldb/CentralLB.ci b/src/ck-ldb/CentralLB.ci index 59694ce685..acab70ec82 100644 --- a/src/ck-ldb/CentralLB.ci +++ b/src/ck-ldb/CentralLB.ci @@ -31,11 +31,13 @@ group [migratable] CentralLB : BaseLB { entry [reductiontarget] void ProcessReceiveMigration(); entry [reductiontarget] void ProcessMigrationDecision(); entry void MissMigrate(int); + entry void CallLB(); entry void CheckForRealloc(); entry void ResumeFromReallocCheckpoint(); entry void MigrationDoneImpl(int); entry void WillIbekilled(std::vector avail, int); - entry void StartCleanup(); + entry [reductiontarget] void StartCleanup(); + entry [reductiontarget] void CheckForLB(); }; }; diff --git a/src/ck-ldb/CentralLB.h b/src/ck-ldb/CentralLB.h index bbc9a5c140..8a4366b080 100644 --- a/src/ck-ldb/CentralLB.h +++ b/src/ck-ldb/CentralLB.h @@ -12,6 +12,7 @@ #include 
#include "pup_stl.h" #include "manager.h" +#include "ckcheckpoint.h" extern CkGroupID loadbalancer; void CreateCentralLB(); @@ -96,6 +97,7 @@ class CentralLB : public CBase_CentralLB int GetPESpeed(); inline void setConcurrent(bool c) { concurrent = c; } + void CallLB(); void InvokeLB(); // Everything is at the PE barrier void ProcessAtSync(void); // Receive a message from AtSync to avoid // making projections output look funny @@ -121,6 +123,7 @@ class CentralLB : public CBase_CentralLB void MissMigrate(int waitForBarrier); //Shrink-Expand related functions + void CheckForLB(); void CheckForRealloc (); void ResumeFromReallocCheckpoint(); void MigrationDoneImpl (int ); @@ -283,6 +286,11 @@ class CLBStatsMsg { int from_pe; int pe_speed; +#if CMK_CUDA || CMK_HIP + size_t gpu_mem_remaining; + size_t pool_buff_mem_remaining; + uint64_t gpu_device_id; +#endif LBRealType total_walltime; LBRealType idletime; LBRealType bg_walltime; @@ -298,7 +306,11 @@ class CLBStatsMsg { public: CLBStatsMsg(int osz, int csz); - CLBStatsMsg(): from_pe(0), pe_speed(0), total_walltime(0.0), idletime(0.0), + CLBStatsMsg(): from_pe(0), pe_speed(0), +#if CMK_CUDA || CMK_HIP + gpu_device_id(-1), gpu_mem_remaining(0), pool_buff_mem_remaining(0), +#endif + total_walltime(0.0), idletime(0.0), bg_walltime(0.0), #if defined(TEMP_LDB) pe_temp(1.0), diff --git a/src/ck-ldb/CommonLBs.ci b/src/ck-ldb/CommonLBs.ci index 436ffa6729..f76b21f308 100644 --- a/src/ck-ldb/CommonLBs.ci +++ b/src/ck-ldb/CommonLBs.ci @@ -5,6 +5,8 @@ module CommonLBs { extern module DistributedLB; extern module MetisLB; extern module RecBipartLB; + extern module GreedyCentralLB; + extern module GreedyRefineCentralLB; initnode void initCommonLBs(void); }; diff --git a/src/ck-ldb/EveryLB.ci b/src/ck-ldb/EveryLB.ci index a634d9c9e0..4bfc0fafb0 100644 --- a/src/ck-ldb/EveryLB.ci +++ b/src/ck-ldb/EveryLB.ci @@ -5,6 +5,8 @@ module EveryLB { extern module DistributedLB; extern module MetisLB; extern module RecBipartLB; + extern module 
GreedyCentralLB; + extern module GreedyRefineCentralLB; initnode void initEveryLB(void); }; diff --git a/src/ck-ldb/GreedyCentralLB.C b/src/ck-ldb/GreedyCentralLB.C new file mode 100644 index 0000000000..e82d0dd626 --- /dev/null +++ b/src/ck-ldb/GreedyCentralLB.C @@ -0,0 +1,323 @@ +/** + * \addtogroup CkLdb +*/ +/*@{*/ + +/* + status: + * support processor avail bitvector + * support nonmigratable attrib + nonmigratable object load is added to its processor's background load + and the nonmigratable object is not taken in the objData array +*/ + +#include +#include + +#include "charm++.h" + + +#include "ckgraph.h" +#include "cklists.h" +#include "GreedyCentralLB.h" +#include "conv-mach-cuda.h" +#include "conv-mach-hip.h" + +using namespace std; + +extern int quietModeRequested; + +CreateLBFunc_Def(GreedyCentralLB, "always assign the heaviest obj onto lightest loaded processor.") + +GreedyCentralLB::GreedyCentralLB(const CkLBOptions &opt): CBase_GreedyCentralLB(opt) +{ + lbname = "GreedyCentralLB"; + if (CkMyPe()==0 && !quietModeRequested) + CkPrintf("CharmLB> GreedyCentralLB created.\n"); +} + +bool GreedyCentralLB::QueryBalanceNow(int _step) +{ + // CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step); + return true; +} + +class GreedyCentralLB::ProcLoadGreater { + public: + bool operator()(const ProcInfo &p1, const ProcInfo &p2) { + return (p1.getTotalLoad() > p2.getTotalLoad()); + } +}; + +class GreedyCentralLB::ObjLoadGreater { + public: + bool operator()(const CkVertex &v1, const CkVertex &v2) { + return (v1.getCompLoad() > v2.getCompLoad()); + } +}; + +#if CMK_CUDA || CMK_HIP +// A group of PEs that share the same GPU device. +// Load balancing reasons at this level: the GPU is the bottleneck, +// so we distribute objects across GPUs, not across individual PEs. 
+struct GPUGroup { + int gpu_id; // GPU device id + double totalLoad; // aggregate GPU load across all PEs in this group + std::vector pe_indices; // indices into the procs vector +}; +#endif + +void GreedyCentralLB::work(LDStats* stats) +{ + int obj, objCount, pe; + int n_pes = stats->nprocs(); + int *map = new int[n_pes]; + + std::vector procs; + for(pe = 0; pe < n_pes; pe++) { + map[pe] = -1; + if (stats->procs[pe].available) { + map[pe] = procs.size(); + procs.push_back(ProcInfo(pe, stats->procs[pe].bg_walltime, 0.0, stats->procs[pe].pe_speed, true)); + } + } + + // take non migratable object load as background load + for (obj = 0; obj < stats->objData.size(); obj++) + { + LDObjData &oData = stats->objData[obj]; + if (!oData.migratable) { + int pe = stats->from_proc[obj]; + pe = map[pe]; + if (pe==-1) + CmiAbort("GreedyCentralLB: nonmigratable object on an unavail processor!\n"); +#if CMK_CUDA || CMK_HIP + procs[pe].setOverhead(procs[pe].getOverhead() + std::max(oData.wallTime, oData.gpuTime)); +#else + procs[pe].setOverhead(procs[pe].getOverhead() + oData.wallTime); +#endif + } + } + delete [] map; + + // Add the overhead to the total load + for (pe = 0; pe objs; + + for(int obj = 0; obj < stats->objData.size(); obj++) { + LDObjData &oData = stats->objData[obj]; + int pe = stats->from_proc[obj]; + if (!oData.migratable) { + if (!stats->procs[pe].available) + CmiAbort("GreedyCentralLB cannot handle nonmigratable object on an unavial processor!\n"); + continue; + } +#if CMK_CUDA || CMK_HIP + // Use whichever is the bottleneck: CPU wall time or GPU kernel time + double load = std::max(oData.wallTime, oData.gpuTime) * stats->procs[pe].pe_speed; + CkPrintf("[%d] GreedyCentralLB obj %d (PE %d): gpuTime=%.6f wallTime=%.6f load=%.6f\n", + CkMyPe(), obj, pe, oData.gpuTime, oData.wallTime, load); +#else + double load = oData.wallTime * stats->procs[pe].pe_speed; +#endif + objs.push_back(CkVertex(obj, load, stats->objData[obj].migratable, stats->from_proc[obj])); + } + + 
// max heap of objects (heaviest first) + sort(objs.begin(), objs.end(), GreedyCentralLB::ObjLoadGreater()); + + if (_lb_args.debug()>1) + CkPrintf("[%d] In GreedyCentralLB strategy\n",CkMyPe()); + + int nmoves = 0; + +#if CMK_CUDA || CMK_HIP + // ---- GPU-aware greedy: balance across GPU groups, not individual PEs ---- + + // Build GPU groups: map gpu_device_id -> GPUGroup + // With typical counts (2-8 GPUs), linear scan beats a heap and avoids + // heap-invariant headaches when we update a non-front group in place. + std::vector gpuGroups; + std::unordered_map gpuIdToIdx; // gpu_device_id -> index in gpuGroups + CkPrintf("starting stratergy for GPU\n"); + for (int i = 0; i < (int)procs.size(); i++) { + int real_pe = procs[i].getProcId(); + uint64_t gpu_id = stats->procs[real_pe].gpu_device_id; + printf("gpu_id %ld\n", gpu_id); + fflush(stdout); + + auto it = gpuIdToIdx.find(gpu_id); + if (it == gpuIdToIdx.end()) { + gpuIdToIdx[gpu_id] = gpuGroups.size(); + GPUGroup g; + g.gpu_id = gpu_id; + g.totalLoad = procs[i].getTotalLoad(); + g.pe_indices.push_back(i); + gpuGroups.push_back(std::move(g)); + } else { + gpuGroups[it->second].totalLoad += procs[i].getTotalLoad(); + gpuGroups[it->second].pe_indices.push_back(i); + } + } + + // Reverse map: real PE -> index in gpuGroups + std::unordered_map peToGroupIdx; + for (int gi = 0; gi < (int)gpuGroups.size(); gi++) { + for (int pidx : gpuGroups[gi].pe_indices) { + peToGroupIdx[procs[pidx].getProcId()] = gi; + } + } + + CkPrintf("[%d] GreedyCentralLB: %ld GPU group(s), %ld available PEs, %ld migratable objs\n", + CkMyPe(), (int)gpuGroups.size(), (int)procs.size(), (int)objs.size()); + for (auto &g : gpuGroups) { + CkPrintf("[%d] GPU %ld: %ld PEs, aggregate load=%.6f\n", + CkMyPe(), g.gpu_id, (int)g.pe_indices.size(), g.totalLoad); + } + + // Greedy with locality preference: + // For each object (heaviest first), find the lightest GPU group. + // If the object's current GPU group has comparable load, keep it there. 
+ // Within the chosen group, prefer the object's current PE if it belongs + // to that group; otherwise pick the lightest PE. + for (obj = 0; obj < (int)objs.size(); obj++) { + const int from_pe = objs[obj].getCurrentPe(); + const int id = objs[obj].getVertexId(); + double obj_load = objs[obj].getCompLoad(); + if (obj_load <= 0.0) obj_load = 1e-6; + + // Find lightest GPU group (linear scan — few groups) + int lightest_gi = 0; + for (int gi = 1; gi < (int)gpuGroups.size(); gi++) { + if (gpuGroups[gi].totalLoad < gpuGroups[lightest_gi].totalLoad) + lightest_gi = gi; + } + + // Check if object's current group is close enough to the lightest + int chosen_gi = lightest_gi; + auto curIt = peToGroupIdx.find(from_pe); + if (curIt != peToGroupIdx.end()) { + int cur_gi = curIt->second; + if (gpuGroups[cur_gi].totalLoad <= gpuGroups[lightest_gi].totalLoad + 0.01) { + chosen_gi = cur_gi; // stay on current GPU + } + } + GPUGroup &g = gpuGroups[chosen_gi]; + + // Within the chosen group, prefer the current PE if it belongs here + int best_idx = -1; + if (chosen_gi == (curIt != peToGroupIdx.end() ? 
curIt->second : -1)) { + // Object's current PE is in this group — use it + for (int k = 0; k < (int)g.pe_indices.size(); k++) { + if (procs[g.pe_indices[k]].getProcId() == from_pe) { + best_idx = g.pe_indices[k]; + break; + } + } + } + if (best_idx < 0) { + // Pick the lightest PE in the group + best_idx = g.pe_indices[0]; + double best_load = procs[best_idx].getTotalLoad(); + for (int k = 1; k < (int)g.pe_indices.size(); k++) { + double pl = procs[g.pe_indices[k]].getTotalLoad(); + if (pl < best_load) { + best_load = pl; + best_idx = g.pe_indices[k]; + } + } + } + + ProcInfo &p = procs[best_idx]; + double scaled_load = obj_load / p.getPeSpeed(); + p.setTotalLoad(p.getTotalLoad() + scaled_load); + g.totalLoad += scaled_load; + + // Record migration only if PE actually changed + const int dest = p.getProcId(); + if (dest != from_pe) { + stats->to_proc[id] = dest; + nmoves++; + if (_lb_args.debug() > 2) + CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(), id, from_pe, dest); + } + } + + for (int gi = 0; gi < (int)gpuGroups.size(); gi++) { + CkPrintf("gpu group %d load: %f\n", gi, gpuGroups[gi].totalLoad); + // if ( < gpuGroups[lightest_gi].totalLoad) + // lightest_gi = gi; + } + +#else + // ---- Original PE-level greedy (non-GPU path) ---- + + // min heap of processors (lightest first) + make_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater()); + + // greedy algorithm: assign heaviest object to lightest processor + // Use getCompLoad() to avoid the 0.1 floor in getVertexLoad() which + // destroys load differentiation for fine-grained GPU workloads + for (obj=0; obj < objs.size(); obj++) { + ProcInfo p = procs.front(); + pop_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater()); + procs.pop_back(); + + double obj_load = objs[obj].getCompLoad(); + if (obj_load <= 0.0) obj_load = 1e-6; + p.setTotalLoad(p.getTotalLoad() + obj_load / p.getPeSpeed()); + + //Insert object into migration queue if necessary + const int dest = 
p.getProcId(); + const int from_pe = objs[obj].getCurrentPe(); + const int id = objs[obj].getVertexId(); + if (dest != from_pe) { + stats->to_proc[id] = dest; + nmoves ++; + if (_lb_args.debug()>2) + CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(),id,from_pe,dest); + } + + //Insert the least loaded processor with load updated back into the heap + procs.push_back(p); + push_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater()); + } +#endif + + CkPrintf("[%d] GreedyCentralLB: %d objects migrating.\n", CkMyPe(), nmoves); + + if (_lb_args.debug()>1) { + CkPrintf("CharmLB> Min obj: %f Max obj: %f\n", objs[objs.size()-1].getCompLoad(), objs[0].getCompLoad()); + CkPrintf("CharmLB> PE speed:\n"); + for (pe = 0; pe PE Load:\n"); + for (pe = 0; pe max_load) { + max_load = procs[pe].getTotalLoad(); + } + avg_load += procs[pe].getTotalLoad(); + } + + stats->after_lb_max = max_load; + stats->after_lb_avg = avg_load/procs.size(); + stats->is_prev_lb_refine = 0; + if (_lb_args.debug() > 0) + CkPrintf("GreedyCentralLB> After lb max load: %lf avg load: %lf\n", max_load, avg_load/procs.size()); + } +} + +#include "GreedyCentralLB.def.h" diff --git a/src/ck-ldb/GreedyCentralLB.ci b/src/ck-ldb/GreedyCentralLB.ci new file mode 100644 index 0000000000..2883616b93 --- /dev/null +++ b/src/ck-ldb/GreedyCentralLB.ci @@ -0,0 +1,9 @@ +module GreedyCentralLB { + + extern module CentralLB; + initnode void lbinit(void); + + group [migratable] GreedyCentralLB : CentralLB { + entry void GreedyCentralLB(const CkLBOptions &); + }; +}; \ No newline at end of file diff --git a/src/ck-ldb/GreedyCentralLB.h b/src/ck-ldb/GreedyCentralLB.h new file mode 100644 index 0000000000..cb0c6aef4f --- /dev/null +++ b/src/ck-ldb/GreedyCentralLB.h @@ -0,0 +1,45 @@ +/** + * \addtogroup CkLdb +*/ +/*@{*/ + +#ifndef _GreedyCentralLB_H_ +#define _GreedyCentralLB_H_ + +#define __DEBUG_GREEDY_REFINE_ 1 + +#include "CentralLB.h" +#include "GreedyCentralLB.decl.h" + +void CreateGreedyCentralLB(); 
+BaseLB * AllocateGreedyCentralLB(); + +class GreedyCentralLB : public CBase_GreedyCentralLB { + +public: + struct HeapData { + double load; + int pe; + int id; + }; + + GreedyCentralLB(const CkLBOptions &); + GreedyCentralLB(CkMigrateMessage *m):CBase_GreedyCentralLB(m) { lbname = "GreedyCentralLB"; } + void work(LDStats* stats); +private: + class ProcLoadGreater; + class ObjLoadGreater; + + enum HeapCmp {GT = '>', LT = '<'}; + void Heapify(HeapData*, int, int, HeapCmp); + void HeapSort(HeapData*, int, HeapCmp); + void BuildHeap(HeapData*, int, HeapCmp); + bool Compare(double, double, HeapCmp); + HeapData* BuildCpuArray(BaseLB::LDStats*, int, int *); + HeapData* BuildObjectArray(BaseLB::LDStats*, int, int *); + bool QueryBalanceNow(int step); +}; + +#endif /* _HEAPCENTLB_H_ */ + +/*@}*/ \ No newline at end of file diff --git a/src/ck-ldb/GreedyRefineCentralLB.C b/src/ck-ldb/GreedyRefineCentralLB.C new file mode 100644 index 0000000000..8aac6cdf21 --- /dev/null +++ b/src/ck-ldb/GreedyRefineCentralLB.C @@ -0,0 +1,803 @@ +/** + * \addtogroup CkLdb +*/ +/*@{*/ + +/** + * Author: jjgalvez@illinois.edu (Juan Galvez) + * Greedy algorithm to minimize cpu max_load and object migrations. + * Can find solution equal or close to regular Greedy with less (sometimes much less) migrations. + * The amount of migrations that the user can tolerate is passed via the command-line + * option +LBPercentMoves (as percentage of chares that can be moved). + * + * If LBPercentMoves is not passed, strategy assumes it can move all objects. + * In this case, the algorithm will give preference to minimizing cpu max_load. 
+ * It will still move less than greedy, but the amount of migrations + * will depend very much on the particular case (object load distribution and processor background loads), + * + * supports processor avail bitvector + * supports nonmigratable attrib + * +*/ + +#include "charm++.h" +#include "ckgraph.h" +#include "GreedyRefineCentralLB.h" + +#include +#include +#include +#include +#if CMK_CUDA || CMK_HIP +CkpvExtern(int, _lb_obj_index); +#include +#endif + +extern int quietModeRequested; + +// a solution is feasible if num migrations <= user-specified limit +// LOAD_MIG_BAL is used to control tradeoff between maxload and migrations +// when selecting solutions from the feasible set +#define LOAD_MIG_BAL 1.003 + +using namespace std; + +class GreedyRefineCentralLB::Solution { +public: + Solution() {} + Solution(int pe, double maxLoad, int nmoves) : pe(pe), max_load(maxLoad), migrations(nmoves) {} + int pe; // pe who produced this solution + float max_load; + int migrations; + + void pup(PUP::er &p) { + p|pe; + p|max_load; + p|migrations; + } +}; + +// custom heap to allow removal of processors from any position +class GreedyRefineCentralLB::PHeap { +public: + PHeap(int numpes) { + Q.reserve(numpes+1); + Q.push_back(NULL); // first element of the array is NULL + } + + void addProcessors(std::vector &procs, bool bgLoadZero, bool insert=true) { + for (int i=0; i < procs.size(); i++) { + GreedyRefineCentralLB::GProc &p = procs[i]; + if (p.available) { + p.load = p.bgload; + if (insert) { + Q.push_back(&p); + p.pos = Q.size()-1; + } + } + } + if (!bgLoadZero) buildMinHeap(); + } + + inline GreedyRefineCentralLB::GProc *top() const { + CkAssert(Q.size() > 1); + return Q[1]; + } + + inline void push(GreedyRefineCentralLB::GProc *p) { + Q.push_back(p); + p->pos = Q.size()-1; + siftUp(p->pos); + } + + inline GreedyRefineCentralLB::GProc *pop() { + if (Q.size() == 1) return NULL; + GreedyRefineCentralLB::GProc *retval; + if (Q.size() == 2) { + retval = Q[1]; + 
Q.pop_back(); + return retval; + } + retval = Q[1]; + Q[1] = Q.back(); + Q.pop_back(); + Q[1]->pos = 1; + siftDown(1); + return retval; + } + + // remove processor from any position in the heap + void remove(GreedyRefineCentralLB::GProc *p) { + int pos = p->pos; + if ((Q.size() == 2) || (pos == Q.size()-1)) return Q.pop_back(); + if (pos == 1) { pop(); return; } + Q[pos] = Q.back(); + Q.pop_back(); + Q[pos]->pos = pos; + if (Q[pos/2]->load > Q[pos]->load) siftUp(pos); + else siftDown(pos); + } + + inline void clear() { + Q.clear(); + Q.push_back(NULL); + } + +private: + + void min_heapify(int i) { + const int left = 2*i; + const int right = 2*i + 1; + int smallest = i; + if ((left < Q.size()) && (Q[left]->load < Q[smallest]->load)) smallest = left; + if ((right < Q.size()) && (Q[right]->load < Q[smallest]->load)) smallest = right; + if (smallest != i) { + swap(i,smallest); + Q[i]->pos = i; + Q[smallest]->pos = smallest; + min_heapify(smallest); + } + } + + void inline buildMinHeap() { + for (int i=Q.size()/2; i > 0; i--) min_heapify(i); + } + + inline void swap(int pos1, int pos2) { + GreedyRefineCentralLB::GProc *t = Q[pos1]; + Q[pos1] = Q[pos2]; + Q[pos2] = t; + } + + void siftUp(int pos) { + if (pos == 1) return; // reached root + int ppos = pos/2; + if (Q[ppos]->load > Q[pos]->load) { + swap(ppos,pos); + Q[ppos]->pos = ppos; + Q[pos]->pos = pos; + siftUp(ppos); + } + } + + inline int minChild(int pos) const { + int c1 = pos*2; + int c2 = pos*2 + 1; + if (c1 >= Q.size()) return -1; + if (c2 >= Q.size()) return c1; + if (Q[c1]->load < Q[c2]->load) return c1; + else return c2; + } + + void siftDown(int pos) { + int cpos = minChild(pos); + if (cpos == -1) return; + if (Q[pos]->load > Q[cpos]->load) { + swap(pos,cpos); + Q[cpos]->pos = cpos; + Q[pos]->pos = pos; + siftDown(cpos); + } + } + + std::vector Q; +}; + +CreateLBFunc_Def(GreedyRefineCentralLB, "Greedy refinement-based algorithm") + +GreedyRefineCentralLB::GreedyRefineCentralLB(const CkLBOptions &opt): 
CBase_GreedyRefineCentralLB(opt), migrationTolerance(1.0) +{ + lbname = "GreedyRefineCentralLB"; + if ((CkMyPe() == 0) && !quietModeRequested) + CkPrintf("CharmLB> GreedyRefineCentralLB created.\n"); + if (_lb_args.percentMovesAllowed() < 100) { + migrationTolerance = float(_lb_args.percentMovesAllowed())/100.0; + } + concurrent = true; +} + +GreedyRefineCentralLB::GreedyRefineCentralLB(CkMigrateMessage *m): CBase_GreedyRefineCentralLB(m), migrationTolerance(1.0) { + lbname = "GreedyRefineCentralLB"; + if (_lb_args.percentMovesAllowed() < 100) + migrationTolerance = float(_lb_args.percentMovesAllowed())/100.0; + concurrent = true; +} + +// ------------------------------------------------ + +// regular greedy lb algorithm +double GreedyRefineCentralLB::greedyLB(const std::vector &pobjs, + GreedyRefineCentralLB::PHeap &procHeap, + const BaseLB::LDStats *stats) const +{ + double max_load = 0; + int nmoves = 0; + for (int i=0; i < pobjs.size(); i++) { + const GreedyRefineCentralLB::GObj *obj = pobjs[i]; + GreedyRefineCentralLB::GProc *p = procHeap.pop(); // least loaded processor + // update processor load + p->load += (obj->load / p->speed); + procHeap.push(p); + + if (p->id != obj->oldPE) nmoves++; + if (p->load > max_load) max_load = p->load; + } + + if ((CkMyPe() == cur_ld_balancer+1) && (_lb_args.debug() > 1)) { + CkPrintf("[%d] %f : Greedy strategy nmoves=%d, max_load=%f\n", CkMyPe(), + CkWallTimer() - strategyStartTime, nmoves, max_load); + } + return max_load; +} + +// ----------------------------------------------- +#if __DEBUG_GREEDY_REFINE_ +#include +void GreedyRefineCentralLB::dumpObjLoads(std::vector &objs) { + std::ofstream outfile("objloads.txt"); + outfile << objs.size() << std::endl; + for (int i=0; i < objs.size(); i++) { + GreedyRefineCentralLB::GObj &obj = objs[i]; + if ((i > 0) && (i % 100 == 0)) outfile << obj.load << std::endl; + else outfile << obj.load << " "; + } + outfile.close(); +} +void GreedyRefineCentralLB::dumpProcLoads(std::vector 
&procs) { + std::ofstream outfile("proc_bg_loads.txt"); + outfile << procs.size() << std::endl; + for (int i=0; i < procs.size(); i++) { + GreedyRefineCentralLB::GProc &p = procs[i]; + if ((i > 0) && (i % 100 == 0)) outfile << p.load << std::endl; + else outfile << p.load << " "; + } + outfile.close(); +} +#endif + +double GreedyRefineCentralLB::fillData(LDStats *stats, + std::vector &objs, + std::vector &pobjs, + std::vector &procs, + PHeap &procHeap) +{ + const int n_pes = stats->nprocs(); + const int n_objs = stats->n_migrateobjs; + // most of these variables are just for printing stats when _lb_args.debug() + int unmigratableObjs = 0; + availablePes = 0; totalObjLoad = 0; + double minBGLoad = DBL_MAX; double avgBGLoad = 0; double maxBGLoad = 0; + double minSpeed = DBL_MAX; double maxSpeed = 0; double avgSpeed = 0; + double minOload = DBL_MAX; double maxOload = 0; + + for (int pe=0; pe < n_pes; pe++) { + GreedyRefineCentralLB::GProc &p = procs[pe]; + p.id = pe; + p.available = stats->procs[pe].available; + p.speed = stats->procs[pe].pe_speed; + if (p.available) { + availablePes++; + #if !(CMK_CUDA || CMK_HIP) + p.bgload = stats->procs[pe].bg_walltime; + if (p.bgload > maxBGLoad) maxBGLoad = p.bgload; + #else + p.bgload = 0.0; + #endif + + #if (CMK_CUDA || CMK_HIP) + p.bg_walltime = stats->procs[pe].bg_walltime; + // CmiPrintf("[%d] settign bg_walltime to %f\n", pe, p.bg_walltime); + #endif + if (_lb_args.debug() > 1) { + double &speed = stats->procs[pe].pe_speed; + if (speed < minSpeed) minSpeed = speed; + if (speed > maxSpeed) maxSpeed = speed; + avgSpeed += speed; + } + } + } + if (!availablePes) CkAbort("GreedyRefineCentralLB: No available processors\n"); + + for (int i=0; i < n_objs; i++) { + LDObjData &oData = stats->objData[i]; + GreedyRefineCentralLB::GObj &obj = objs[i]; + int pe = stats->from_proc[i]; + obj.id = i; + obj.oldPE = pe; + obj.gpuPupSize = oData.gpuPupSize; + obj.gpuAllocSize = *(size_t *)oData.getUserData(CkpvAccess(_lb_obj_index)); + 
CkAssert(pe >= 0 && pe <= n_pes); + if (pe == n_pes) obj.oldPE = -1; // this can happen in HybridLB if object comes from outside group. mark oldPE as -1 in this situation + if (!oData.migratable) { + CkAssert(pe < n_pes); + unmigratableObjs++; + GreedyRefineCentralLB::GProc &p = procs[pe]; + if (!p.available) + CkAbort("GreedyRefineCentralLB: nonmigratable object on unavailable processor\n"); +#if CMK_CUDA || CMK_HIP + double nmObjLoad = oData.gpuTime; +#else + double nmObjLoad = oData.wallTime; +#endif + p.bgload += nmObjLoad; // take non-migratable object load as background load + //is the non migratable obj load correct + CkPrintf("[%d] Obj %d on PE %d is non-migratable, load=%.6f\n", CkMyPe(), i, pe, nmObjLoad); + if (p.bgload > maxBGLoad) maxBGLoad = p.bgload; + } else { +#if CMK_CUDA || CMK_HIP + obj.load = oData.gpuTime * stats->procs[pe].pe_speed; +#else + obj.load = oData.wallTime * stats->procs[pe].pe_speed; +#endif + // CkPrintf("[%d] Obj %d on PE %d is migratable, load=%.6f, GPU pup size=%ld, GPU alloc size=%ld\n", CkMyPe(), i, pe, obj.load, oData.gpuPupSize, obj.gpuAllocSize); + pobjs.push_back(&obj); + totalObjLoad += obj.load; + if (_lb_args.debug() > 1) { + if (obj.load < minOload) minOload = obj.load; + if (obj.load > maxOload) maxOload = obj.load; +#if CMK_CUDA || CMK_HIP + // CkPrintf("[%d] Obj %d (PE %d): wallTime=%.6f gpuTime=%.6f effectiveLoad=%.6f\n", + // CkMyPe(), i, pe, oData.wallTime, oData.gpuTime, obj.load); +#endif + } + } + } + + procHeap.addProcessors(procs, (maxBGLoad <= 0.001), true); + + // ---- print some stats ---- + // CkPrintf("here\n") + if ((_lb_args.debug() > 1) && (!concurrent || (CkMyPe() == cur_ld_balancer))) { + for (int pe=0; pe < n_pes; pe++) { + GreedyRefineCentralLB::GProc &p = procs[pe]; + if (!p.available) continue; + if (p.bgload < minBGLoad) minBGLoad = p.bgload; + avgBGLoad += p.bgload; + } + CkPrintf("[%d] GreedyRefineCentralLB: num pes=%d, num objs=%d\n", CkMyPe(), n_pes, n_objs); + CkPrintf("[%d] Unavailable 
processors=%d, Unmigratable objs=%d\n", CkMyPe(), n_pes - availablePes, unmigratableObjs); + CkPrintf("[%d] min_bgload=%f mean_bgload=%f max_bgload=%f\n", CkMyPe(), minBGLoad, (avgBGLoad / availablePes), maxBGLoad); + CkPrintf("[%d] min_oload=%f mean_oload=%f max_oload=%f\n", CkMyPe(), minOload, (totalObjLoad / (n_objs - unmigratableObjs)), maxOload); + CkPrintf("[%d] min_speed=%f mean_speed=%f max_speed=%f\n", CkMyPe(), minSpeed, (avgSpeed / availablePes), maxSpeed); + + double maxLoad = 0; + double minLoad = FLT_MAX; + std::vector ploads(n_pes, -1); + for (int i=0; i < n_objs; i++) { + GreedyRefineCentralLB::GObj &o = objs[i]; + int pe = o.oldPE; + if (pe < 0) continue; + if (ploads[pe] < 0) ploads[pe] = procs[pe].bgload; + if (stats->objData[i].migratable) // load for this object is already counted if !migratable + ploads[pe] += o.load; + if (ploads[pe] > maxLoad) maxLoad = ploads[pe]; + if (ploads[pe] < minLoad) minLoad = ploads[pe]; + } + CkPrintf("[%d] maxload with current map=%f\n", CkMyPe(), maxLoad); + CkPrintf("[%d] minload with current map=%f\n", CkMyPe(), minLoad); + + // CkPrintf("[%d] --- Per-PE loads before LB ---\n", CkMyPe()); + // for (int pe=0; pe < n_pes; pe++) { + // if (ploads[pe] >= 0) + // CkPrintf("[%d] PE %d: totalLoad=%.6f bgLoad=%.6f\n", + // CkMyPe(), pe, ploads[pe], procs[pe].bgload); + // } + + //CkPrintf("[%d] %f : Filled proc and obj stats\n", CkMyPe(), CkWallTimer() - strategyStartTime); + } + + return maxBGLoad; +} + +static const float Avals[] = {1.0, 1.005, 1.01, 1.015, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.16, 1.20, 1.30}; +static const float Bvals[] = {FLT_MAX, 1.0, 1.05, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3}; +#define Avals_len 14 +#define Bvals_len 16 +#define NUM_SOLUTIONS Avals_len*Bvals_len+1 +static void getGreedyRefineParams(int rank, float &A, float &B) { + if (rank == 0) { A = 0; B = -1; return; } // causes PE0 to run regular greedy + rank--; + int x = rank / Bvals_len; + if (x >= 
Avals_len) { + A = B = -1; + } else { + A = Avals[x]; + B = Bvals[rank % Bvals_len]; + } +} + +void GreedyRefineCentralLB::sendSolution(double maxLoad, int migrations) +{ + // gather results in central PE, who will decide which solution is the best + // only the objective values of the solutions are sent, not the whole solutions + + GreedyRefineCentralLB::Solution sol(CkMyPe(), maxLoad, migrations); + size_t buf_size = sizeof(GreedyRefineCentralLB::Solution); + void *buffer = malloc(buf_size); + PUP::toMem pd(buffer); + pd|sol; + + CkCallback cb(CkIndex_GreedyRefineCentralLB::receiveSolutions((CkReductionMsg*)NULL), thisProxy[cur_ld_balancer]); + contribute(buf_size, buffer, CkReduction::set, cb); + + if ((_lb_args.debug() > 1) && (CkMyPe() == cur_ld_balancer)) { + CkPrintf("[%d] %f : Called gather/reduction\n", CkMyPe(), CkWallTimer() - strategyStartTime); + } + + free(buffer); +} + +void GreedyRefineCentralLB::work(LDStats *stats) +{ + strategyStartTime = CkWallTimer(); + float A = 1.001, B = FLT_MAX; // Use A=0, B=-1 to imitate regular Greedy (ignore migrations) + if (concurrent) { + getGreedyRefineParams(CkMyPe(), A, B); + if (A < 0) { + sendSolution(-1,-1); // send empty response to PE0 + return; + } + } + + const int n_pes = stats->nprocs(); + totalObjs = stats->n_migrateobjs; + + std::vector objs(totalObjs); + // will sort pobjs instead of objs (faster swapping). will only contain pointers + // to migratable objects + std::vector pobjs; + pobjs.reserve(totalObjs); + + std::vector procs(n_pes); + PHeap procHeap(n_pes); + + // fill data structures used by algorithm + double maxLoad = fillData(stats, objs, pobjs, procs, procHeap); + + // ------------ apply greedy refine algorithm -------------- + + std::sort(pobjs.begin(), pobjs.end(), GreedyRefineCentralLB::ObjLoadGreater()); + + int nmoves = 0; + double greedyMaxLoad = 0; + +#if CMK_CUDA || CMK_HIP + // ---- GPU-aware path: balance at GPU-group level ---- + // + // Group PEs by gpu_device_id. 
M tracks the max *GPU-group* aggregate load. + // greedyLB preprocessing computes M at GPU-group level. + // Main loop: pop lightest GPU group, assign object to lightest PE in that group. + + // --- Build GPU groups from the per-PE procs vector --- + + struct GPUGrp { + uint64_t gpu_id; + double load; // aggregate load across PEs in this group + std::vector peIds; // indices into procs[] + size_t gpu_mem_remaining; + size_t pool_buff_mem_remaining; + }; + + std::vector gpuGroups; + std::unordered_map gpuIdToIdx; + + for (int pe = 0; pe < n_pes; pe++) { + GreedyRefineCentralLB::GProc &p = procs[pe]; + if (!p.available) continue; + uint64_t devId = stats->procs[pe].gpu_device_id; + size_t gpu_mem_remaining = stats->procs[pe].gpu_mem_remaining; + size_t pool_buff_mem_remaining = stats->procs[pe].pool_buff_mem_remaining; + // printf("pe gpu_id %ld\n", devId); + + auto it = gpuIdToIdx.find(devId); + if (it == gpuIdToIdx.end()) { + gpuIdToIdx[devId] = gpuGroups.size(); + GPUGrp g; + g.gpu_id = devId; + g.load = p.bgload; + g.peIds.push_back(pe); + g.gpu_mem_remaining = gpu_mem_remaining; + g.pool_buff_mem_remaining = pool_buff_mem_remaining; + gpuGroups.push_back(std::move(g)); + } else { + gpuGroups[it->second].load += p.bgload; + gpuGroups[it->second].peIds.push_back(pe); + } + } + int nGroups = gpuGroups.size(); + + // CkPrintf("[%d] GreedyRefineCentralLB: %d GPU group(s), %d available PEs, %d migratable objs\n", + // CkMyPe(), nGroups, availablePes, (int)pobjs.size()); + // for (auto &g : gpuGroups) + // CkPrintf("[%d] GPU %llu: %d PEs, bgload=%.6f, gpu_mem_remaining=%ld, pool_buff_mem_remaining=%ld\n", + // CkMyPe(), g.gpu_id, (int)g.peIds.size(), g.load, g.gpu_mem_remaining, g.pool_buff_mem_remaining); + + // --- Greedy preprocessing at GPU-group level to establish target M --- + // Reset group loads to bg only, then greedily assign objects to get M. 
+ double M = 0; + { + // Save a copy of group bg loads + std::vector grpLoad(nGroups); + for (int gi = 0; gi < nGroups; gi++) grpLoad[gi] = gpuGroups[gi].load; + + for (int i = 0; i < (int)pobjs.size(); i++) { + // Find lightest GPU group + int lightest = 0; + for (int gi = 1; gi < nGroups; gi++) { + if (grpLoad[gi] < grpLoad[lightest]) lightest = gi; + } + grpLoad[lightest] += pobjs[i]->load; + if (grpLoad[lightest] > M) M = grpLoad[lightest]; + } + greedyMaxLoad = M; + } + M *= A; + // CkPrintf("M is %f\n", M); + + // Reset GPU group loads back to bg-only for the real assignment pass + for (int gi = 0; gi < nGroups; gi++) { + gpuGroups[gi].load = 0; + for (int pe : gpuGroups[gi].peIds) + gpuGroups[gi].load += procs[pe].bgload; + } + // Also reset per-PE loads in procHeap to bgload + procHeap.addProcessors(procs, (maxLoad <= 0.001), false); + + // if ((_lb_args.debug() > 0) && (CkMyPe() == cur_ld_balancer)) + // CkPrintf("[%d] GPU greedy-refine: M(target)=%.6f, A=%.3f, B=%.3f\n", CkMyPe(), M, A, B); + + // Reverse map: PE index -> GPU group index + std::unordered_map peToGrpIdx; + for (int gi = 0; gi < nGroups; gi++) + for (int pe : gpuGroups[gi].peIds) + peToGrpIdx[pe] = gi; + + for (int i = 0; i < (int)pobjs.size(); i++) { + const GreedyRefineCentralLB::GObj *obj = pobjs[i]; + double obj_load = obj->load; + + int lightest_gi = 0; + for (int gi = 1; gi < nGroups; gi++) { + if (gpuGroups[gi].load < gpuGroups[lightest_gi].load) + lightest_gi = gi; + } + + int src_gi = -1; + if (obj->oldPE >= 0) { + auto srcIt = peToGrpIdx.find(obj->oldPE); + if (srcIt != peToGrpIdx.end()) src_gi = srcIt->second; + } + + // Refinement: if object's current GPU group is close enough, keep it there + int chosen_gi = lightest_gi; + if (src_gi >= 0) { + GPUGrp &curGrp = gpuGroups[src_gi]; + if ((curGrp.load <= (gpuGroups[lightest_gi].load + 0.01) * B) && (curGrp.load + obj_load <= M)) + chosen_gi = src_gi; + } + + // Pool buffer constraint + if (chosen_gi != src_gi && src_gi >= 0 && 
obj->gpuPupSize > 0) { + if (gpuGroups[src_gi].pool_buff_mem_remaining < obj->gpuPupSize || gpuGroups[chosen_gi].pool_buff_mem_remaining < obj->gpuPupSize) + chosen_gi = src_gi; + + if((size_t)(0.95 * gpuGroups[chosen_gi].gpu_mem_remaining) < obj->gpuAllocSize )//95% of the rest of the memory can be filled + chosen_gi = src_gi; + } + + GPUGrp &g = gpuGroups[chosen_gi]; + + int bestPe = g.peIds[0]; + + //find the PE with the least walltime + for(int pe : g.peIds) { + if(procs[pe].load < procs[bestPe].load) { + bestPe = pe; + } + } + + if(obj->oldPE >= 0 && peToGrpIdx[obj->oldPE] == chosen_gi) + bestPe = obj->oldPE; + + GreedyRefineCentralLB::GProc *p = &procs[bestPe]; + double scaled = obj->load / p->speed; + + // Update PE load + procHeap.remove(p); + p->load += scaled; + procHeap.push(p); + + // Update GPU group aggregate + g.load += scaled; + + if (chosen_gi != src_gi && src_gi >= 0 && obj->gpuPupSize > 0) + { + gpuGroups[src_gi].pool_buff_mem_remaining -= obj->gpuPupSize; + gpuGroups[chosen_gi].pool_buff_mem_remaining -= obj->gpuPupSize; + gpuGroups[chosen_gi].gpu_mem_remaining-= obj->gpuAllocSize; + } + + // Track max GPU-group load; expand M if exceeded + if (g.load > maxLoad) { + maxLoad = g.load; + if (maxLoad > M) M = maxLoad; + } + + // Record migration if PE changed + if (bestPe != obj->oldPE) { + nmoves++; + stats->to_proc[obj->id] = bestPe; + // if (_lb_args.debug() > 2) + // CkPrintf("[%d] Migrating obj %d: PE %d -> PE %d (GPU %d, objLoad=%.6f, gpuGrpLoad=%.6f)\n", + // CkMyPe(), obj->id, obj->oldPE, bestPe, g.gpu_id, obj_load, g.load); + } + } + + // Print per-GPU-group loads after LB + CkPrintf("[%d] --- Per-GPU-group loads after LB ---\n", CkMyPe()); + for (int gi = 0; gi < nGroups; gi++) + CkPrintf("[%d] GPU %llu: aggregate load=%.6f\n", + CkMyPe(), gpuGroups[gi].gpu_id, gpuGroups[gi].load); + +#else + // ---- Original PE-level greedy refine (non-GPU path) ---- + + double M = 0; + if (B > 0) { + M = greedyLB(pobjs, procHeap, stats); + greedyMaxLoad 
= M; + procHeap.addProcessors(procs, (maxLoad <= 0.001), false); + } + + M *= A; + // if ((_lb_args.debug() > 1) && (CkMyPe() == cur_ld_balancer)) { + // CkPrintf("maxLoad=%f totalObjLoad=%f M=%f A=%f B=%f\n", maxLoad, totalObjLoad, M, A, B); + // } + for (int i=0; i < pobjs.size(); i++) { + const GreedyRefineCentralLB::GObj *obj = pobjs[i]; + GreedyRefineCentralLB::GProc *llp = procHeap.top(); + GreedyRefineCentralLB::GProc *prevPe = NULL; + if (obj->oldPE >= 0) prevPe = &(procs[obj->oldPE]); + + GreedyRefineCentralLB::GProc *p = llp; + if (prevPe && (prevPe->load <= (llp->load+0.01)*B) && (prevPe->load + obj->load <= M) && (prevPe->available)) + p = prevPe; + + procHeap.remove(p); + p->load += (obj->load / p->speed); + procHeap.push(p); + + // if (p->id != obj->oldPE) { + // nmoves++; + // stats->to_proc[obj->id] = p->id; + // if (_lb_args.debug() > 1) { + // CkPrintf("[%d] Migrating obj %d: PE %d -> PE %d (objLoad=%.6f, destPELoad=%.6f)\n", + // CkMyPe(), obj->id, obj->oldPE, p->id, obj->load, p->load); + // } + // } + if (p->load > maxLoad) { + maxLoad = p->load; + if (maxLoad > M) M = maxLoad; + } + } +#endif + // ---------------------------------------------- + // if (_lb_args.debug() > 1 && (!concurrent || (CkMyPe() == cur_ld_balancer))) { + // CkPrintf("[%d] --- Per-PE loads after LB ---\n", CkMyPe()); + // for (int pe=0; pe < n_pes; pe++) { + // GreedyRefineCentralLB::GProc &p = procs[pe]; + // if (p.available) + // CkPrintf("[%d] PE %d: totalLoad=%.6f bgLoad=%.6f\n", + // CkMyPe(), pe, p.load, p.bgload); + // } + // CkPrintf("[%d] After LB: max_load=%.6f, migrations=%d/%d (%.2f%%)\n", + // CkMyPe(), maxLoad, nmoves, (int)pobjs.size(), + // 100.0 * nmoves / double(pobjs.size())); + // } + + if (concurrent) { + + sendSolution(maxLoad, nmoves); + +#if __DEBUG_GREEDY_REFINE_ + CkCallback cb(CkReductionTarget(GreedyRefineCentralLB, receiveTotalTime), thisProxy[cur_ld_balancer]); + contribute(sizeof(double), &strategyStartTime, CkReduction::sum_double, cb); 
+#endif + } else if (_lb_args.debug() > 0) { + double greedyRatio = 1.0; + if (greedyMaxLoad > 0) greedyRatio = maxLoad / greedyMaxLoad; + double migrationRatio = nmoves/double(pobjs.size()); + // if ((greedyRatio > 1.03) && (migrationRatio < migrationTolerance)) { + // CkPrintf("[%d] GreedyRefine: WARNING - migration ratio is %.3f (within user-specified tolerance).\n" + // "but maxload after lb is %f higher than greedy. Consider testing with A=0, B=-1\n", + // CkMyPe(), migrationRatio, greedyRatio); + // } + // CkPrintf("[%d] GreedyRefineCentralLB: after lb, max_load=%.3f, migrations=%d(%.2f%%), ratioToGreedy=%.3f\n", + // CkMyPe(), maxLoad, nmoves, 100.0*migrationRatio, greedyRatio); + } +} + +void GreedyRefineCentralLB::receiveTotalTime(double time) +{ + CkPrintf("Avg start time of GreedyRefineCentralLB strategy is %f\n", time / CkNumPes()); +} + +// decide which solution among all PEs is best and apply it +void GreedyRefineCentralLB::receiveSolutions(CkReductionMsg *msg) +{ + std::vector results(NUM_SOLUTIONS); + + int migrationsAllowed = totalObjs * migrationTolerance; + ckout<<"migrations allowed "<getData(); // Get the first element in the set + int numSolutions = 0; + for ( ; current && (numSolutions < NUM_SOLUTIONS); current = current->next()) { + PUP::fromMem pd(¤t->data); + pd|results[numSolutions]; // store result + if (results[numSolutions].migrations >= 0) { // valid result + const GreedyRefineCentralLB::Solution &r = results[numSolutions++]; + if ((r.migrations <= migrationsAllowed) && (r.max_load < lowest_max_load_f)) { + lowest_max_load_f = r.max_load; + feasibleSolutions = true; + } + + if ((r.migrations < lowestMigrations) || + ((r.migrations == lowestMigrations) && (r.max_load < bestSol->max_load))) { + lowestMigrations = r.migrations; + bestSol = &r; + } + + if (r.max_load < lowest_max_load) lowest_max_load = r.max_load; + if (r.max_load > highest_max_load) highest_max_load = r.max_load; + } + } + results.resize(numSolutions); // for cases 
where CkNumPes() < NUM_SOLUTIONS + CkAssert(numSolutions > 0); + + if (feasibleSolutions) { + // second pass, get solution with low max load and migrations from feasible set + int bestMigrations = INT_MAX; // num migrations of best solution + for (int i=0; i < results.size(); i++) { + const GreedyRefineCentralLB::Solution &r = results[i]; + // Select if we find (fewer migrations and load within tolerance) or + // (same as lowest migration and better load). Since we know a feasible + // solution exists and we only minimize here, we guarantee that we'll end + // with a feasible solution. + if ((r.migrations < bestMigrations && r.max_load <= lowest_max_load_f*LOAD_MIG_BAL) || + (r.migrations == bestMigrations && r.max_load < bestSol->max_load)) { + bestMigrations = r.migrations; + bestSol = &r; + } + } + } + // else: can't satisfy user migration constraint (for this lb step), + // so just use solution with lowest num migrations + + if (_lb_args.debug() > 1) { + CkPrintf("GreedyRefineCentralLB: Lowest max_load is %f, worst max_load is %f, lowest migrations=%d\n", + lowest_max_load, highest_max_load, lowestMigrations); + + CkPrintf("GreedyRefineCentralLB: Got %d solutions at %f\nBest one is from PE %d with max_load=%f, migrations=%d\n", + numSolutions, CkWallTimer(), bestSol->pe, bestSol->max_load, bestSol->migrations); + float A, B; + getGreedyRefineParams(bestSol->pe, A, B); + CkPrintf("Best PE used params A=%f B=%f\n", A, B); + } + + // notify PE that produced the best solution + thisProxy[bestSol->pe].ApplyDecision(); +} + +#include "GreedyRefineCentralLB.def.h" + +/*@}*/ \ No newline at end of file diff --git a/src/ck-ldb/GreedyRefineCentralLB.ci b/src/ck-ldb/GreedyRefineCentralLB.ci new file mode 100644 index 0000000000..999de1b0f8 --- /dev/null +++ b/src/ck-ldb/GreedyRefineCentralLB.ci @@ -0,0 +1,10 @@ +module GreedyRefineCentralLB { + + extern module CentralLB; + initnode void lbinit(void); + group [migratable] GreedyRefineCentralLB : CentralLB { + entry void 
GreedyRefineCentralLB(const CkLBOptions &); + entry void receiveSolutions(CkReductionMsg *msg); + entry [reductiontarget] void receiveTotalTime(double time); + }; +}; \ No newline at end of file diff --git a/src/ck-ldb/GreedyRefineCentralLB.h b/src/ck-ldb/GreedyRefineCentralLB.h new file mode 100644 index 0000000000..a37b9dea7e --- /dev/null +++ b/src/ck-ldb/GreedyRefineCentralLB.h @@ -0,0 +1,102 @@ +/** + * \addtogroup CkLdb +*/ +/*@{*/ + +/** + * Author: jjgalvez@illinois.edu (Juan Galvez) + * Greedy algorithm to minimize cpu max_load and object migrations. + * Can find solution equal or close to regular Greedy with less (sometimes much less) migrations. + * The amount of migrations that the user can tolerate is passed via the command-line + * option +LBPercentMoves (as percentage of chares that can be moved). + * + * If LBPercentMoves is not passed, strategy assumes it can move all objects. + * In this case, the algorithm will give preference to minimizing cpu max_load. + * It will still move less than greedy, but the amount of migrations + * will depend very much on the particular case (object load distribution and processor background loads), + * + * supports processor avail bitvector + * supports nonmigratable attrib + * +*/ + +#ifndef _GREEDY_REFINE_LB_H_ +#define _GREEDY_REFINE_LB_H_ + +#include "CentralLB.h" +#include "GreedyRefineCentralLB.decl.h" + +void CreateGreedyRefineCentralLB(); +BaseLB *AllocateGreedyRefineCentralLB(); + +#define __DEBUG_GREEDY_REFINE_ 0 + +class GreedyRefineCentralLB : public CBase_GreedyRefineCentralLB { +public: + GreedyRefineCentralLB(const CkLBOptions &); + GreedyRefineCentralLB(CkMigrateMessage *m); + void work(LDStats* stats); + void receiveSolutions(CkReductionMsg *msg); + void receiveTotalTime(double time); + void setMigrationTolerance(float tol) { migrationTolerance = tol; } + +private: + bool QueryBalanceNow(int step) { return true; } + + class GProc { + public: + GProc() : available(true), load(0) {} + int id; + bool 
available; + int pos; // position in min heap + double load; + double bgload; // background load + #if (CMK_CUDA || CMK_HIP) + double bg_walltime; + #endif + float speed; + }; + + class GObj { + public: + int id; + double load; + int oldPE; + size_t gpuPupSize; + size_t gpuAllocSize; + }; + + class ObjLoadGreater { + public: + inline bool operator() (const GObj *o1, const GObj *o2) const { + return (o1->load > o2->load); + } + }; + + class PHeap; + class Solution; + + double fillData(LDStats *stats, + std::vector &objs, + std::vector &pobjs, + std::vector &procs, + PHeap &procHeap); + + double greedyLB(const std::vector &pobjs, PHeap &procHeap, const BaseLB::LDStats *stats) const; + void sendSolution(double maxLoad, int migrations); + + double strategyStartTime; + double totalObjLoad; + int availablePes; + float migrationTolerance; + int totalObjs; + +#if __DEBUG_GREEDY_REFINE_ + void dumpObjLoads(std::vector &objs); + void dumpProcLoads(std::vector &procs); +#endif +}; + +#endif + +/*@}*/ \ No newline at end of file diff --git a/src/ck-ldb/LBDatabase.C b/src/ck-ldb/LBDatabase.C index 8c6c1b1d4c..67acda9be1 100644 --- a/src/ck-ldb/LBDatabase.C +++ b/src/ck-ldb/LBDatabase.C @@ -6,6 +6,7 @@ LBDatabase::LBDatabase() { omCount = omsRegistering = 0; obj_walltime = 0; + obj_gputime = 0; statsAreOn = false; objsEmptyHead = -1; commTable = new LBCommTable; @@ -243,6 +244,12 @@ void LBDatabase::GetTime(LBRealType *total_walltime, LBRealType *total_cputime, //CkPrintf("HERE [%d] total: %f %f obj: %f %f idle: %f bg: %f\n", CkMyPe(), *total_walltime, *total_cputime, obj_walltime, obj_cputime, *idletime, *bg_walltime); } +void LBDatabase::GetGPUBGTime(LBRealType *bg_gputime) +{ + // TODO: implement this properly + *bg_gputime = 0; +} + void LBDatabase::ClearLoads(void) { int i; @@ -256,6 +263,9 @@ void LBDatabase::ClearLoads(void) obj->lastCpuTime = obj->data.cpuTime; #endif } +#if CMK_CUDA || CMK_HIP + obj->data.gpuTime = 0.0; +#endif obj->data.wallTime = 0.0; #if 
CMK_LB_CPUTIMER obj->data.cpuTime = 0.0; @@ -266,6 +276,7 @@ void LBDatabase::ClearLoads(void) commTable = new LBCommTable; machineUtil.Clear(); obj_walltime = 0; + obj_gputime = 0; #if CMK_LB_CPUTIMER obj_cputime = 0; #endif @@ -328,3 +339,17 @@ void LBDatabase::EstObjLoad(const LDObjHandle &_h, double cputime) obj->setTiming(cputime); #endif } + +void LBDatabase::EstObjGPULoad(const LDObjHandle &_h, double gputime) +{ +#if CMK_CUDA || CMK_HIP +#if CMK_LBDB_ON + LBObj *const obj = LbObj(_h); + + CmiAssert(obj != NULL); + obj->data.gpuTime = gputime; +#endif +#else + CmiAbort("LBDatabase::EstObjGPULoad called but CMK_CUDA is not set"); +#endif +} diff --git a/src/ck-ldb/LBDatabase.h b/src/ck-ldb/LBDatabase.h index b344d7c29f..ce7c37a469 100644 --- a/src/ck-ldb/LBDatabase.h +++ b/src/ck-ldb/LBDatabase.h @@ -3,12 +3,14 @@ #include "lbdb.h" +#include "objid.h" #include "LBObj.h" #include "LBOM.h" #include "LBComm.h" #include "LBMachineUtil.h" #include +#include class CkSyncBarrier; @@ -32,6 +34,7 @@ friend class LBManager; LBCommTable* commTable; bool statsAreOn; double obj_walltime; + double obj_gputime; LBMachineUtil machineUtil; CkSyncBarrier* syncBarrier; @@ -48,6 +51,13 @@ friend class LBManager; #endif } } + + inline void MeasuredObjGPUTime(double gputime) { + if (statsAreOn) { + obj_gputime += gputime; + } + } + inline LBOM* LbOM(LDOMHandle h) { return oms[h.handle]; } @@ -67,6 +77,37 @@ friend class LBManager; LbObj(h)->getTime(&walltime, &cputime); }; + inline void GetObjGPULoad(LDObjHandle &h, LBRealType &gputime) { + LbObj(h)->getGPUTime(&gputime); + }; + + inline void SetObjGPULoad(std::unordered_map &id_gputimeMap) + { + int matched = 0; + int liveObjs = 0; + for (int i = 0; i < objs.size(); i++) { + if(objs[i].obj == nullptr) + continue; + liveObjs++; + // The CUPTI map is keyed by raw element IDs (from CkMigratable::ckGetID()). + // The LB database stores IDs with collection bits prepended (when + // CMK_GLOBAL_LOCATION_UPDATE is set). 
Strip collection bits to match. + CmiUInt8 lb_id = objs[i].obj->ObjData().objID(); + CmiUInt8 raw_id = ck::ObjID(lb_id).getElementID(); + auto it = id_gputimeMap.find(raw_id); + if(it==id_gputimeMap.end()) { + // CkPrintf("[PE %d] SetObjGPULoad: obj %d lb_id=%lu raw_id=%lu NO MATCH\n", CmiMyPe(), i, (unsigned long)lb_id, (unsigned long)raw_id); + continue; + } + + matched++; + // CkPrintf("[PE %d] SetObjGPULoad: obj %d id=%lu -> gpuTime=%.6f s\n", + // CmiMyPe(), i, (unsigned long)it->first, it->second / 1.0e9); + objs[i].obj->setGPUTiming(it->second / 1.0e9); + } + // CkPrintf("[PE %d] SetObjGPULoad: %d/%d live objects matched from %zu CUPTI entries (objs.size=%zu)\n", + // CmiMyPe(), matched, liveObjs, id_gputimeMap.size(), objs.size()); + } inline void* GetObjUserData(LDObjHandle &h) { return LbObj(h)->getLocalUserData(); } @@ -89,6 +130,7 @@ friend class LBManager; inline void NonMigratable(LDObjHandle h) { LbObj(h)->SetMigratable(false); }; inline void Migratable(LDObjHandle h) { LbObj(h)->SetMigratable(true); }; inline void setPupSize(LDObjHandle h, size_t pup_size) { LbObj(h)->setPupSize(pup_size);}; + inline void setGPUPupSize(LDObjHandle h, size_t gpu_pup_size) { LbObj(h)->setGPUPupSize(gpu_pup_size);}; inline void UseAsyncMigrate(LDObjHandle h, bool flag) { LbObj(h)->UseAsyncMigrate(flag); }; inline int GetCommDataSz(void) { if (commTable) @@ -121,12 +163,14 @@ friend class LBManager; int migratable); void UnregisterObj(LDObjHandle h); void EstObjLoad(const LDObjHandle &h, double cpuload); + void EstObjGPULoad(const LDObjHandle &h, double cpuload); void BackgroundLoad(LBRealType *walltime, LBRealType *cputime); void Send(const LDOMHandle &destOM, const CmiUInt8 &destID, unsigned int bytes, int destObjProc, int force = 0); void MulticastSend(const LDOMHandle &_om, CmiUInt8 *_ids, int _n, unsigned int _b, int _nMsgs=1); void GetTime(LBRealType *total_walltime, LBRealType *total_cputime, LBRealType *idletime, LBRealType *bg_walltime, LBRealType *bg_cputime); 
+ void GetGPUBGTime(LBRealType *bg_gputime); const std::vector& getObjs() {return objs;} inline void ObjectStart(const LDObjHandle &h) { @@ -143,6 +187,10 @@ friend class LBManager; obj->StopTimer(&walltime, &cputime); obj->IncrementTime(walltime, cputime); MeasuredObjTime(walltime, cputime); + + #if CMK_CUDA || CMK_HIP + MeasuredObjGPUTime(obj->data.gpuTime); + #endif } }; inline const LDObjHandle &GetObjHandle(int idx) { diff --git a/src/ck-ldb/LBManager.C b/src/ck-ldb/LBManager.C index 2c6798873f..fbfc38e03b 100644 --- a/src/ck-ldb/LBManager.C +++ b/src/ck-ldb/LBManager.C @@ -8,6 +8,8 @@ #include #include "cksyncbarrier.h" +#include "hapi_portable.h" + #include "DistributedLB.h" #include "LBManager.h" #include "LBSimulation.h" @@ -83,6 +85,7 @@ class LBDBRegistry { lbtables.emplace_back(name, fn, afn, help, shown); } + bool hasBalancers() const { return !runtime_lbs.empty() || !compile_lbs.empty(); } void addCompiletimeBalancer(const char* name) { compile_lbs.push_back(name); } void addRuntimeBalancer(const char* name, const char* legacyLBName = nullptr) { @@ -125,6 +128,11 @@ void LBRegisterBalancer(std::string name, LBCreateFn fn, LBAllocFn afn, std::str LBAllocFn getLBAllocFn(const char* lbname) { return lbRegistry.getLBAllocFn(lbname); } +bool LBHasBalancersRegistered() +{ + return lbRegistry.hasBalancers(); +} + // create a load balancer group using the strategy name static void createLoadBalancer(const std::string& lbname, const char* legacybalancer = nullptr) { @@ -212,6 +220,8 @@ void _loadbalancerInit() lbNames.push_back("Refine"); lbNames.push_back("Hybrid"); lbNames.push_back("MetisLB"); + lbNames.push_back("GreedyCentralLB"); + lbNames.push_back("GreedyRefineCentralLB"); if (CkMyPe() == 0) { if (CmiGetArgStringDesc(argv, "+balancer", &balancer, "Use this load balancer")) @@ -316,6 +326,8 @@ void _loadbalancerInit() CmiGetArgIntDesc(argv, "+LBVersion", &_lb_args.lbversion(), "LB database file version number"); CmiGetArgIntDesc(argv, "+LBCentPE", 
&_lb_args.central_pe(), "CentralLB processor"); + CmiGetArgIntDesc(argv, "+LBPercentMovesAllowed", &_lb_args.percentMovesAllowed(), + "For GreedyRefineCentralLB, the percentage of chares that can be moved"); bool _lb_dump_activated = false; if (CmiGetArgIntDesc(argv, "+LBDump", &LBSimulation::dumpStep, "Dump the LB state from this step")) @@ -533,10 +545,12 @@ void LBManager::init(void) { mystep = 0; new_ld_balancer = 0; + lb_in_progress = false; chare_count = 0; metabalancer = nullptr; lbdb_obj = new LBDatabase(); currentLBIndex = 0; + reallocBuffer = nullptr; #if CMK_LB_CPUTIMER obj_cputime = 0; #endif @@ -568,6 +582,7 @@ int LBManager::AddStartLBFn(std::function fn) callbk->fn = fn; callbk->on = true; + CkPrintf("Registering StartLB function %p\n", (void*)callbk); startLBFnList.push_back(callbk); startLBFn_count++; return startLBFnList.size() - 1; @@ -586,6 +601,7 @@ void LBManager::RemoveStartLBFn(int handle) void LBManager::StartLB() { + CkPrintf("Start LB called, count %d\n", startLBFn_count); if (startLBFn_count == 0) { CmiAbort("StartLB is not supported in this LB"); @@ -593,7 +609,12 @@ void LBManager::StartLB() for (int i = 0; i < startLBFnList.size(); i++) { StartLBCB* startLBFn = startLBFnList[i]; - if (startLBFn && startLBFn->on) startLBFn->fn(); + CkPrintf("StartLB checking function %d: %p, %d\n", i, (void*)startLBFn, startLBFn->on); + if (startLBFn && startLBFn->on) + { + CkPrintf("Invoking StartLB function %p\n", (void*)&startLBFn->fn); + startLBFn->fn(); + } } } @@ -751,7 +772,10 @@ void LBManager::nextLoadbalancer(int seq) // switch strategy void LBManager::switchLoadbalancer(int switchFrom, int switchTo) { - if (lbNames[switchTo] != "DistributedLB" && lbNames[switchTo] != "MetisLB") + if (lbNames[switchTo] != "DistributedLB" && + lbNames[switchTo] != "MetisLB" && + lbNames[switchTo] != "GreedyCentralLB" && + lbNames[switchTo] != "GreedyRefineCentralLB") { json config; if (lbNames[switchTo] == "Hybrid") @@ -806,8 +830,9 @@ void 
LBManager::pup(PUP::er& p) avail_vector_set = true; p | avail_vector; // If we're restarting with more PEs, make the new ones available - if (avail_vector.size() < CkNumPes()) - avail_vector.resize(CkNumPes(), 1); + //if (avail_vector.size() < CkNumPes()) + //avail_vector.resize(CkNumPes(), 1); + avail_vector = std::vector(CkNumPes(), 1); } else { @@ -823,6 +848,7 @@ void LBManager::pup(PUP::er& p) p | mystep; if (p.isUnpacking()) { + reallocBuffer = nullptr; if (_lb_args.metaLbOn()) { // if unpacking set metabalancer using the id @@ -1035,6 +1061,9 @@ int LDProcessorSpeed() wps = (int)((double)wps * correction + 0.5); } + if (_lb_args.debug() > 1) + CmiPrintf("LB> PE %d speed is %d\n", CkMyPe(), wps); + return wps; } @@ -1048,6 +1077,66 @@ int LBManager::ProcessorSpeed() return peSpeed; } +int LBManager::ProcessorGPUSpeed() +{ +#if CMK_hapi || CMK_HIP + static int gpuSpeed = -1; // Cache the result + + if (gpuSpeed != -1) { + return gpuSpeed; + } + + // Check if GPU is available + int deviceCount = 0; + if (hapiGetDeviceCount(&deviceCount) != hapiSuccess || deviceCount == 0) { + CmiAbort("LB> PE %d: No GPU available, GPU speed = 0\n", CkMyPe()); + } + + // Get device for this PE (round-robin assignment) + int deviceId = CkMyPe() % deviceCount; + if (hapiSetDevice(deviceId) != hapiSuccess) { + CmiAbort("LB> PE %d: Failed to set GPU device %d, GPU speed = 0\n", CkMyPe(), deviceId); + } + + // Get device properties + hapiDeviceProp prop; + if (hapiGetDeviceProperties(&prop, deviceId) != hapiSuccess) { + CmiAbort("LB> PE %d: Failed to get GPU device properties, GPU speed = 0\n", CkMyPe()); + } + + int clockRate = 0; + if (hapiDeviceGetAttribute(&clockRate, hapiDevAttrClockRate, deviceId) != hapiSuccess) { + CmiAbort("LB> PE %d: Failed to get GPU clock rate, GPU speed = 0\n", CkMyPe()); + } + + // Calculate theoretical peak single-precision FLOPS + // Formula: multiProcessorCount * maxThreadsPerMultiProcessor * clockRate(KHz) * 2(FMA) + // Convert to GFLOPS and then 
scale to integer for comparison with CPU speed + long long peakFLOPS = (long long)prop.multiProcessorCount * + prop.maxThreadsPerMultiProcessor * + clockRate * 2LL; // 2 for FMA (multiply-add) + + // Convert from KHz*ops to GFLOPS, then scale to reasonable integer range + double gflops = peakFLOPS / 1e6; // KHz to GHz conversion for GFLOPS + + // Scale to integer range similar to CPU ProcessorSpeed (typically 1-10000) + // Use a scaling factor to make GPU speeds comparable to CPU speeds + gpuSpeed = (int)(gflops / 100.0); // Scale down GFLOPS to reasonable range + + if (gpuSpeed < 1) gpuSpeed = 1; // Minimum speed + + if (_lb_args.debug() > 1) { + CmiPrintf("LB> PE %d GPU %s: %d SMs, %d threads/SM, %d MHz, %.1f GFLOPS -> speed %d\n", + CkMyPe(), prop.name, prop.multiProcessorCount, + prop.maxThreadsPerMultiProcessor, clockRate/1000, gflops, gpuSpeed); + } + + return gpuSpeed; +#else + CmiAbort("LB> PE %d: ProcessorGPUSpeed() GPU support not enabled in this build\n", CkMyPe()); +#endif +} + /* callable from user's code */ diff --git a/src/ck-ldb/LBManager.ci b/src/ck-ldb/LBManager.ci index e424b4cd67..5556777ad8 100644 --- a/src/ck-ldb/LBManager.ci +++ b/src/ck-ldb/LBManager.ci @@ -10,6 +10,7 @@ module LBManager { group [migratable] LBManager { entry void LBManager(void); + entry void StartLB(); entry void ResumeClients(); initnode void initnodeFn(); }; diff --git a/src/ck-ldb/LBManager.h b/src/ck-ldb/LBManager.h index 314513422e..af2999088e 100644 --- a/src/ck-ldb/LBManager.h +++ b/src/ck-ldb/LBManager.h @@ -7,6 +7,7 @@ #define LBMANAGER_H #include +#include #include "LBDatabase.h" #include "json_fwd.hpp" @@ -44,6 +45,7 @@ class CkLBArgs bool _lb_metaLbOn; char* _lb_metaLbModelDir; char* _lb_treeLBFile = (char*)"treelb.json"; + int _lb_percentMovesAllowed; // for GreedyRefineCentralLB, as percentage of chares that can be moved public: CkLBArgs() @@ -60,6 +62,7 @@ class CkLBArgs _lb_targetRatio = 1.05; _lb_metaLbOn = false; _lb_metaLbModelDir = nullptr; + 
_lb_percentMovesAllowed = 100; } inline char*& treeLBFile() { return _lb_treeLBFile; } inline double& lbperiod() { return _autoLbPeriod; } @@ -82,6 +85,7 @@ class CkLBArgs inline double& targetRatio() { return _lb_targetRatio; } inline bool& metaLbOn() { return _lb_metaLbOn; } inline char*& metaLbModelDir() { return _lb_metaLbModelDir; } + inline int& percentMovesAllowed() { return _lb_percentMovesAllowed; } }; extern CkLBArgs _lb_args; @@ -96,6 +100,8 @@ extern bool _lb_psizer_on; #define PredictorPrintf \ if (PREDICT_DEBUG) CmiPrintf +extern void realloc(char*); + // used in constructor of all load balancers class CkLBOptions { @@ -134,6 +140,7 @@ typedef BaseLB* (*LBAllocFn)(); void LBDefaultCreate(LBCreateFn f); void LBRegisterBalancer(std::string, LBCreateFn, LBAllocFn, std::string, bool shown = true); +bool LBHasBalancersRegistered(); template void LBRegisterBalancer(std::string name, std::string description, bool shown = true) @@ -239,8 +246,12 @@ class LBManager : public CBase_LBManager int startLBFn_count; + char* reallocBuffer; + public: int chare_count; + bool lb_in_progress; + LBManager(void) { init(); } LBManager(CkMigrateMessage* m) : CBase_LBManager(m) { init(); } @@ -264,6 +275,22 @@ class LBManager : public CBase_LBManager void configureTreeLB(const char* json_str); void configureTreeLB(json& config); + void bufferRealloc(char* bitmap) + { + int size = CkNumPes() + 2 * sizeof(int); + reallocBuffer = (char*)malloc(size); + memcpy(reallocBuffer, bitmap, size); + } + + void callRealloc() + { + if (reallocBuffer != nullptr) + { + realloc(reallocBuffer); + reallocBuffer = nullptr; + } + } + /* * Calls from object managers to load database */ @@ -288,6 +315,7 @@ class LBManager : public CBase_LBManager void NonMigratable(LDObjHandle h) { lbdb_obj->NonMigratable(h); } void Migratable(LDObjHandle h) { lbdb_obj->Migratable(h); } void setPupSize(LDObjHandle h, size_t pup_size) { lbdb_obj->setPupSize(h, pup_size); } + void setGPUPupSize(LDObjHandle h, size_t 
gpu_pup_size) { lbdb_obj->setGPUPupSize(h, gpu_pup_size); } void UseAsyncMigrate(LDObjHandle h, bool flag) { lbdb_obj->UseAsyncMigrate(h, flag); }; int GetObjDataSz(void) { return lbdb_obj->GetObjDataSz(); } int GetCommDataSz(void) { return lbdb_obj->GetCommDataSz(); } @@ -310,6 +338,14 @@ class LBManager : public CBase_LBManager { lbdb_obj->GetObjLoad(h, walltime, cputime); }; + void GetObjGPULoad(LDObjHandle& h, LBRealType& gputime) + { + lbdb_obj->GetObjGPULoad(h, gputime); + }; + void SetObjGPULoad(std::unordered_map &id_gputimeMap) + { + lbdb_obj->SetObjGPULoad(id_gputimeMap); + } void* GetObjUserData(LDObjHandle& h) { return lbdb_obj->GetObjUserData(h); } void MetaLBCallLBOnChares() { lbdb_obj->MetaLBCallLBOnChares(); } void MetaLBResumeWaitingChares(int lb_period) @@ -329,6 +365,10 @@ class LBManager : public CBase_LBManager { lbdb_obj->GetTime(total_walltime, total_cputime, idletime, bg_walltime, bg_cputime); } + void GetGPUBGTime(LBRealType* bg_gputime) + { + lbdb_obj->GetGPUBGTime(bg_gputime); + } LDObjHandle RegisterObj(LDOMHandle omh, CmiUInt8 id, void* userPtr, int migratable) { return lbdb_obj->RegisterObj(omh, id, userPtr, migratable); @@ -338,6 +378,10 @@ class LBManager : public CBase_LBManager { lbdb_obj->EstObjLoad(h, cpuload); } + void EstObjGPULoad(const LDObjHandle& h, double gputime) + { + lbdb_obj->EstObjGPULoad(h, gputime); + } void BackgroundLoad(LBRealType* walltime, LBRealType* cputime) { lbdb_obj->BackgroundLoad(walltime, cputime); @@ -476,6 +520,7 @@ class LBManager : public CBase_LBManager void LocalBarrierOff(void); void ResumeClients(); static int ProcessorSpeed(); + static int ProcessorGPUSpeed(); static void SetLBPeriod(double period) { _lb_args.lbperiod() = period; diff --git a/src/ck-ldb/LBObj.C b/src/ck-ldb/LBObj.C index 36ca87d85f..df04e87d62 100644 --- a/src/ck-ldb/LBObj.C +++ b/src/ck-ldb/LBObj.C @@ -28,6 +28,17 @@ void LBObj::Clear(void) data.minWall = 1e6; data.maxWall = 0.; #endif + +#if CMK_CUDA || CMK_HIP + data.gpuTime 
= 0.; +#endif + + startWTime = -1.0; + lastWallTime = .0; +#if CMK_LB_CPUTIMER + startCTime = -1.0; + lastCpuTime = .0; +#endif } void LBObj::IncrementTime(LBRealType walltime, LBRealType cputime) @@ -42,6 +53,13 @@ void LBObj::IncrementTime(LBRealType walltime, LBRealType cputime) #endif } +void LBObj::IncrementGPUTime(LBRealType walltime) +{ +#if CMK_CUDA || CMK_HIP + data.gpuTime += walltime; +#else + CmiAbort("LBObj::IncrementGPUTime called but CMK_CUDA is not set"); +#endif +} #endif - /*@}*/ diff --git a/src/ck-ldb/LBObj.h b/src/ck-ldb/LBObj.h index be61c68b02..f944b3a19d 100644 --- a/src/ck-ldb/LBObj.h +++ b/src/ck-ldb/LBObj.h @@ -20,19 +20,7 @@ friend class LBDatabase; data.migratable = _migratable; data.asyncArrival = _asyncArrival; Clear(); -// data.cpuTime = 0.; -// data.wallTime = 0.; -// data.minWall = 1e6; -// data.maxWall = 0.; localUserData = usr_ptr; -// migratable = _migratable; -// registered = true; - startWTime = -1.0; - lastWallTime = .0; -#if CMK_LB_CPUTIMER - startCTime = -1.0; - lastCpuTime = .0; -#endif } ~LBObj() { }; @@ -40,26 +28,29 @@ friend class LBDatabase; void Clear(void); void IncrementTime(LBRealType walltime, LBRealType cputime); + void IncrementGPUTime(LBRealType walltime); + inline void StartTimer(void) { - startWTime = CkWallTimer(); + startWTime = CkWallTimer(); #if CMK_LB_CPUTIMER - startCTime = CkCpuTimer(); + startCTime = CkCpuTimer(); #endif } + inline void StopTimer(LBRealType* walltime, LBRealType* cputime) { - if (startWTime >= 0.0) { // in case startOn in middle of entry - const double endWTime = CkWallTimer(); - *walltime = endWTime - startWTime; + if (startWTime >= 0.0) { // in case startOn in middle of entry + const double endWTime = CkWallTimer(); + *walltime = endWTime - startWTime; #if CMK_LB_CPUTIMER - const double endCTime = CkCpuTimer(); - *cputime = endCTime - startCTime; + const double endCTime = CkCpuTimer(); + *cputime = endCTime - startCTime; #else - *cputime = *walltime; + *cputime = *walltime; #endif 
- } - else { - *walltime = *cputime = 0.0; - } + } + else { + *walltime = *cputime = 0.0; + } } inline void getTime(LBRealType *w, LBRealType *c) { @@ -71,6 +62,14 @@ friend class LBDatabase; #endif } + inline void getGPUTime(LBRealType *w) { + #if CMK_CUDA || CMK_HIP + *w = data.gpuTime; + #else + CmiAbort("LBObj::getGPUTime called but CMK_CUDA is not set"); + #endif + } + inline void setTiming(LBRealType cputime) { data.wallTime = cputime; @@ -79,12 +78,25 @@ friend class LBDatabase; #endif } + inline void setGPUTiming(LBRealType gputime) + { + #if CMK_CUDA || CMK_HIP + data.gpuTime = gputime; + #else + CmiAbort("LBObj::setGPUTiming called but CMK_CUDA is not set"); + #endif + } + inline LDOMHandle &parentOM() { return data.handle.omhandle; } inline const LDObjHandle &GetLDObjHandle() const { return data.handle; } inline void SetMigratable(bool mig) { data.migratable = mig; } inline void setPupSize(size_t obj_pup_size) { data.pupSize = pup_encodeSize(obj_pup_size); } + inline void setGPUPupSize(size_t obj_gpu_pup_size){ + data.gpuPupSize = obj_gpu_pup_size; + } + inline void UseAsyncMigrate(bool async) { data.asyncArrival = async; } inline LDObjData &ObjData() { return data; }; inline void lastKnownLoad(LBRealType *w, LBRealType *c) { diff --git a/src/ck-ldb/Make.lb b/src/ck-ldb/Make.lb index 4b53c60d48..a34616b2bb 100644 --- a/src/ck-ldb/Make.lb +++ b/src/ck-ldb/Make.lb @@ -4,6 +4,8 @@ COMMON_LDBS=\ DistributedLB \ MetisLB \ RecBipartLB \ + GreedyCentralLB \ + GreedyRefineCentralLB \ manager.o ALL_LDBS=\ @@ -11,6 +13,8 @@ ALL_LDBS=\ DistributedLB \ MetisLB \ RecBipartLB \ + GreedyCentralLB \ + GreedyRefineCentralLB \ manager.o $(L)/libmoduleTreeLB.a: @@ -19,6 +23,12 @@ LBHEADERS += TreeLB.h TreeLB.decl.h $(L)/libmoduleDistributedLB.a: LBHEADERS += DistributedLB.h DistributedLB.decl.h +$(L)/libmoduleGreedyCentralLB.a: +LBHEADERS += GreedyCentralLB.h GreedyCentralLB.decl.h + +$(L)/libmoduleGreedyRefineCentralLB.a: +LBHEADERS += GreedyRefineCentralLB.h 
GreedyRefineCentralLB.decl.h + $(L)/libmoduleMetisLB.a: LBHEADERS += MetisLB.h MetisLB.decl.h @@ -37,6 +47,8 @@ ALL_LB_OBJS=EveryLB.o \ TreeLB.o \ DistributedLB.o \ MetisLB.o \ + GreedyCentralLB.o \ + GreedyRefineCentralLB.o \ RecBipartLB.o \ ScotchLB.o \ TempAwareRefineLB.o \ @@ -46,12 +58,16 @@ EVERYLB_DEPS=EveryLB.o \ DistributedLB.o \ MetisLB.o \ RecBipartLB.o \ + GreedyCentralLB.o \ + GreedyRefineCentralLB.o # CommonLBs dependencies COMMONLBS_DEPS=CommonLBs.o \ TreeLB.o \ DistributedLB.o \ MetisLB.o \ RecBipartLB.o \ + GreedyCentralLB.o \ + GreedyRefineCentralLB \ manager.o \ $(L)/libmoduleEveryLB.a: $(EVERYLB_DEPS) diff --git a/src/ck-ldb/Makefile_lb.sh b/src/ck-ldb/Makefile_lb.sh index 662052596d..96ee55bb30 100755 --- a/src/ck-ldb/Makefile_lb.sh +++ b/src/ck-ldb/Makefile_lb.sh @@ -1,7 +1,7 @@ #!/bin/bash #Typical load balancers -COMMON_LDBS="TreeLB DistributedLB MetisLB RecBipartLB" +COMMON_LDBS="TreeLB DistributedLB MetisLB GreedyCentralLB RecBipartLB" #Load balancers for more specialized circumstances SPECIALIZED_LDBS="" #Load balanders which have an external dependency, or require some other kind of intervention diff --git a/src/ck-ldb/MetisLB.C b/src/ck-ldb/MetisLB.C index e1fc0fabf1..8d1a6a0ee1 100644 --- a/src/ck-ldb/MetisLB.C +++ b/src/ck-ldb/MetisLB.C @@ -85,12 +85,19 @@ void MetisLB::work(LDStats* stats) std::vector adjwgt(numEdges); int edgeNum = 0; - const double ratio = 256.0 / maxLoad; + double ratio; + if (maxLoad == 0) + ratio = 0; + else + ratio = 256.0 / maxLoad; for (int i = 0; i < numVertices; i++) { xadj[i] = edgeNum; - vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio); + if (ogr->vertices[i].getVertexLoad() == 0 && ratio == 0) + vwgt[i] = 1; + else + vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio); for (const auto& outEdge : ogr->vertices[i].sendToList) { adjncy[edgeNum] = outEdge.getNeighborId(); @@ -151,9 +158,16 @@ void MetisLB::work(LDStats* stats) // tpwghts: target partition weight, can pass NULL to 
equally divide // ubvec: of size ncon to indicate allowed load imbalance tolerance (> 1.0) // options: array of options; edgecut: stores the edgecut; pemap: mapping - METIS_PartGraphRecursive(&numVertices, &ncon, xadj.data(), adjncy.data(), vwgt.data(), - vsize, adjwgt.data(), &numPes, tpwgts, ubvec.data(), - options.data(), &edgecut, pemap.data()); + CkPrintf("Metis partitioning in %i partitions\n", parr->availProcSize); + + if (parr->availProcSize > 1) + METIS_PartGraphRecursive(&numVertices, &ncon, xadj.data(), adjncy.data(), vwgt.data(), + vsize, adjwgt.data(), &parr->availProcSize, tpwgts, ubvec.data(), + options.data(), &edgecut, pemap.data()); + else + pemap.resize(numVertices, 0); + + parr->reassignPeMapToAvailable(pemap); if (_lb_args.debug() >= 1) { diff --git a/src/ck-ldb/TreeBuilder.h b/src/ck-ldb/TreeBuilder.h index ecdcb25333..0780e9c32b 100644 --- a/src/ck-ldb/TreeBuilder.h +++ b/src/ck-ldb/TreeBuilder.h @@ -133,24 +133,6 @@ class PE_Root_Tree : public LBTreeBuilder logic[1] = level; } - if (CkMyPe() == 0 && !quietModeRequested) - { - CkPrintf("[%d] TreeLB: Using PE_Root tree with: ", CkMyPe()); - for (const auto& strategy : strategies) - { - CkPrintf("%s ", strategy.c_str()); - } - CkPrintf("\n"); - - if (_lb_args.debug() > 0) - { - CkPrintf( - "\tUsing %d as root\n" - "\tTest PE Speed: %s\n", - rootPE, _lb_args.testPeSpeed() ? 
"true" : "false"); - } - } - return L; } }; diff --git a/src/ck-ldb/TreeLB.C b/src/ck-ldb/TreeLB.C index 9ca4ba8f5d..564ff9fea6 100644 --- a/src/ck-ldb/TreeLB.C +++ b/src/ck-ldb/TreeLB.C @@ -4,11 +4,21 @@ #include "TreeLB.h" #include "TreeStrategyFactory.h" #include "spanningTree.h" +#include "ck.h" #include // TODO delete if json file is read from LBManager #include #include "json.hpp" extern int quietModeRequested; +#if CMK_SHRINK_EXPAND +extern "C" void charmrun_realloc(char *s); +extern char willContinue; +extern realloc_state pending_realloc_state; +extern char * se_avail_vector; +extern char *_shrinkexpand_basedir; +extern int numProcessAfterRestart; +extern bool load_balancer_created; +#endif static void lbinit() { @@ -20,6 +30,9 @@ static void lbinit() } LBRegisterBalancer( "TreeLB", "Pluggable hierarchical LB with available strategies:" + o.str()); +#if CMK_SHRINK_EXPAND + load_balancer_created = true; +#endif } void TreeLB::Migrated(int waitBarrier) @@ -27,6 +40,25 @@ void TreeLB::Migrated(int waitBarrier) objMovedIn(waitBarrier); } +void TreeLB::StartLB(){ + CkPrintf("TreeLB::StartLB called on PE %d\n", CkMyPe()); + if (logic[1]) { + CkPrintf("size of stats_msgs = %d\n", logic[1]->stats_msgs.size()); + } + + bool rateAware = false; + LBStatsMsg_1* mm = (LBStatsMsg_1*)logic[1]->stats_msgs[0]; + if ((void*)mm->speeds != (void*)mm->obj_start) rateAware = true; + + // if (logic[1]->getNumNewPes() == 0 || !rateAware) { + // CkPrintf("TreeLB::StartLB: no new PEs detected, starting load balancing\n"); + // loadBalanceSubtree(numLevels - 1); + // } + // else + thisProxy.restartFromSE(rateAware); + +} + void TreeLB::loadConfigFile(const CkLBOptions& opts) { config.clear(); @@ -121,6 +153,55 @@ void TreeLB::init(const CkLBOptions& opts) #endif } +void TreeLB::collectSpeeds(int pe_id, float speed) { + if (_lb_args.debug() > 2) CkPrintf("[PE %d] TreeLB::collectSpeeds from PE %d speed=%f\n", CkMyPe(), pe_id, speed); + if (logic[1]->collectSpeeds(pe_id, speed)) + 
loadBalanceSubtree(numLevels - 1); + else + CkPrintf("[PE %d] TreeLB::collectSpeeds: still waiting for more speeds\n", CkMyPe()); +} + +void TreeLB::restartFromSE(bool rateAware) { + // TODO: need to collect and recompute bg load as well for the new pes + + if (CkMyPe() == 0 && rateAware) { + // if there was just 1 pe initially, the speed isn't set, so recompute it here + // TODO: ideally this should be rearranged so that the stats msgs are always set up correctly + LBStatsMsg_1* msg = (LBStatsMsg_1*)logic[1]->stats_msgs[0]; + for (int i = 0; i < msg->nPes; i++) { + if (msg->pe_ids[i] == 0 && msg->speeds[i] == 1.0 ) { + msg->speeds[i] = lbmgr->ProcessorSpeed(); + } + } + } + if (thisPeNew && rateAware) { + if (CkMyPe() == 0) CkAbort("[PE %d] Should never be new\n", CkMyPe()); + float speed = float(lbmgr->ProcessorSpeed()); + thisProxy[0].collectSpeeds(CkMyPe(), speed); + thisPeNew = false; + } + + logic[0]->resetObjs(); + + if (CkMyPe() == 0 && !rateAware) { + loadBalanceSubtree(numLevels - 1); + } +} + +void TreeLB::expand_init() +{ + awaitingLB[0] = true; + awaitingLB[1] = false; + + if (CkMyPe() == 0) + awaitingLB[1] = true; // root level also needs to do LB + + if (CkNumPes() == 1) + awaitingLB[0] = awaitingLB[1] = false; // no need for PE level if only 1 PE + + numLevels = 2; +} + TreeLB::~TreeLB() { #if CMK_LBDB_ON @@ -139,7 +220,7 @@ TreeLB::~TreeLB() void TreeLB::configure(LBTreeBuilder& builder, json& config) { #if CMK_LBDB_ON - + if (_lb_args.debug() > 0) if (numLevels > 0 && CkMyPe() == 0 && !quietModeRequested) { CkPrintf("[%d] Reconfiguring TreeLB\n", CkMyPe()); @@ -206,23 +287,57 @@ void TreeLB::configure(json& config) void TreeLB::pup(PUP::er& p) { - std::string configString; - if (p.isPacking()) - { - configString = config.dump(); - } - p | configString; - if (p.isUnpacking()) - { - config = json::parse(configString); + if (_lb_args.debug() > 2) + CkPrintf("[%d] TreeLB::pup numLevels=%d\n", CkMyPe(), numLevels); + + p|seqno; + + if(p.isUnpacking()){ 
+ loadConfigFile(CkLBOptions(seqno)); init(CkLBOptions(seqno)); + manager_init(); } + + assert(numLevels == 2); // rn this only supports the two level tree + + if (logic[1] == nullptr) { // TODO: delete this memory + logic[1] = new RootLevel(); // this is needed because logic[1] is null on PE1, but PE1 still needs to participate in this... confusing? + } + + if (_lb_args.debug() > 2) + CkPrintf("[%d] TreeLB::pupping logic things\n", CkMyPe()); + + int oldPE; + if (p.isPacking()) oldPE = CkMyPe(); + p|oldPE; + if (p.isUnpacking()) { + if (CkMyPe() != oldPE) { + thisPeNew = true; + } + } + + p|*logic[0]; + p|*logic[1]; + + if (p.isUnpacking()) + expand_init(); } -void TreeLB::InvokeLB() +void TreeLB::CallLB() { -#if CMK_LBDB_ON - // NOTE: I'm assuming new LBManager will know when (and when not to) call AtSync + #if CMK_LBDB_ON + #if CMK_SHRINK_EXPAND + + // if (pending_realloc_state != NO_REALLOC) { + // // if (_lb_args.debug() > 0) + // // CkPrintf("TreeLB::CallLB pending_realloc_state=%d (EXPAND_MSG_RECEIVED %d, NO_REALLOC %d)\n", pending_realloc_state, EXPAND_MSG_RECEIVED, NO_REALLOC); + // configure(config); // reconfigure tree in case number of PEs changed + // CkPrintf("Done reconfiguring tree\n"); + // } + + + #endif + if (barrier_before_lb) { contribute(CkCallback(CkReductionTarget(TreeLB, ProcessAtSync), thisProxy)); @@ -231,6 +346,15 @@ void TreeLB::InvokeLB() { thisProxy[CkMyPe()].ProcessAtSync(); } + #endif +} + +void TreeLB::InvokeLB() +{ +#if CMK_LBDB_ON + // NOTE: I'm assuming new LBManager will know when (and when not to) call AtSync + lbmgr->lb_in_progress = true; + CallLB(); #endif } @@ -242,22 +366,46 @@ void TreeLB::ProcessAtSync() { CkPrintf("--------- Started LB step %d ---------\n", lbmgr->step()); } - // CmiAssert(CmiNodeAlive(CkMyPe())); // TODO move this logic to LBManager - int level = 0; // load balancing starts at the lowest level - CkAssert(numLevels > 0 && !awaitingLB[level]); - TreeLBMessage* stats = logic[level]->getStats(); - 
stats->level = level; - awaitingLB[level] = true; + CkAssert(numLevels > 0 && !awaitingLB[0]); + TreeLBMessage* stats = logic[0]->getStats(); + stats->level = 0; + awaitingLB[0] = true; + sendStatsUp((CkMessage*)stats); #endif } +void TreeLB::CheckForLB() { +#if CMK_SHRINK_EXPAND +// // if (_lb_args.debug() > 0) +// // CkPrintf("TreeLB::CheckForLB pending_realloc_state=%d (EXPAND_MSG_RECEIVED %d, NO_REALLOC %d)\n", pending_realloc_state, EXPAND_MSG_RECEIVED, NO_REALLOC); + + if (pending_realloc_state == EXPAND_MSG_RECEIVED) + checkForRealloc(); + //else if (pending_realloc_state == NO_REALLOC) + // thisProxy.resumeClients(0); + else + loadBalanceSubtree(numLevels - 1); + //thisProxy.CallLB(); +#else + //thisProxy.CallLB(); + loadBalanceSubtree(numLevels - 1); +#endif +} + // send stats up using the comm-tree for this level void TreeLB::sendStatsUp(CkMessage* msg) { TreeLBMessage* stats = (TreeLBMessage*)msg; int level = stats->level; + if (comm_parent.size() <= level || comm_children.size() <= level || + comm_logic.size() <= level) + { + CkAbort("TreeLB: sendStatsUp invalid level %d, or comm_parent not initialized\n", level); + } + int comm_parent_pe = comm_parent[level]; + // fprintf(stderr, "[%d] TreeLB::sendStatsUp - received msg level=%d comm_parent=%d\n", // CkMyPe(), level, comm_parent_pe); if (comm_parent_pe == -1) @@ -298,7 +446,16 @@ void TreeLB::receiveStats(TreeLBMessage* stats, int level) { // cutoff can be adjusted dynamically, to prevent lb between upper-level domains. 
// can be used, for example, to only do within-node lb on some steps - loadBalanceSubtree(level); + TreeLBMessage* newMsg = l->mergeStats(); // this is IN PLACE + + #if CMK_SHRINK_EXPAND + //contribute(CkCallback(CkReductionTarget(TreeLB, CheckForLB), thisProxy[0])); + CheckForLB(); + #else + //CallLB(); + loadBalanceSubtree(level); + #endif + //loadBalanceSubtree(level); } else { @@ -311,6 +468,7 @@ void TreeLB::receiveStats(TreeLBMessage* stats, int level) void TreeLB::loadBalanceSubtree(int level) { + if (_lb_args.debug()) CkPrintf("[PE %d] TreeLB::loadBalanceSubtree called for level %d, awaiting %s\n", CkMyPe(), level, awaitingLB[level] ? "true" : "false"); if (!awaitingLB[level]) return; awaitingLB[level] = false; if (level == 0) return lb_done(); @@ -319,7 +477,8 @@ void TreeLB::loadBalanceSubtree(int level) /// CkMessage *inter_subtree_migrations = nullptr; IDM idm; - TreeLBMessage* decision = logic[level]->loadBalance(idm); + if (_lb_args.debug()) CkPrintf("[PE %d] Calling loadBalance at level %d\n", CkMyPe(), level); + TreeLBMessage* decision = logic[level]->loadBalance(idm); // this result is the MigMsg if (idm.size() > 0) { // this can happen when final destinations of chares has been decided, @@ -342,6 +501,8 @@ void TreeLB::loadBalanceSubtree(int level) // send decision to next level decision->level = level - 1; sendDecisionDown((CkMessage*)decision); + + } void TreeLB::multicastIDM(const IDM& mig_order, int num_pes, int* _pes) @@ -357,6 +518,8 @@ void TreeLB::multicastIDM(const IDM& mig_order, int num_pes, int* _pes) thisProxy[*tb.begin(i)].multicastIDM(mig_order, tb.subtreeSize(i), tb.begin(i)); } migrateObjects(mig_order); + + } void TreeLB::sendDecisionDown(CkMessage* msg) @@ -397,11 +560,11 @@ void TreeLB::sendDecisionDown(CkMessage* msg) void TreeLB::receiveDecision(TreeLBMessage* decision, int level) { // fprintf(stderr, "[%d] TreeLB::receiveDecision, level=%d\n", CkMyPe(), level); - // incoming and outgoing are integers. 
logic objects determine and interpret these // values int& incoming = expected_incoming[level]; int& outgoing = expected_outgoing[level]; + //CkPrintf("[PE %d] TreeLB::receiveDecision at level %d, incoming=%d outgoing=%d\n", CkMyPe(), level, incoming, outgoing); logic[level]->processDecision(decision, incoming, outgoing); // fprintf(stderr, "[%d] level=%d incoming=%d outgoing=%d\n", CkMyPe(), level, incoming, // outgoing); @@ -471,10 +634,12 @@ void TreeLB::recvLoadTokens(CkMessage* tokens) #endif int load = logic[level]->tokensReceived(token_set); load_received[level] += load; + CkPrintf("[PE %d] TreeLB::recvLoadTokens, load_received = %d\n", CkMyPe(), load_received[level]); + checkLoadExchanged(level); } -void TreeLB::objMovedIn(bool waitBarrier) +void TreeLB::objMovedIn(bool waitBarrier) // this should be called, but is not { if (!waitBarrier) CkAbort("TreeLB future migrates not supported\n"); @@ -483,6 +648,7 @@ void TreeLB::objMovedIn(bool waitBarrier) int level = 0; CkAssert(numLevels > 0 && awaitingLB[level]); load_received[level] += 1; + checkLoadExchanged(level); } @@ -497,7 +663,79 @@ void TreeLB::migrateObjects(const IDM& mig_order) checkLoadExchanged(level); } +void TreeLB::checkForRealloc() +{ +#if CMK_SHRINK_EXPAND +if (_lb_args.debug() > 0) { + CkPrintf( + "Check for Realloc. Number of stats messages: %d\n", + logic[1]->stats_msgs.size() + ); +} + + if(pending_realloc_state != NO_REALLOC) { + pending_realloc_state = (pending_realloc_state == SHRINK_MSG_RECEIVED) ? 
SHRINK_IN_PROGRESS : EXPAND_IN_PROGRESS; //in progress + CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe()); + double end_lb_time = CkWallTimer(); + + // do checkpoint + CkCallback cb(CkIndex_TreeLB::resumeFromReallocCheckpoint(), thisProxy[0]); + + // print avail vector + if (_lb_args.debug() > 0) { + CkPrintf("Shrink/Expand se_avail_vector on pe %d: ", CkMyPe()); + for(int i=0;i(se_avail_vector, se_avail_vector + CkNumPes())); + } + else + { + thisProxy.lb_done_impl(); + } +#endif +} + +void TreeLB::resumeFromReallocCheckpoint() +{ +#if CMK_SHRINK_EXPAND + std::vector avail(se_avail_vector, se_avail_vector + CkNumPes()); + free(se_avail_vector); + thisProxy.willIbekilled(avail, numProcessAfterRestart); +#endif +} + +void TreeLB::willIbekilled(std::vector avail, int newnumProcessAfterRestart){ +#if CMK_SHRINK_EXPAND + numProcessAfterRestart = newnumProcessAfterRestart; + CkCallback cb(CkIndex_TreeLB::startCleanup(), thisProxy[0]); + contribute(cb); +#endif +} + +void TreeLB::startCleanup() +{ +#if CMK_SHRINK_EXPAND + CkCleanup(); +#endif +} + void TreeLB::lb_done() +{ +#if CMK_SHRINK_EXPAND + // barrier to check for reallocation + CkCallback cb(CkIndex_TreeLB::checkForRealloc(), thisProxy[0]); + contribute(cb); + return; +#else + lb_done_impl(); +#endif +} + +void TreeLB::lb_done_impl() { // fprintf(stderr, "[%d] lb_done step %d lb_time=%f\n", CkMyPe(), lbmgr->step(), // CkWallTimer() - startTime); @@ -505,8 +743,15 @@ void TreeLB::lb_done() // TODO LBManager should do all of this, including global syncResume ****** // Currently, TreeLB does syncResume by setting barrier_after_lb=true - // clear load stats + +#if CMK_SHRINK_EXPAND + // Only clear loads if not in the middle of a reallocation (EXPAND/SHRINK) + if (pending_realloc_state == NO_REALLOC){ + lbmgr->ClearLoads(); + } +#else lbmgr->ClearLoads(); +#endif if (CkMyPe() == 0 && _lb_args.debug() > 0) { @@ -555,6 +800,11 @@ void TreeLB::resumeClients() } } 
lbmgr->ResumeClients(); + + lbmgr->lb_in_progress = false; + + if (CkMyPe() == 0) + lbmgr->callRealloc(); } void TreeLB::reportLbTime(double* times, int n) diff --git a/src/ck-ldb/TreeLB.ci b/src/ck-ldb/TreeLB.ci index 312f9d38ca..4227fbe369 100644 --- a/src/ck-ldb/TreeLB.ci +++ b/src/ck-ldb/TreeLB.ci @@ -1,5 +1,7 @@ module TreeLB { - + PUPable LevelLogic; + PUPable RootLevel; + PUPable PELevel; include "idm.h"; extern module BaseLB; @@ -17,6 +19,17 @@ module TreeLB { entry void multicastIDM(IDM &mig_order, int num_pes, int _pes[num_pes]); entry [reductiontarget] void resumeClients(void); entry [reductiontarget] void reportLbTime(double times[n], int n); + + entry void resumeFromReallocCheckpoint(); + entry void lb_done_impl(); + entry void startCleanup(); + entry [reductiontarget] void CheckForLB(); + entry void CallLB(); + entry void checkForRealloc(); + entry void willIbekilled(std::vector avail, int newnumProcessAfterRestart); + + entry void restartFromSE(bool rateAware); + entry void collectSpeeds(int pe_id, float speed); }; }; diff --git a/src/ck-ldb/TreeLB.h b/src/ck-ldb/TreeLB.h index 31445f5dc3..aeadaf95ad 100644 --- a/src/ck-ldb/TreeLB.h +++ b/src/ck-ldb/TreeLB.h @@ -6,6 +6,7 @@ #include "BaseLB.h" #include "TreeLB.decl.h" #include "json.hpp" +#include "manager.h" #include using json = nlohmann::json; @@ -25,18 +26,29 @@ class TreeLBMessage public: uint8_t level; // WARNING: don't add any virtual methods here + + virtual void pup(PUP::er& p) { CkAbort("TreeLBMessage::pup not implemented\n"); } }; -class LevelLogic +class LevelLogic : public PUP::able { public: + std::vector stats_msgs; + + LevelLogic() : PUP::able() { + num_stats_msgs = 0; + num_strategies = 0; + } virtual ~LevelLogic() {} /// return msg with lb stats for this PE. 
only needed at leaves virtual TreeLBMessage* getStats() { CkAbort("LevelLogic::getStats not implemented\n"); } + virtual bool collectSpeeds(int pe_id, float speed) { CkAbort("LevelLogic::collectSpeeds not implemented\n"); return false; } + virtual int getNumNewPes() { CkAbort("LevelLogic::getNumNewPes not implemented\n"); return 0; } // Note: These are not "=0" methods, because then the subclass would have to // implement (and abort inside) empty methods if it doesn't need them + virtual void resetObjs() { CkAbort("LevelLogic::resetObjs not implemented\n"); } /// deposit stats msg received from a child virtual void depositStats(TreeLBMessage* stats) { stats_msgs.push_back(stats); } @@ -102,6 +114,18 @@ class LevelLogic CkAbort("LevelLogic::processDecision not implemented\n"); } + PUPable_decl(LevelLogic); + LevelLogic(CkMigrateMessage *m) : PUP::able(m) {} + virtual void pup(PUP::er& p) { + PUP::able::pup(p); + if (p.isPacking()) { + CkPrintf("[PE %d] PUPPING LevelLogic with %d stats and %d strategies\n", CkMyPe(), stats_msgs.size()); + num_stats_msgs = stats_msgs.size(); + } + p|num_stats_msgs; + } + + virtual bool makesTokens() { return false; } /// return nominal load that is being transferred in the tokens @@ -124,7 +148,8 @@ class LevelLogic } protected: - std::vector stats_msgs; + unsigned int num_stats_msgs; + int num_strategies; }; class LBTreeBuilder; @@ -139,10 +164,21 @@ class TreeLB : public CBase_TreeLB { loadConfigFile(opts); init(opts); +#if CMK_SHRINK_EXPAND + manager_init(); +#endif + } + + TreeLB(CkMigrateMessage* m) : CBase_TreeLB(m) + { +#if CMK_SHRINK_EXPAND + CkPrintf("TREELB MIGRATION constructor ON PE %d\n", CkMyPe()); +#endif } - TreeLB(CkMigrateMessage* m) : CBase_TreeLB(m) {} + virtual ~TreeLB(); + void expand_init(); void pup(PUP::er& p); void loadConfigFile(const CkLBOptions& opts); @@ -153,7 +189,7 @@ class TreeLB : public CBase_TreeLB // start load balancing (non-AtSync mode) NOTE: This seems to do a broadcast // (is this the behavior 
we want?) - inline void StartLB() { thisProxy.ProcessAtSync(); } + void StartLB(); // TODO: I would rename this group of functions (to maybe something like startLBLocal) // since they are also used in non-AtSync mode @@ -163,6 +199,8 @@ class TreeLB : public CBase_TreeLB // output look funny // TODO: do we still need this? + + // send stats up using the comm-tree for this level void sendStatsUp(CkMessage* stats); @@ -180,6 +218,20 @@ class TreeLB : public CBase_TreeLB void reportLbTime(double* times, int n); + void resumeFromReallocCheckpoint(); + + void lb_done_impl(); + + void startCleanup(); + void CallLB(); + void CheckForLB(); + + void checkForRealloc(); + + void willIbekilled(std::vector avail, int newnumProcessAfterRestart); + void restartFromSE(bool rateAware); + void collectSpeeds(int pe_id, float speed); + private: void init(const CkLBOptions& opts); @@ -187,6 +239,7 @@ class TreeLB : public CBase_TreeLB void receiveStats(TreeLBMessage* stats, int level); void loadBalanceSubtree(int level); + void setupForProcessing(int level); // receive lb decision from parent (decision could be empty -do nothing-) // a non-empty decision implies load is moved from one subtree to another subtree @@ -211,6 +264,7 @@ class TreeLB : public CBase_TreeLB // load can be actual objects or tokens inline bool checkLoadReceived(int level) { + //if (_lb_args.debug() > 2) CkPrintf("[PE %d] TreeLB::checkLoadReceived at level %d: received=%d expected=%d\n", CkMyPe(), level, load_received[level], expected_incoming[level]); if (load_received[level] == expected_incoming[level]) { load_received[level] = expected_incoming[level] = 0; @@ -229,6 +283,8 @@ class TreeLB : public CBase_TreeLB uint8_t numLevels = 0; // total number of tree levels (this chare won't necessarily // participate in all levels) + + bool thisPeNew = false; // true if this PE is new after a shrink/expand operation std::vector logic; // level -> my logic object at this level std::vector comm_parent; // level -> my 
parent PE in comm-tree connecting level to level+1 diff --git a/src/ck-ldb/TreeLevel.h b/src/ck-ldb/TreeLevel.h index db41872f74..1e1be9396f 100644 --- a/src/ck-ldb/TreeLevel.h +++ b/src/ck-ldb/TreeLevel.h @@ -9,6 +9,7 @@ #include "TreeStrategyFactory.h" #include #include // std::numeric_limits +#include #define FLOAT_TO_INT_MULT 10000 @@ -43,6 +44,32 @@ class LBStatsMsg_1 : public TreeLBMessage, public CMessage_LBStatsMsg_1 // considered to have ID i unsigned int* order; // list of obj ids sorted by load (ids are determined by position in oloads) + + void pup(PUP::er& p) + { + + p|nObjs; + p|nPes; + + + for (int i = 0; i < nPes; i++) + p|pe_ids[i]; + for (int i = 0; i < nPes; i++) + p|bgloads[i]; + for (int i = 0; i < nPes; i++) + p|speeds[i]; + for (int i = 0; i < nPes + 1; i++) + p|obj_start[i]; + for (int i = 0; i < nObjs; i++) + p|oloads[i]; + for (int i = 0; i < nObjs; i++) + p|order[i]; + + + CkPrintf("[PE %d] Done PUPPING LBStatsMsg_1 with %d objs and %d pes\n", CkMyPe(), nObjs, nPes); + + } + static TreeLBMessage* merge(std::vector& msgs) { @@ -101,28 +128,33 @@ class LBStatsMsg_1 : public TreeLBMessage, public CMessage_LBStatsMsg_1 int pe_cnt = 0; int obj_cnt = 0; float total_load = 0; - for (int i = 0; i < msgs.size(); i++) + + if (msgs.size() != 1) { + CkAbort("[PE %d] LBStatsMsg_1::fill should only have one msg, has %d\n", CkMyPe(), msgs.size()); + } + LBStatsMsg_1* msg = (LBStatsMsg_1*)msgs[0]; + //if (_lb_args.debug() > 1)CkPrintf("[PE %d] msg %d with %d pes and %d objs\n", CkMyPe(), 0, msg->nPes, msg->nObjs); + for (int j = 0; j < msg->nPes; j++) { - LBStatsMsg_1* msg = (LBStatsMsg_1*)msgs[i]; - for (int j = 0; j < msg->nPes; j++) + int pe = msg->pe_ids[j]; + CkAssert(pe >= 0 && pe < CkNumPes()); + //if (_lb_args.debug() > 2) CkPrintf("[PE %d] filling pe %d with %d objs\n", CkMyPe(), pe, + // msg->obj_start[j + 1] - msg->obj_start[j]); + procs[pe_cnt].populate(pe, msg->bgloads + j, msg->speeds + j); + procs[pe_cnt++].resetLoad(); + 
migMsg->obj_start[pe] = obj_cnt; + int local_id = 0; + for (int k = msg->obj_start[j]; k < msg->obj_start[j + 1]; + k++, obj_cnt++, local_id++) { - int pe = msg->pe_ids[j]; - CkAssert(pe >= 0 && pe < CkNumPes()); - procs[pe_cnt].populate(pe, msg->bgloads + j, msg->speeds + j); - procs[pe_cnt++].resetLoad(); - migMsg->obj_start[pe] = obj_cnt; - int local_id = 0; - for (int k = msg->obj_start[j]; k < msg->obj_start[j + 1]; - k++, obj_cnt++, local_id++) - { - objs[obj_cnt].populate(obj_cnt, msg->oloads + k, pe); - total_load += objs[obj_cnt].getLoad(); - migMsg->to_pes[obj_cnt] = pe; - // if obj_local_ids.size() > 0: - obj_local_ids[obj_cnt] = local_id; - } + objs[obj_cnt].populate(obj_cnt, msg->oloads + k, pe); + total_load += objs[obj_cnt].getLoad(); + migMsg->to_pes[obj_cnt] = pe; + // if obj_local_ids.size() > 0: + obj_local_ids[obj_cnt] = local_id; } } + CkAssert(obj_cnt == objs.size()); CkAssert(pe_cnt == procs.size()); return total_load; @@ -175,6 +207,9 @@ class IStrategyWrapper virtual void removeObj(int& local_id, int& oldPe, float& load) = 0; virtual void addForeignObject(int local_id, int oldPe, float load) = 0; + + virtual void pup(PUP::er& p) { CkAbort("IStrategyWrapper::pup not implemented\n"); + } // TODO: pup correctly }; // This wrapper allocates mem for objects and processors. to the lb algorithm, @@ -374,6 +409,35 @@ class StrategyWrapper : public IStrategyWrapper sol->setErrorChecking(objs, procs); #endif +#if CMK_SHRINK_EXPAND + if (se_avail_vector != NULL) { + if (_lb_args.debug() > 0) CkPrintf("se_avail_vector is not null on pe %d, removing procs that will be removed\n", CkMyPe()); + // if shrink/expand is enabled, remove processors that will be removed (this happens at shrink, before the checkpoint) + std::vector

procs2; + for (const auto& p : procs) { + if (se_avail_vector[p.id] != 0) procs2.push_back(p); + } + procs = procs2; + } +#endif + if (_lb_args.debug() > 0){ + CkPrintf("[PE %d] Procs are : ", CkMyPe()); + for (const auto& p : procs) { + CkPrintf("%d ", p.id); + } + CkPrintf("\n"); + CkPrintf("[PE %d] Objs per PE: ", CkMyPe()); + std::map counts; + for (const auto& o : objs) { + counts[o.oldPe]++; + } + for (const auto& p : procs) { + CkPrintf("%d:%d ", p.id, counts[p.id]); + } + CkPrintf("\n"); + } + + double t0 = CkWallTimer(); strategy->solve(objs, procs, *sol, false); @@ -455,13 +519,132 @@ class StrategyWrapper : public IStrategyWrapper class RootLevel : public LevelLogic { public: - RootLevel(int _num_groups = -1) : num_groups(_num_groups) {} - + RootLevel(int _num_groups = -1) : num_groups(_num_groups) { + nPes = CkNumPes(); + } virtual ~RootLevel() { for (auto w : wrappers) delete w; } + virtual int getNumNewPes() { return num_new_pes; } + + virtual bool collectSpeeds(int proc_id, float speed) { + if (rateAware) + { + CkPrintf("[PE %d] RootLevel::collectSpeeds proc_id=%d speed=%f\n", CkMyPe(), proc_id, speed); + LBStatsMsg_1* msg = (LBStatsMsg_1*)stats_msgs[0]; + for (int i = 0; i < msg->nPes; i++) { + if (msg->pe_ids[i] == proc_id) { + + msg->speeds[i] = speed; + } + } + + num_new_pes--; + if (num_new_pes == 0) { + if (_lb_args.debug() > 0){ + if (CkMyPe() == 0) { + CkPrintf("After speeds collected: My stats message on PE 0: %d\n", ((LBStatsMsg_1*)stats_msgs[0])->nObjs); + for (int i = 0; i < ((LBStatsMsg_1*)stats_msgs[0])->nPes; i++) { + CkPrintf(" pe %d: id=%d bgload=%f speed=%f obj_start=%d\n", i, ((LBStatsMsg_1*)stats_msgs[0])->pe_ids[i], ((LBStatsMsg_1*)stats_msgs[0])->bgloads[i], ((LBStatsMsg_1*)stats_msgs[0])->speeds[i], ((LBStatsMsg_1*)stats_msgs[0])->obj_start[i]); + } + + } + } + // all new pes have reported their speed, can run lb now + return true; + } + } + return false; + } + + virtual TreeLBMessage* mergeStats() + { + // send obj loads up + 
TreeLBMessage* newMsg = LBStatsMsg_1::merge(stats_msgs); + // need to cast pointer to ensure delete of CMessage_LBStatsMsg_1 is called + for (auto m : stats_msgs) delete (LBStatsMsg_1*)m; + stats_msgs.clear(); + stats_msgs.push_back(newMsg); + return newMsg; + } + + PUPable_decl(RootLevel); + RootLevel(CkMigrateMessage *m) : LevelLogic(m) {} + + virtual void pup(PUP::er& p) + { + if (_lb_args.debug() > 2) CkPrintf("[PE %d] PUPPING RootLevel\n", CkMyPe()); + LevelLogic::pup(p); // this packs num_stats_msgs + + if (num_stats_msgs > 1) CkAbort("RootLevel should have just one stats message! Has %d\n", num_stats_msgs); + p|nObjs; + p|nPes; + int nNewPes = CkNumPes(); + + if (_lb_args.debug() > 2) CkPrintf("[PE %d] Done with basics\n", CkMyPe()); + + + // stats_msgs stuff is only relevant for expand + if (p.isUnpacking()) { + stats_msgs.resize(1); + LBStatsMsg_1* msg; + if (rateAware) + msg= new (nNewPes, nNewPes, nNewPes, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1; + else + msg= new (nNewPes, nNewPes, 0, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1; + stats_msgs[0] = msg; + } + if (stats_msgs.size() == 0) stats_msgs.push_back(new (nNewPes, nNewPes, nNewPes, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1); + p|*stats_msgs[0]; // everyone needs to pup this cause pup is dumb + + if (p.isUnpacking() && CkMyPe() == 0 && num_stats_msgs > 0) { + // if num_stats_msgs = 0, then we aren't expanding + LBStatsMsg_1* msg = (LBStatsMsg_1*)stats_msgs[0]; + if (nObjs != msg->nObjs) { + CkAbort("In RootLevel::pup, nObjs (%d) != msg->nObjs (%d)\n", nObjs, msg->nObjs); + } + + msg->nPes = CkNumPes(); + + // TODO: this will not work if we do simultaneous shrink/expand + num_new_pes = 0; + for (int i = nPes; i < CkNumPes(); i++) + { + // on expand: need to reset the new PEs info + if (msg->pe_ids[i] > CkNumPes() - 1 || msg->pe_ids[i] <= 0) { + // you are a new pe! 
+ if (_lb_args.debug() > 0) CkPrintf("[PE %d] RootLevel::pup PE %d is new, resetting its info\n", CkMyPe(), i); + num_new_pes++; + msg->pe_ids[i] = i; + msg->bgloads[i] = 0; + // msg->speeds[i] = 1.0; // speeds need to be recomputed for the new procs and sent back to the root + msg->obj_start[i+1] = msg->obj_start[i]; // no objects + } + } + + if (_lb_args.debug() > 0){ + if (CkMyPe() == 0) { + CkPrintf("My stats message on PE 0: %d\n", msg->nObjs); + for (int i = 0; i < msg->nPes; i++) { + CkPrintf(" pe %d: id=%d bgload=%f speed=%f obj_start=%d\n", i, msg->pe_ids[i], msg->bgloads[i], msg->speeds[i], msg->obj_start[i]); + } + + } + } + } + if (num_stats_msgs == 0) { + stats_msgs.clear(); + } + + + nPes = nNewPes; + num_stats_msgs = stats_msgs.size(); + + } + + /** * mode 0: receive obj stats * mode 1: receive aggregated group load @@ -470,7 +653,10 @@ class RootLevel : public LevelLogic json& config, bool repeat_strategies = false, bool token_passing = true) { + using namespace TreeStrategy; + this->rateAware = rateAware; + this->strategies = strategies; for (auto w : wrappers) delete w; wrappers.clear(); if (num_groups == -1) @@ -506,7 +692,6 @@ class RootLevel : public LevelLogic } else { - nPes += ((LBStatsMsg_1*)stats)->nPes; nObjs += ((LBStatsMsg_1*)stats)->nObjs; } } @@ -518,6 +703,9 @@ class RootLevel : public LevelLogic #endif const int num_children = stats_msgs.size(); + if (num_children != 1) { + CkAbort("RootLevel::loadBalance: expected just one stats message (merged) but received from %d\n", nPes, num_children); + } CkAssert(num_children > 0); #if DEBUG__TREE_LB_L1 CkPrintf("[%d] RootLevel::loadBalance, num_children=%d nPes=%d nObjs=%d\n", CkMyPe(), @@ -527,7 +715,8 @@ class RootLevel : public LevelLogic if (num_groups == -1) { // msg has object loads - CkAssert(wrappers.size() > current_strategy); + if (wrappers.size() == 0) + CkAbort("No strategies configured for TreeLB with obj-based strategies\n"); IStrategyWrapper* wrapper = 
wrappers[current_strategy]; CkAssert(wrapper != nullptr); CkAssert(nPes == CkNumPes()); @@ -538,6 +727,9 @@ class RootLevel : public LevelLogic #if DEBUG__TREE_LB_L1 double t0 = CkWallTimer(); #endif + if (nPes != CkNumPes()) { + CkAbort("nPes (%d) != CkNumPes() (%d) in RootLevel::loadBalance\n", nPes, CkNumPes()); + } wrapper->prepStrategy(nObjs, nPes, stats_msgs, migMsg); wrapper->runStrategy(migMsg); if (current_strategy == wrappers.size() - 1) @@ -555,7 +747,7 @@ class RootLevel : public LevelLogic // need to cast pointer to ensure delete of CMessage_LBStatsMsg_1 is called for (auto msg : stats_msgs) delete (LBStatsMsg_1*)msg; stats_msgs.clear(); - nPes = nObjs = 0; + nObjs = 0; return migMsg; } else @@ -624,6 +816,7 @@ class RootLevel : public LevelLogic } total_load = 0.0; + nObjs = 0; // cleanup for next round int nmoves = int(solution.size()); SubtreeMigrateDecisionMsg* migMsg = @@ -652,6 +845,7 @@ class RootLevel : public LevelLogic int load; }; + int num_new_pes = 0; // number of new pes on expand int num_groups; bool repeat_strategies; size_t current_strategy = 0; @@ -660,6 +854,8 @@ class RootLevel : public LevelLogic unsigned int nObjs = 0; // total number of objects in msgs I am processing float total_load = 0; std::vector wrappers; + bool rateAware; + std::vector strategies; }; // ---------------- NodeSetLevel ---------------- @@ -1055,7 +1251,11 @@ class PELevel : public LevelLogic { inline bool operator()(const LDObjData& o1, const LDObjData& o2) const { +#if CMK_CUDA || CMK_HIP + return (o1.gpuTime > o2.gpuTime); +#else return (o1.wallTime > o2.wallTime); +#endif } }; @@ -1063,10 +1263,56 @@ class PELevel : public LevelLogic virtual ~PELevel() {} + PUPable_decl(PELevel); + PELevel(CkMigrateMessage *m) : LevelLogic(m) {} + + virtual void pup(PUP::er& p) + { + LevelLogic::pup(p); // this packs num_stats_msgs + + p|nObjs; + // p|myObjs; + + int nPes; + if (p.isPacking()) nPes = CkNumPes(); + p|nPes; + + if (p.isUnpacking()) { + if (CkMyPe() >= nPes) 
{ + myObjs.clear(); + nObjs = 0; + } + } + num_stats_msgs = 0; + + } + + virtual void resetObjs() { + int nobjs = lbmgr->GetObjDataSz(); + + std::vector allLocalObjs(nobjs); + if (nobjs > 0) lbmgr->GetObjData(allLocalObjs.data()); // populate allLocalObjs + + myObjs.clear(); + nObjs = 0; + + for (int i = 0; i < nobjs; i++) + { + if (allLocalObjs[i].migratable) + { + + myObjs.emplace_back(allLocalObjs[i]); + nObjs++; + + } + } + } + virtual TreeLBMessage* getStats() { const int mype = CkMyPe(); int nobjs = lbmgr->GetObjDataSz(); + std::vector allLocalObjs(nobjs); if (nobjs > 0) lbmgr->GetObjData(allLocalObjs.data()); // populate allLocalObjs myObjs.clear(); @@ -1079,17 +1325,25 @@ class PELevel : public LevelLogic } else { + #if CMK_CUDA || CMK_HIP + nonMigratableLoad += allLocalObjs[i].gpuTime; + #else nonMigratableLoad += allLocalObjs[i].wallTime; + #endif } } nobjs = myObjs.size(); - + nObjs = nobjs; // TODO verify that non-migratable objects are not added to msg and are only counted // as background load #if DEBUG__TREE_LB_L3 float total_obj_load = 0; + #if CMK_CUDA || CMK_HIP + for (int i = 0; i < nobjs; i++) total_obj_load += myObjs[i].gpuTime; + #else for (int i = 0; i < nobjs; i++) total_obj_load += myObjs[i].wallTime; + #endif CkPrintf("[%d] PELevel::getStats, myObjs=%d, aggregate_obj_load=%f\n", mype, int(myObjs.size()), total_obj_load); #endif @@ -1106,7 +1360,14 @@ class PELevel : public LevelLogic if (rateAware) { msg = new (1, 1, 1, 2, nobjs, nobjs, 0) LBStatsMsg_1; +#if CMK_CUDA || CMK_HIP + msg->speeds[0] = float(lbmgr->ProcessorGPUSpeed()); +#else msg->speeds[0] = float(lbmgr->ProcessorSpeed()); +#endif + + if (_lb_args.debug() > 1) + CkPrintf("[%d] PELevel: processor speed is %f\n", mype, msg->speeds[0]); } else msg = new (1, 1, 0, 2, nobjs, nobjs, 0) LBStatsMsg_1; @@ -1119,15 +1380,23 @@ class PELevel : public LevelLogic { // If rateAware, convert object loads by multiplying by processor speed // Note this conversion isn't done for bgloads because 
they never leave the PE + +#if CMK_CUDA || CMK_HIP + float oload = float(myObjs[i].gpuTime); +#else + float oload = float(myObjs[i].wallTime); +#endif if (rateAware) - msg->oloads[i] = float(myObjs[i].wallTime) * msg->speeds[0]; + msg->oloads[i] = oload * msg->speeds[0]; else - msg->oloads[i] = float(myObjs[i].wallTime); + msg->oloads[i] = oload; msg->order[i] = i; } LBRealType t1, t2, t3, bg_walltime; -#if CMK_LB_CPUTIMER +#if CMK_CUDA || CMK_HIP + lbmgr->GetGPUBGTime(&bg_walltime); +#elif CMK_LB_CPUTIMER LBRealType t4; lbmgr->GetTime(&t1, &t2, &t3, &bg_walltime, &t4); #else @@ -1153,18 +1422,22 @@ class PELevel : public LevelLogic outgoing = 0; int obj_start = decision->obj_start[mype]; int obj_end = obj_start + int(myObjs.size()); + assert(myObjs.size == nObjs); int j = 0; + for (int i = obj_start; i < obj_end; i++, j++) { + //if (_lb_args.debug() > 2) CkPrintf("[%d] PELevel: obj %d (abs=%d, handle=%d) to dest %d\n", CkMyPe(), j, i, + // myObjs[j].handle.handle, decision->to_pes[i]); int dest = decision->to_pes[i]; + if (dest > CkNumPes() - 1) + CkAbort("PELevel: processDecision found dest PE >= CkNumPes(): %d >= %d\n", dest, CkNumPes()); if (dest != mype) { if (dest >= 0) { -#if DEBUG__TREE_LB_L3 - CkPrintf("[%d] (processDecision) My obj %d (abs=%d) moving to %d\n", CkMyPe(), - j, i, dest); -#endif + //if (_lb_args.debug() > 1) CkPrintf("[%d] (processDecision) My obj %d (abs=%d, handle=%d) moving to %d\n", CkMyPe(), + // j, i, myObjs[j].handle.handle, dest); if (lbmgr->Migrate(myObjs[j].handle, dest) == 0) { CkAbort("PELevel: Migrate call returned 0\n"); @@ -1210,6 +1483,7 @@ class PELevel : public LevelLogic LBManager* lbmgr; bool rateAware; std::vector myObjs; + int nObjs = 0; }; // ---------------- MsgAggregator ---------------- diff --git a/src/ck-ldb/TreeStrategyBase.h b/src/ck-ldb/TreeStrategyBase.h index 57926535d6..be91b37b93 100644 --- a/src/ck-ldb/TreeStrategyBase.h +++ b/src/ck-ldb/TreeStrategyBase.h @@ -172,7 +172,7 @@ class Proc { public: int id = 
-1; - + float speed[N] = {1.0}; inline void populate(int _id, float* _bgload, float* _speed) { id = _id; diff --git a/src/ck-ldb/ckgraph.C b/src/ck-ldb/ckgraph.C index 326e6eb39c..bb9eea089e 100644 --- a/src/ck-ldb/ckgraph.C +++ b/src/ck-ldb/ckgraph.C @@ -15,23 +15,40 @@ ProcArray::ProcArray(BaseLB::LDStats *stats) { const int numPes = stats->procs.size(); + // fill the processor array procs.resize(numPes); + availPeMap.resize(numPes); + std::fill(availPeMap.data(), availPeMap.data() + numPes, -1); // Loop through the LDStats structure, copying data into this array and calculating // the average 'totalLoad' of all the PEs + availProcSize = 0; avgLoad = 0.0; + int currAvailPe = 0; for(int pe = 0; pe < numPes; pe++) { procs[pe].id = stats->procs[pe].pe; procs[pe].setOverhead(stats->procs[pe].bg_walltime); procs[pe].setTotalLoad(stats->procs[pe].total_walltime - stats->procs[pe].idletime); procs[pe].available = stats->procs[pe].available; + //CkPrintf("%i avail = %d\n", pe, procs[pe].available); + availProcSize += (procs[pe].available ? 
1 : 0); avgLoad += procs[pe].getTotalLoad(); + if (!procs[pe].available) + currAvailPe++; + if (currAvailPe < numPes) + availPeMap[pe] = currAvailPe++; // CkPrintf("PE%d overhead:%f totalLoad:%f \n",pe,procs[pe].overhead(),procs[pe].totalLoad()); } + availPeMap.resize(availProcSize); avgLoad /= numPes; } +void ProcArray::reassignPeMapToAvailable(std::vector &pemap) { + for (int i = 0; i < pemap.size(); i++) + pemap[i] = availPeMap[pemap[i]]; +} + void ProcArray::resetTotalLoad() { for(int pe = 0; pe < procs.size(); pe++) procs[pe].setTotalLoad(procs[pe].getOverhead()); diff --git a/src/ck-ldb/ckgraph.h b/src/ck-ldb/ckgraph.h index 8b02e478f1..8ac427b39c 100644 --- a/src/ck-ldb/ckgraph.h +++ b/src/ck-ldb/ckgraph.h @@ -17,6 +17,9 @@ #include "BaseLB.h" #include +#define MAX(a,b) ((a)>(b)?(a):(b)) +#define MIN(a,b) ((a)<(b)?(a):(b)) + class ProcInfo { friend class ProcArray; @@ -56,9 +59,12 @@ class ProcArray ProcArray(BaseLB::LDStats* stats); double getAverageLoad() const { return avgLoad; } void resetTotalLoad(); + void reassignPeMapToAvailable(std::vector &pemap); // vector containing the list of processors std::vector procs; + std::vector availPeMap; + int availProcSize; protected: double avgLoad; @@ -143,7 +149,7 @@ class CkVertex } int getVertexId() const { return id; } - double getVertexLoad() const { return compLoad; } + double getVertexLoad() const { return MAX(compLoad, 0.1); } int getCurrentPe() const { return currPe; } int getNewPe() const { return newPe; } void setNewPe(int _newpe) { newPe = _newpe; } diff --git a/src/ck-ldb/greedy.h b/src/ck-ldb/greedy.h index c87599859c..995becb1e2 100644 --- a/src/ck-ldb/greedy.h +++ b/src/ck-ldb/greedy.h @@ -83,7 +83,7 @@ class GreedyRefine : public Strategy // TODO improve the case where the proc is not in my list of processors (because // it belongs to a foreing domain). procHeap API should return an error? 
P& oldPe = procHeap.getProc(ptr(o)->oldPe); - if ((oldPe.id >= 0) && (oldPe.getLoad() + ptr(o)->getLoad() <= M)) + if ((oldPe.id >= 0) && (oldPe.getLoad() + (ptr(o)->getLoad() / oldPe.speed[0]) <= M)) p = oldPe; else p = procHeap.top(); diff --git a/src/ck-ldb/lbdb.h b/src/ck-ldb/lbdb.h index 12f330eddd..22ef1dfb73 100644 --- a/src/ck-ldb/lbdb.h +++ b/src/ck-ldb/lbdb.h @@ -157,6 +157,9 @@ class LBObjUserData { struct LDObjData { LDObjHandle handle; LBRealType wallTime; +#if CMK_CUDA || CMK_HIP + LBRealType gpuTime; +#endif #if CMK_LB_CPUTIMER LBRealType cpuTime; #endif @@ -171,6 +174,9 @@ struct LDObjData { // An encoded approximation of the amount of data the object would pack; // call pup_decodeSize(pupSize) to get the actual approximate value CmiUInt2 pupSize; +#if CMK_CUDA || CMK_HIP + size_t gpuPupSize; +#endif inline const LDOMHandle &omHandle() const { return handle.omhandle; } inline const LDOMid &omID() const { return handle.omhandle.id; } inline const CmiUInt8 &objID() const { return handle.id; } @@ -333,6 +339,9 @@ inline void LBObjUserData::pup(PUP::er &p) { inline void LDObjData::pup(PUP::er &p) { p|handle; p|wallTime; +#if CMK_CUDA || CMK_HIP + p|gpuTime; +#endif #if CMK_LB_CPUTIMER p|cpuTime; #endif @@ -348,6 +357,9 @@ inline void LDObjData::pup(PUP::er &p) { } #endif p|pupSize; +#if CMK_CUDA || CMK_HIP + p|gpuPupSize; +#endif } inline bool LDCommDesc::operator==(const LDCommDesc &obj) const { diff --git a/src/ck-ldb/manager.C b/src/ck-ldb/manager.C index 9ef1111bee..345a2e0abe 100644 --- a/src/ck-ldb/manager.C +++ b/src/ck-ldb/manager.C @@ -13,38 +13,152 @@ #include "converse.h" #include "conv-ccs.h" + #if CMK_SHRINK_EXPAND realloc_state pending_realloc_state; char * se_avail_vector; -extern "C" int numProcessAfterRestart; +int numProcessAfterRestart; extern "C" CcsDelayedReply shrinkExpandreplyToken; extern "C" char willContinue; char willContinue; #endif -extern int load_balancer_created; +bool load_balancer_created; + +void realloc(char* 
reallocMsg) +{ +#if CMK_SHRINK_EXPAND + numProcessAfterRestart = *((int *)reallocMsg); + reallocMsg += sizeof(int); + int numBits = *((int *)reallocMsg); + reallocMsg += sizeof(int); + + CkPrintf("Charm> numProcessAfterRestart = %d, numBits = %d\n", numProcessAfterRestart, numBits); + + if (LBManagerObj()->lb_in_progress) + { + CkPrintf("Charm> Rescaling called while load balancing is in progress!\n"); + LBManagerObj()->bufferRealloc(reallocMsg - 2 * sizeof(int)); + } + else + { + //if (numProcessAfterRestart > CkNumPes()) + // pending_realloc_state = EXPAND_MSG_RECEIVED; + //else + // pending_realloc_state = SHRINK_MSG_RECEIVED; + + char* old_bitmap = (char *)malloc(sizeof(char) * CkNumPes()); + LBManagerObj()->get_avail_vector(old_bitmap); + + char* new_bitmap = (char *)malloc(sizeof(char) * CkNumPes()); + memcpy(new_bitmap, old_bitmap, sizeof(char) * CkNumPes()); + + int last_pe = -1; + int j = 0; + for (int i = 0; i < numBits; i++) + { + if (reallocMsg[i] == 0) + { + while (last_pe < i && j < CkNumPes()) + last_pe += old_bitmap[j++]; + + if (last_pe == i) + new_bitmap[j-1] = 0; + } + } + + for (int i = 0; i < CkNumPes(); i++) + { + CkPrintf("Charm> before old_bitmap[%d] = %d\n", i, old_bitmap[i]); + CkPrintf("Charm> reallocMsg[%d] = %d\n", i, reallocMsg[i]); + //new_bitmap[i] = reallocMsg[i] & new_bitmap[i]; + CkPrintf("Charm> after new_bitmap[%d] = %d\n", i, new_bitmap[i]); + } + + if((CkMyPe() == 0) && (load_balancer_created)) + LBManagerObj()->set_avail_vector(new_bitmap, 0); + + se_avail_vector = (char *)malloc(sizeof(char) * CkNumPes()); + LBManagerObj()->get_avail_vector(se_avail_vector); + + // now find whether this is shrink/expand + pending_realloc_state = NO_REALLOC; + for (int i = 0; i < CkNumPes(); i++) + if (se_avail_vector[i] == 0) + { + pending_realloc_state = SHRINK_MSG_RECEIVED; + break; + } + + if (numProcessAfterRestart > CkNumPes() || (numProcessAfterRestart == CkNumPes() && + pending_realloc_state == SHRINK_MSG_RECEIVED)) + 
pending_realloc_state = static_cast(static_cast(pending_realloc_state) | + static_cast(EXPAND_MSG_RECEIVED)); + + //free(reallocMsg); + free(new_bitmap); + free(old_bitmap); + } +#endif +} + static void handler(char *bit_map) { #if CMK_SHRINK_EXPAND + printf("Charm> Rescaling called!\n"); shrinkExpandreplyToken = CcsDelayReply(); bit_map += CmiMsgHeaderSizeBytes; - pending_realloc_state = REALLOC_MSG_RECEIVED; - - if((CkMyPe() == 0) && (load_balancer_created)) - LBManagerObj()->set_avail_vector(bit_map); - - se_avail_vector = (char *)malloc(sizeof(char) * CkNumPes()); - LBManagerObj()->get_avail_vector(se_avail_vector); + realloc(bit_map); +#endif +} - numProcessAfterRestart = *((int *)(bit_map + CkNumPes())); +static void realloc_handler(char *msg) +{ +#if CMK_SHRINK_EXPAND + printf("Charm> Rescaling called!\n"); + int myPes = CkNumPes(); + shrinkExpandreplyToken = CcsDelayReply(); + msg += CmiMsgHeaderSizeBytes; + bool isExpand = *((bool *)msg); + int numPes = *((int *)(msg + sizeof(bool))); + printf("Charm> realloc_handler: isExpand=%d numPes=%d CkNumPes()=%d\n", isExpand, numPes, CkNumPes()); + + char* bit_map = (char *)malloc(CkNumPes() + 2 * sizeof(int)); + memcpy(bit_map, &numPes, sizeof(int)); + memcpy(&bit_map[sizeof(int)], &myPes, sizeof(int)); + char* start_bitmap = bit_map + 2 * sizeof(int); + + if (isExpand) + { + for (int i = 0; i < CkNumPes(); i++) { + start_bitmap[i] = 1; + } + } + else + { + for (int i = 0; i < CkNumPes(); i++) { + if (i < numPes) + start_bitmap[i] = 1; + else + start_bitmap[i] = 0; + } + } + + realloc(bit_map); #endif } +void rescale(char* bit_map) +{ + realloc(bit_map); +} + void manager_init(){ #if CMK_SHRINK_EXPAND static int inited = 0; willContinue = 0; if (inited) return; CcsRegisterHandler("set_bitmap", (CmiHandler) handler); + CcsRegisterHandler("realloc", (CmiHandler) realloc_handler); inited = 1; pending_realloc_state = NO_REALLOC; #endif diff --git a/src/ck-ldb/manager.h b/src/ck-ldb/manager.h index 
d2ffb69fba..473686ede4 100644 --- a/src/ck-ldb/manager.h +++ b/src/ck-ldb/manager.h @@ -13,6 +13,8 @@ void manager_init(void); +void rescale(char* bit_map); + #endif /*@}*/ diff --git a/src/ck-ldb/refine.h b/src/ck-ldb/refine.h index b0d24e68ec..b58628ffaa 100644 --- a/src/ck-ldb/refine.h +++ b/src/ck-ldb/refine.h @@ -38,6 +38,7 @@ class RefineA : public Strategy void solve(std::vector& objs, std::vector

& procs, S& solution, bool objsSorted) { + CkPrintf("Solving with RefineA strategy\n"); float M = calcGreedyMaxload(objs, procs, objsSorted); if (CkMyPe() == 0 && _lb_args.debug() > 0) CkPrintf("[%d] RefineA: greedy maxload is %f\n", CkMyPe(), M); @@ -73,7 +74,6 @@ class RefineA : public Strategy while (reldiff(lower, upper) > 1.01) { M = (lower + upper) / 2; - solutions.emplace_back(initialAssignment); std::unordered_map> proc_objs( proc_objs0); // real pe -> list of its objects @@ -104,13 +104,13 @@ class RefineA : public Strategy for (auto it = heavy_objs.begin(); it != heavy_objs.end(); it++) { O& o = *it; - if (lightest.getLoad() + o.getLoad() <= M) + if (lightest.getLoad() + (o.getLoad() / lightest.speed[0]) <= M) { heavy_processors.pop(); - heavy.load -= o.getLoad(); + heavy.load -= o.getLoad() / heavy.speed[0]; for (auto& light : light_processors) { - if (light.getLoad() + o.getLoad() <= M) + if (light.getLoad() + (o.getLoad() / light.speed[0]) <= M) { solutions.back().assign(o, light); lightH.remove(light); @@ -178,6 +178,7 @@ class RefineB : public Strategy void solve(std::vector& objs, std::vector

& procs, S& solution, bool objsSorted) { + CkPrintf("Solving with RefineB strategy\n"); float M = calcGreedyMaxload(objs, procs, objsSorted); if (CkMyPe() == 0 && _lb_args.debug() > 0) CkPrintf("[%d] RefineB: greedy maxload is %f\n", CkMyPe(), M); diff --git a/src/ck-perf/trace-projections.h b/src/ck-perf/trace-projections.h index add502f10c..015078f243 100644 --- a/src/ck-perf/trace-projections.h +++ b/src/ck-perf/trace-projections.h @@ -628,6 +628,7 @@ class toProjectionsGZFile : public PUP::er { gzFile f; protected: virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {} virtual void pup_buffer(void *&p,size_t n,size_t itemSize,dataType t); virtual void pup_buffer(void *&p,size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); public: diff --git a/src/conv-ccs/ccs-builtins.C b/src/conv-ccs/ccs-builtins.C index a2059a565b..b519a4ff3e 100644 --- a/src/conv-ccs/ccs-builtins.C +++ b/src/conv-ccs/ccs-builtins.C @@ -63,6 +63,14 @@ void CcsImpl_kill(void) SOCKET fd=skt_connect(killList->ip,killList->port,20); if (fd!=INVALID_SOCKET) { skt_sendN(fd,"die\n",strlen("die\n")+1); + + // Set SO_LINGER to ensure the "die" message is sent before we exit. + // This forces close() to block until the kernel has transmitted the data. 
+ struct linger linger_opt; + linger_opt.l_onoff = 1; // Enable linger + linger_opt.l_linger = 5; // Timeout in seconds + setsockopt(fd, SOL_SOCKET, SO_LINGER, &linger_opt, sizeof(linger_opt)); + skt_close(fd); } killList=killList->next; diff --git a/src/conv-ccs/ccs-builtins.h b/src/conv-ccs/ccs-builtins.h index f768dda7f3..ee3fee9729 100644 --- a/src/conv-ccs/ccs-builtins.h +++ b/src/conv-ccs/ccs-builtins.h @@ -35,6 +35,8 @@ class PUP_fmt : public PUP::wrap_er { virtual void comment(const char *message); virtual void synchronize(unsigned int m); virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} + virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} virtual void pup_buffer(void *&p,size_t n,size_t itemSize,PUP::dataType t); virtual void pup_buffer(void *&p,size_t n, size_t itemSize, PUP::dataType t, std::function allocate, std::function deallocate); }; diff --git a/src/conv-ccs/conv-ccs.C b/src/conv-ccs/conv-ccs.C index d191fe35a1..24f4022055 100644 --- a/src/conv-ccs/conv-ccs.C +++ b/src/conv-ccs/conv-ccs.C @@ -8,6 +8,7 @@ #include "ccs-server.h" #include "sockRoutines.h" #include "queueing.h" +#include #ifdef _WIN32 # include @@ -232,6 +233,35 @@ void CcsSendDelayedReply(CcsDelayedReply d,int replyLen, const void *replyData) free(h); } +void CcsSendDelayedReplyAndTerm(CcsDelayedReply d, int replyLen, const void *replyData) +{ + CcsImplHeader *h = d.hdr; + int fd = ChMessageInt(h->replyFd); + + // 1. Send the reply length, then the payload. NOTE(review): the length goes out as a raw host-order int while h->len holds the ChMessageInt form and is otherwise unused - confirm which one the peer expects. + h->len = ChMessageInt_new(replyLen); + skt_sendN(fd, &replyLen, sizeof(int)); + if (replyLen > 0) { + skt_sendN(fd, replyData, replyLen); + } + + // 2. Perform a synchronous close to ensure data is sent before returning. + // shutdown() tells the kernel to send all buffered data, then a FIN packet. + shutdown(fd, SHUT_WR); + + // 3. Wait for the peer (charmrun) to close its side. 
Keep draining until + // recv() reports EOF (0) or an error (<0): a single recv() can return as + // soon as any bytes arrive, which is not yet the peer's close. + char dummy_buffer[32]; + while (recv(fd, dummy_buffer, sizeof(dummy_buffer), 0) > 0) {} + + // 4. Now that the handshake is complete, we can safely close our end. + skt_close(fd); + + // 5. Free the handle resource. + free(h); +} + void CcsNoReply(void) { if (CpvAccess(ccsReq)==NULL) return; diff --git a/src/conv-ccs/conv-ccs.h b/src/conv-ccs/conv-ccs.h index 451c42b31d..1ffd449bd4 100644 --- a/src/conv-ccs/conv-ccs.h +++ b/src/conv-ccs/conv-ccs.h @@ -94,6 +94,18 @@ void CcsSendReply(int replyLen, const void *replyData); void CcsSendReplyNoError(int replyLen, const void *replyData); CcsDelayedReply CcsDelayReply(void); void CcsSendDelayedReply(CcsDelayedReply d,int replyLen, const void *replyData); + +/** + * Send a delayed reply and then perform a synchronous close on the socket. + * This function blocks until the peer has acknowledged receipt of the data + * by closing its end of the connection. This is intended for final replies + * before program exit to prevent race conditions. + */ +void CcsSendDelayedReplyAndTerm(CcsDelayedReply d, int replyLen, const void *replyData); + +/** + Send an empty reply for a request that was previously delayed. 
+*/ void CcsNoReply(); void CcsNoDelayedReply(CcsDelayedReply d); diff --git a/src/conv-core/conv-config.h b/src/conv-core/conv-config.h index 51aebc5bf7..cb0af5635b 100644 --- a/src/conv-core/conv-config.h +++ b/src/conv-core/conv-config.h @@ -123,6 +123,10 @@ #define CMK_CUDA 0 #endif +#if !defined(CMK_HIP) +#define CMK_HIP 0 +#endif + #ifndef CMI_QD #define CMI_QD (CMK_REPLAYSYSTEM) #endif diff --git a/src/conv-core/conv-rdma.h b/src/conv-core/conv-rdma.h index f65e080ad9..efa99d78f0 100644 --- a/src/conv-core/conv-rdma.h +++ b/src/conv-core/conv-rdma.h @@ -3,54 +3,62 @@ #include "cmirdmautils.h" #include "pup.h" +#include -/*********************************** Zerocopy Direct API **********************************/ +// User specified configuration +// TODO: move to a better location +extern bool CmiUseCopyBasedRDMA; + +// LCI layer definition +#define CMK_REG_REQUIRED 1 +// 8-byte for mr, 16-byte for rmr +// TODO: better to use dynamic allocation and PUP +#define CMK_NOCOPY_DIRECT_BYTES 24 + +/*********************************** Zerocopy Direct API + * **********************************/ typedef void (*RdmaAckCallerFn)(void *token); /* Support for Direct API */ void CmiSetRdmaCommonInfo(void *info, const void *ptr, int size); int CmiGetRdmaCommonInfoSize(void); -void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size, unsigned short int mode); +void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size, + unsigned short int mode); // Function to set the ack handler for the Direct API void CmiSetDirectNcpyAckHandler(RdmaAckCallerFn fn); -/* CmiIssueRget initiates an RDMA read operation, transferring 'size' bytes of data from the address space of 'srcPe' to local address, 'destAddr'. - * When the runtime invokes srcAck on the source (target), it indicates safety to overwrite or free the srcAddr buffer. 
- * When the runtime invokes destAck on the destination (initiator), it indicates that the data has been successfully received in the - * destAddr buffer. +/* CmiIssueRget initiates an RDMA read operation, transferring 'size' bytes of + * data from the address space of 'srcPe' to local address, 'destAddr'. When the + * runtime invokes srcAck on the source (target), it indicates safety to + * overwrite or free the srcAddr buffer. When the runtime invokes destAck on the + * destination (initiator), it indicates that the data has been successfully + * received in the destAddr buffer. */ void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo); -/* CmiIssueRput initiates an RDMA write operation, transferring 'size' bytes of data from the local address, 'srcAddr' to the address space of 'destPe'. - * When the runtime invokes srcAck on the source (initiator), it indicates safety to overwrite or free the srcAddr buffer. - * When the runtime invokes destAck on the destination (target), it indicates that the data has been successfully received in the - * destAddr buffer. +/* CmiIssueRput initiates an RDMA write operation, transferring 'size' bytes of + * data from the local address, 'srcAddr' to the address space of 'destPe'. When + * the runtime invokes srcAck on the source (initiator), it indicates safety to + * overwrite or free the srcAddr buffer. When the runtime invokes destAck on the + * destination (target), it indicates that the data has been successfully + * received in the destAddr buffer. 
*/ void CmiIssueRput(NcpyOperationInfo *ncpyOpInfo); -void CmiDeregisterMem(const void *ptr, void *info, int pe, unsigned short int mode); +void CmiDeregisterMem(const void *ptr, void *info, int pe, + unsigned short int mode); #if CMK_USE_CMA -void CmiIssueRgetUsingCMA( - const void* srcAddr, - void *srcInfo, - int srcPe, - const void* destAddr, - void *destInfo, - int destPe, - size_t size); - -void CmiIssueRputUsingCMA( - const void* destAddr, - void *destInfo, - int destPe, - const void* srcAddr, - void *srcInfo, - int srcPe, - size_t size); +void CmiIssueRgetUsingCMA(const void *srcAddr, void *srcInfo, int srcPe, + const void *destAddr, void *destInfo, int destPe, + size_t size); + +void CmiIssueRputUsingCMA(const void *destAddr, void *destInfo, int destPe, + const void *srcAddr, void *srcInfo, int srcPe, + size_t size); #endif // Allocation from pool @@ -82,22 +90,25 @@ void CmiSetNcpyAckSize(int ackSize); #endif // Represents the mode of host-side zerocopy transfer -// CkNcpyMode::MEMCPY indicates that the PEs are on the logical node and memcpy can be used -// CkNcpyMode::CMA indicates that the PEs are on the same physical node and CMA can be used -// CkNcpyMode::RDMA indicates that the neither MEMCPY or CMA can be used and REMOTE Direct Memory Access needs to be used +// CkNcpyMode::MEMCPY indicates that the PEs are on the logical node and memcpy +// can be used CkNcpyMode::CMA indicates that the PEs are on the same physical +// node and CMA can be used CkNcpyMode::RDMA indicates that the neither MEMCPY +// or CMA can be used and REMOTE Direct Memory Access needs to be used enum class CmiNcpyMode : char { MEMCPY, CMA, RDMA }; -// Represents the completion status of the zerocopy transfer (used as a return value for CkNcpyBuffer::get & CkNcpyBuffer:::put) -// CMA and MEMCPY transfers complete instantly and return CkNcpyStatus::complete -// RDMA transfers use a remote asynchronous call and hence return CkNcpyStatus::incomplete +// Represents the completion 
status of the zerocopy transfer (used as a return +// value for CkNcpyBuffer::get & CkNcpyBuffer:::put) CMA and MEMCPY transfers +// complete instantly and return CkNcpyStatus::complete RDMA transfers use a +// remote asynchronous call and hence return CkNcpyStatus::incomplete enum class CmiNcpyStatus : char { incomplete, complete }; // Represents the remote handler tag that should be invoked // ncpyHandlerIdx::EM_ACK tag is used to remotely invoke CkRdmaEMAckHandler -// ncpyHandlerIdx::BCAST_ACK tag is used to remotely invoke CkRdmaEMBcastAckHandler -// ncpyHandlerIdx::BCAST_POST_ACK is used to remotely invoke CkRdmaEMBcastPostAckHandler -// ncpyHandlerIdx::CMA_DEREG_ACK is used to remotely invoke CkRdmaEMDeregAndAckHandler -enum class ncpyHandlerIdx: char { +// ncpyHandlerIdx::BCAST_ACK tag is used to remotely invoke +// CkRdmaEMBcastAckHandler ncpyHandlerIdx::BCAST_POST_ACK is used to remotely +// invoke CkRdmaEMBcastPostAckHandler ncpyHandlerIdx::CMA_DEREG_ACK is used to +// remotely invoke CkRdmaEMDeregAndAckHandler +enum class ncpyHandlerIdx : char { EM_ACK, BCAST_ACK, BCAST_POST_ACK, @@ -109,21 +120,20 @@ enum class ncpyHandlerIdx: char { class CmiNcpyBuffer { - //private: - public: - + // private: +public: // bool to indicate registration for current values of ptr and cnt on pe bool isRegistered; - // machine specific information about the buffer - #if defined(__GNUC__) || defined(__clang__) - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wpedantic" - #endif +// machine specific information about the buffer +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif char layerInfo[CMK_COMMON_NOCOPY_DIRECT_BYTES + CMK_NOCOPY_DIRECT_BYTES]; - #if defined(__GNUC__) || defined(__clang__) - #pragma GCC diagnostic pop - #endif +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif #if CMK_ERROR_CHECKING void checkRegModeIsValid() { @@ -158,20 +168,38 @@ 
class CmiNcpyBuffer { // ack handling pointer used for bcast and CMA p2p transfers const void *refAckInfo; - CmiNcpyBuffer() : isRegistered(false), ptr(NULL), cnt(0), pe(-1), regMode(CMK_BUFFER_REG), deregMode(CMK_BUFFER_DEREG), ref(NULL), refAckInfo(NULL) {} + // ipc specific; nullptr unless a constructor caller supplies it + void* deviceRdmaOpInfo; - explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_, unsigned short int regMode_=CMK_BUFFER_REG, unsigned short int deregMode_=CMK_BUFFER_DEREG) { + CmiNcpyBuffer() + : isRegistered(false), ptr(NULL), cnt(0), pe(-1), regMode(CMK_BUFFER_REG), + deregMode(CMK_BUFFER_DEREG), ref(NULL), refAckInfo(NULL), deviceRdmaOpInfo(nullptr) {} + + explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_, + unsigned short int regMode_ = CMK_BUFFER_REG, + unsigned short int deregMode_ = CMK_BUFFER_DEREG) : deviceRdmaOpInfo(nullptr) { + init(ptr_, cnt_, regMode_, deregMode_); + } + + explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_, void* deviceRdmaOpInfo_, + unsigned short int regMode_ = CMK_BUFFER_REG, + unsigned short int deregMode_ = CMK_BUFFER_DEREG) : deviceRdmaOpInfo(deviceRdmaOpInfo_) { init(ptr_, cnt_, regMode_, deregMode_); } void print() { - CmiPrintf("[%d][%d][%d] CmiNcpyBuffer print: ptr:%p, size:%zu, pe:%d, regMode=%d, deregMode=%d, ref:%p, refAckInfo:%p\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), ptr, cnt, pe, regMode, deregMode, ref, refAckInfo); + CmiPrintf("[%d][%d][%d] CmiNcpyBuffer print: ptr:%p, size:%zu, pe:%d, " + "regMode=%d, deregMode=%d, ref:%p, refAckInfo:%p\n", + CmiMyPe(), CmiMyNode(), CmiMyRank(), ptr, cnt, pe, regMode, + deregMode, ref, refAckInfo); } - void init(const void *ptr_, size_t cnt_, unsigned short int regMode_=CMK_BUFFER_REG, unsigned short int deregMode_=CMK_BUFFER_DEREG) { - ptr = ptr_; - cnt = cnt_; - pe = CmiMyPe(); + void init(const void *ptr_, size_t cnt_, + unsigned short int regMode_ = CMK_BUFFER_REG, + unsigned short int deregMode_ = CMK_BUFFER_DEREG) { + ptr = ptr_; + cnt = cnt_; + pe = CmiMyPe(); regMode = regMode_; deregMode = deregMode_; @@ -189,36 +217,33 
@@ class CmiNcpyBuffer { refAckInfo = NULL; // Register memory everytime new values are initialized - if(cnt > 0) + if (cnt > 0) registerMem(); } - void setRef(const void *ref_) { - ref = ref_; - } + void setRef(const void *ref_) { ref = ref_; } - const void *getRef() { - return ref; - } + const void *getRef() { return ref; } // Register(Pin) the memory for the buffer - void registerMem() - { + void registerMem() { // Check that this object is local when registerMem is called CmiAssert(CmiNodeOf(pe) == CmiMyNode()); // Set machine layer information when regMode is not CMK_BUFFER_NOREG - if(regMode != CMK_BUFFER_NOREG) { + if (regMode != CMK_BUFFER_NOREG) { CmiSetRdmaCommonInfo(&layerInfo[0], ptr, cnt); - /* Set the pointer layerInfo unconditionally for layers that don't require pinning (MPI, PAMI) - * or if regMode is REG, PREREG on layers that require pinning (GNI, Verbs, OFI, UCX) */ + /* Set the pointer layerInfo unconditionally for layers that don't require + * pinning (MPI, PAMI) or if regMode is REG, PREREG on layers that require + * pinning (GNI, Verbs, OFI, UCX) */ #if CMK_REG_REQUIRED - if(regMode == CMK_BUFFER_REG || regMode == CMK_BUFFER_PREREG) + if (regMode == CMK_BUFFER_REG || regMode == CMK_BUFFER_PREREG) #endif { - CmiSetRdmaBufferInfo(layerInfo + CmiGetRdmaCommonInfoSize(), ptr, cnt, regMode); + CmiSetRdmaBufferInfo(layerInfo + CmiGetRdmaCommonInfoSize(), ptr, cnt, + regMode); isRegistered = true; } } @@ -231,12 +256,13 @@ class CmiNcpyBuffer { // Check that this object is local when deregisterMem is called CmiAssert(CmiNodeOf(pe) == CmiMyNode()); - if(isRegistered == false) + if (isRegistered == false) return; #if CMK_REG_REQUIRED - if(regMode != CMK_BUFFER_NOREG) { - CmiDeregisterMem(ptr, layerInfo + CmiGetRdmaCommonInfoSize(), pe, regMode); + if (regMode != CMK_BUFFER_NOREG) { + CmiDeregisterMem(ptr, layerInfo + CmiGetRdmaCommonInfoSize(), pe, + regMode); isRegistered = false; } #endif @@ -246,6 +272,7 @@ class CmiNcpyBuffer { p((char *)&ptr, 
sizeof(ptr)); p((char *)&ref, sizeof(ref)); p((char *)&refAckInfo, sizeof(refAckInfo)); + p((char *)&deviceRdmaOpInfo, sizeof(deviceRdmaOpInfo)); p|cnt; p|pe; p|regMode; @@ -262,44 +289,48 @@ class CmiNcpyBuffer { void cmaPut(CmiNcpyBuffer &destination); #endif - NcpyOperationInfo *createNcpyOpInfo(CmiNcpyBuffer &source, CmiNcpyBuffer &destination, int ackSize, char *srcAck, char *destAck, int rootNode, int opMode, void *refPtr); + NcpyOperationInfo *createNcpyOpInfo(CmiNcpyBuffer &source, + CmiNcpyBuffer &destination, int ackSize, + char *srcAck, char *destAck, int rootNode, + int opMode, void *refPtr); void rdmaGet(CmiNcpyBuffer &source, int ackSize, char *srcAck, char *destAck); - void rdmaPut(CmiNcpyBuffer &destination, int ackSize, char *srcAck, char *destAck); + void rdmaPut(CmiNcpyBuffer &destination, int ackSize, char *srcAck, + char *destAck); friend inline void deregisterBuffer(CmiNcpyBuffer &buffInfo); - - }; /***************************** Other Util *********************************/ void invokeZCPupHandler(void *ref, int pe); inline void deregisterBuffer(CmiNcpyBuffer &buffInfo) { - CmiDeregisterMem(buffInfo.ptr, buffInfo.layerInfo + CmiGetRdmaCommonInfoSize(), buffInfo.pe, buffInfo.regMode); + CmiDeregisterMem(buffInfo.ptr, + buffInfo.layerInfo + CmiGetRdmaCommonInfoSize(), buffInfo.pe, + buffInfo.regMode); buffInfo.isRegistered = false; } CmiNcpyMode findTransferMode(int srcPe, int destPe); CmiNcpyMode findTransferModeWithNodes(int srcNode, int destNode); - // Converse message to invoke the Ncpy handler on a remote process -struct ncpyHandlerMsg{ +struct ncpyHandlerMsg { char cmicore[CmiMsgHeaderSizeBytes]; ncpyHandlerIdx opMode; void *ref; }; -struct zcPupSourceInfo{ +struct zcPupSourceInfo { CmiNcpyBuffer src; - std::function deallocate; + std::function deallocate; }; void zcPupDone(void *ref); void zcPupHandler(ncpyHandlerMsg *msg); zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src); -zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src, 
std::function deallocate); +zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src, + std::function deallocate); void zcPupGet(CmiNcpyBuffer &src, CmiNcpyBuffer &dest); diff --git a/src/conv-core/conv-rdmadevice.C b/src/conv-core/conv-rdmadevice.C index 2d695fc31a..22c2a00d91 100644 --- a/src/conv-core/conv-rdmadevice.C +++ b/src/conv-core/conv-rdmadevice.C @@ -1,7 +1,8 @@ #include "converse.h" #include "conv-rdmadevice.h" +#include "ck.h" -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) { CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes())); @@ -10,12 +11,10 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) { if (CmiNodeOf(srcPe) == CmiNodeOf(dstPe)) { // Same logical node return CmiNcpyModeDevice::MEMCPY; - } - else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) { + } else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) { // Different logical nodes, same physical node return CmiNcpyModeDevice::IPC; - } - else { + } else { // Different physical nodes, requires GPUDirect RDMA return CmiNcpyModeDevice::RDMA; } @@ -24,8 +23,8 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) { #if CMK_GPU_COMM #include "machine-rdma.h" -void CmiSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag) { - LrtsSendDevice(dest_pe, ptr, size, tag); +void CmiSendDevice(int dest_rank, int src_rank, const void*& ptr, size_t size, uint64_t& tag) { + LrtsSendDevice(dest_rank, src_rank, ptr, size, tag); } void CmiRecvDevice(DeviceRdmaOp* op, DeviceRecvType type) { @@ -40,6 +39,7 @@ void CmiRdmaDeviceRecvInit(RdmaAckHandlerFn fn) { } void CmiInvokeRecvHandler(void* data) { + QdProcess(1); rdmaDeviceRecvHandlerFn(data); } #endif // CMK_GPU_COMM diff --git a/src/conv-core/conv-rdmadevice.h b/src/conv-core/conv-rdmadevice.h index 88e5d3b1b4..d9bd75a6f4 100644 --- a/src/conv-core/conv-rdmadevice.h +++ b/src/conv-core/conv-rdmadevice.h @@ -5,12 +5,15 @@ #include "converse.h" #include "cmirdmautils.h" #include "pup.h" 
+#include "conv-rdma.h" -#if CMK_CUDA -#include +#define CMK_GPU_COMM 1 + +#if CMK_CUDA || CMK_HIP +#include "hapi_portable.h" // Represents the mode of device-side zerocopy transfer -// MEMCPY indicates that the PEs are on the same logical node and cudaMemcpyDeviceToDevice can be used +// MEMCPY indicates that the PEs are on the same logical node and hapiMemcpyDeviceToDevice can be used // IPC indicates that the PEs are on different logical nodes within the same physical node and CUDA IPC can be used // RDMA indicates that the PEs are on different physical nodes and requires GPUDirect RDMA enum class CmiNcpyModeDevice : char { MEMCPY, IPC, RDMA }; @@ -23,12 +26,13 @@ class CmiDeviceBuffer { // Pointer to and size of the buffer const void* ptr; size_t cnt; - cudaStream_t cuda_stream; + hapiStream_t hapi_stream; -#if !CMK_GPU_COMM // Source and destination PEs int src_pe; + int src_mpi_rank = -1; int dest_pe; + int dest_mpi_rank = -1; // Used for CUDA IPC int device_idx; @@ -39,32 +43,28 @@ class CmiDeviceBuffer { bool data_stored; void* data; + CmiNcpyBuffer lci_ncpy_buffer; + CmiDeviceBuffer() : ptr(NULL), cnt(0), src_pe(-1), dest_pe(-1) { init(); } explicit CmiDeviceBuffer(const void* ptr_, size_t cnt_) : ptr(ptr_), cnt(cnt_), - src_pe(CmiMyPe()), dest_pe(-1) { init(); } + src_pe(CmiMyPe()), src_mpi_rank(CmiNodeOf(CmiMyPe())), dest_pe(-1), dest_mpi_rank(-1) { init(); } void init() { device_idx = -1; comm_offset = 0; event_idx = -1; - cuda_stream = cudaStreamPerThread; + hapi_stream = hapiStreamPerThread; data_stored = false; data = NULL; } -#else - uint64_t tag; - - CmiDeviceBuffer() : ptr(NULL), cnt(0) {} - explicit CmiDeviceBuffer(const void* ptr_, size_t cnt_) : ptr(ptr_), cnt(cnt_) {} -#endif // CMK_GPU_COMM + uint64_t tag = 0; void pup(PUP::er &p) { p((char *)&ptr, sizeof(ptr)); p|cnt; -#if !CMK_GPU_COMM p|src_pe; p|dest_pe; p|device_idx; @@ -73,18 +73,21 @@ class CmiDeviceBuffer { p|data_stored; if (data_stored) { if (p.isUnpacking()) { - cudaMallocHost(&data, cnt); + 
hapiMallocHost(&data, cnt); } PUParray(p, (char*)data, cnt); } -#else p|tag; -#endif // CMK_GPU_COMM + // src_pe and dest_pe are already packed above; only the fields added + // alongside them are serialized here. + p|src_mpi_rank; + p|dest_mpi_rank; + p|lci_ncpy_buffer; } ~CmiDeviceBuffer() { #if !CMK_GPU_COMM - if (data) cudaFreeHost(data); + if (data) hapiFreeHost(data); #endif } }; @@ -94,7 +97,7 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int destPe); #if CMK_GPU_COMM typedef void (*RdmaAckCallerFn)(void *token); -void CmiSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag); +void CmiSendDevice(int dest_rank, int src_rank, const void*& ptr, size_t size, uint64_t& tag); void CmiRecvDevice(DeviceRdmaOp* op, DeviceRecvType type); void CmiRdmaDeviceRecvInit(RdmaAckCallerFn fn); void CmiInvokeRecvHandler(void* data); diff --git a/src/conv-core/cpuaffinity.C b/src/conv-core/cpuaffinity.C index 27267a7262..542fc00102 100644 --- a/src/conv-core/cpuaffinity.C +++ b/src/conv-core/cpuaffinity.C @@ -833,9 +833,8 @@ void CmiCheckAffinity(void) cpu_set_t my_aff; if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n"); CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0) - cpuAffSyncWait(cpuPhyAffCheckDone); - + #if CMK_SMP && !CMK_SMP_NO_COMMTHD CmiNodeBarrier(); diff --git a/src/libs/ck-libs/ampi/ampi.C b/src/libs/ck-libs/ampi/ampi.C index 621cfcbe37..01d999ef62 100644 --- a/src/libs/ck-libs/ampi/ampi.C +++ b/src/libs/ck-libs/ampi/ampi.C @@ -1430,7 +1430,7 @@ void ampiParent::pup(PUP::er &p) noexcept { case AMPI_G_REQ: blockingReq = new GReq; break; -#if CMK_CUDA +#if CMK_CUDA case AMPI_GPU_REQ: CkAbort("AMPI> error trying to PUP a non-migratable GPU request!"); break; @@ -11720,7 +11720,7 @@ int AMPI_GPU_Iinvoke_wr(hapiWorkRequest *to_call, MPI_Request *request) /* Submit GPU request that will be notified of completion once the previous * operations in the given CUDA stream are complete */ CLINKAGE -int AMPI_GPU_Iinvoke(cudaStream_t stream, +int AMPI_GPU_Iinvoke(hapiStream_t stream, 
MPI_Request *request) { AMPI_API("AMPI_GPU_Iinvoke", stream, request); @@ -11748,7 +11748,7 @@ int AMPI_GPU_Invoke_wr(hapiWorkRequest *to_call) } CLINKAGE -int AMPI_GPU_Invoke(cudaStream_t stream) +int AMPI_GPU_Invoke(hapiStream_t stream) { AMPI_API("AMPI_GPU_Invoke", stream); diff --git a/src/libs/ck-libs/ampi/ampi_functions.h b/src/libs/ck-libs/ampi/ampi_functions.h index 48b2bd6e22..43c8a3512a 100644 --- a/src/libs/ck-libs/ampi/ampi_functions.h +++ b/src/libs/ck-libs/ampi/ampi_functions.h @@ -23,7 +23,7 @@ # error You must define AMPI_CUSTOM_FUNC before including this file! #endif -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP #include "hapi_functions.h" #endif @@ -607,11 +607,11 @@ AMPI_CUSTOM_FUNC(int, AMPI_Alltoall_long, void *sendbuf, int sendcount, MPI_Data #ifdef __cplusplus -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke_wr, hapiWorkRequest *to_call, MPI_Request *request) -AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke, cudaStream_t stream, MPI_Request *request) +AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke, hapiStream_t stream, MPI_Request *request) AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke_wr, hapiWorkRequest *to_call) -AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke, cudaStream_t stream) +AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke, hapiStream_t stream) #endif #endif diff --git a/src/scripts/Make.depends b/src/scripts/Make.depends index 8d5c014cf6..a73bc09427 100644 --- a/src/scripts/Make.depends +++ b/src/scripts/Make.depends @@ -169,7 +169,7 @@ EveryLB.o: EveryLB.C LBManager.h LBDatabase.h lbdb.h converse.h \ PathHistory.decl.h ckcallback-ccs.h CkCallback.decl.h BaseLB.decl.h \ EveryLB.decl.h charm++.h envelope.h sdag.h TreeLB.decl.h idm.h \ BaseLB.decl.h DistributedLB.decl.h DistBaseLB.decl.h LBManager.decl.h \ - MetisLB.decl.h CentralLB.decl.h CentralLBMsg.h RecBipartLB.decl.h \ + MetisLB.decl.h GreedyCentralLB.decl.h CentralLB.decl.h CentralLBMsg.h RecBipartLB.decl.h \ EveryLB.def.h HybridBaseLB.o: HybridBaseLB.C HybridBaseLB.h charm++.h charm.h \ @@ -394,6 
+394,30 @@ MetisLB.o: MetisLB.C MetisLB.h CentralLB.h BaseLB.h LBManager.h \ CkCallback.decl.h BaseLB.decl.h CentralLB.decl.h CentralLBMsg.h \ manager.h MetisLB.decl.h ckgraph.h MetisLB.def.h + GreedyCentralLB.o: GreedyCentralLB.C GreedyCentralLB.h CentralLB.h BaseLB.h LBManager.h \ + LBDatabase.h lbdb.h converse.h conv-header.h conv-config.h \ + conv-autoconfig.h conv-common.h conv-mach-common.h conv-mach.h \ + conv-mach-opt.h lrts-common.h cmiqueue.h pup_c.h pup_c_functions.h \ + lrtslock.h queueing.h conv-cpm.h conv-cpath.h conv-qd.h conv-random.h \ + conv-lists.h conv-trace.h persistent.h cmirdmautils.h debug-conv.h \ + charm.h conv-rdma.h pup.h middle.h middle-conv.h LBObj.h LBOM.h LBComm.h \ + LBMachineUtil.h json_fwd.hpp LBManager.decl.h charm++.h cklists.h \ + pup_stl.h conv-config.h ckbitvector.h ckstream.h init.h charm-api.h \ + ckhashtable.h ckrdma.h envelope.h pup.h charm.h middle.h cklists.h \ + objid.h charm.h converse.h pup.h ckcallback.h cksection.h ckarrayindex.h \ + objid.h conv-ccs.h sockRoutines.h ccs-server.h register.h debug-charm.h \ + debug-conv++.h simd.h ckmessage.h CkMarshall.decl.h sdag.h pup_stl.h \ + envelope.h debug-charm.h ckrdmadevice.h conv-rdmadevice.h ckobjQ.h \ + ckreduction.h CkReduction.decl.h ckmemcheckpoint.h \ + CkMemCheckpoint.decl.h readonly.h ckarray.h cklocation.h MetaBalancer.h \ + RandomForestModel.h MetaBalancer.decl.h CkLocation.decl.h \ + ckarrayoptions.h ckmulticast.h CkMulticast.decl.h cklocrec.h \ + ckmigratable.h CkArray.decl.h ckfutures.h CkFutures.decl.h waitqd.h \ + waitqd.decl.h ckcheckpoint.h ckcallback.h CkCheckpointStatus.decl.h \ + trace.h pathHistory.h PathHistory.decl.h ckcallback-ccs.h \ + CkCallback.decl.h BaseLB.decl.h CentralLB.decl.h CentralLBMsg.h \ + manager.h GreedyCentralLB.decl.h ckgraph.h GreedyCentralLB.def.h + RecBipartLB.o: RecBipartLB.C RecBipartLB.h CentralLB.h BaseLB.h \ LBManager.h LBDatabase.h lbdb.h converse.h conv-header.h conv-config.h \ conv-autoconfig.h conv-common.h 
conv-mach-common.h conv-mach.h \ diff --git a/src/scripts/charmc b/src/scripts/charmc index 7863c144ae..aab371b07d 100755 --- a/src/scripts/charmc +++ b/src/scripts/charmc @@ -1614,7 +1614,7 @@ modInitObj="$modInitName.o" MAKE_LDXX="0" MAKE_LD="0" -CORE_LIBS=(-lreconverse -lcharm_cxx_utils "${TRACE_OBJ[@]}" -lm) +CORE_LIBS=(-lreconverse -lcharm_cxx_utils "${TRACE_OBJ[@]}" -lm -lckrescale) if [[ "$BUILD_SHARE" = '0' && "$USER_INITIATED_SHARED" = '0' ]] then @@ -2297,14 +2297,41 @@ fi if [[ "$COPY_CHARMRUN" = 'true' ]] then - targ="charmrun$CMK_POST_EXE" - [[ ! -x "$CHARMBIN/$targ" && -n "$CMK_POST_EXE" ]] && targ=charmrun + targ="charmrun$CMK_POST_EXE" + [[ ! -x "$CHARMBIN/$targ" && -n "$CMK_POST_EXE" ]] && targ=charmrun - if [[ -x "$CHARMBIN/$targ" ]] - then + if [[ -x "$CHARMBIN/$targ" ]] + then DoNoErrCheck $RM "$targ" DoNoErrCheck $CP "$CHARMBIN/$targ" "$targ" 2> /dev/null - fi + fi + + targ_elastic="charmrun_elastic$CMK_POST_EXE" + [[ ! -x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=charmrun_elastic + + if [[ -x "$CHARMBIN/$targ_elastic" ]] + then + DoNoErrCheck $RM "$targ_elastic" + DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null + fi + + targ_elastic="charmrun_hapi$CMK_POST_EXE" + [[ ! -x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=charmrun_hapi + + if [[ -x "$CHARMBIN/$targ_elastic" ]] + then + DoNoErrCheck $RM "$targ_elastic" + DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null + fi + + targ_elastic="hapi_memory_daemon$CMK_POST_EXE" + [[ ! 
-x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=hapi_memory_daemon + + if [[ -x "$CHARMBIN/$targ_elastic" ]] + then + DoNoErrCheck $RM "$targ_elastic" + DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null + fi fi [[ -z "$SKIP_MODULEINIT" && -z "$SAVE" ]] && DoNoErrCheck $RM "$modInitSrc" "$modInitObj" > /dev/null 2>&1 diff --git a/src/util/charmrun-src/CMakeLists.txt b/src/util/charmrun-src/CMakeLists.txt index 95a3ccf389..f0f6e96d20 100644 --- a/src/util/charmrun-src/CMakeLists.txt +++ b/src/util/charmrun-src/CMakeLists.txt @@ -8,7 +8,7 @@ target_link_libraries(charmd_faceless PRIVATE -seq) add_executable(charmrun charmrun.C) target_compile_options(charmrun PRIVATE -seq -DCMK_NOT_USE_CONVERSE=1) -target_link_libraries(charmrun PRIVATE -seq) +target_link_libraries(charmrun PRIVATE -seq ckrescale) target_include_directories(charmrun PRIVATE ../../conv-ccs ..) # for ccs-auth.c sockRoutines.c diff --git a/src/util/charmrun-src/Makefile b/src/util/charmrun-src/Makefile index 2fbc3b4bf6..cf80f1c5db 100644 --- a/src/util/charmrun-src/Makefile +++ b/src/util/charmrun-src/Makefile @@ -5,13 +5,13 @@ SHELL=/bin/sh INCLUDED=../conv-mach.h ../conv-mach-opt.h \ ../ccs-server.C ../ccs-server.h ../ccs-auth.C ../ccs-auth.h \ - ../sockRoutines.C ../sockRoutines.h + ../sockRoutines.C ../sockRoutines.h ../ckcheckpoint.C ../ckcheckpoint.h all: charmrun charmd charmd_faceless -charmrun: charmrun.C $(INCLUDED) +charmrun: charmrun.C $(INCLUDED) ck.o $(CHARMC) -c -seq -DCMK_NOT_USE_CONVERSE=1 charmrun.C -o charmrun.o - $(CHARMC) -cp $(BIN) -seq -language c++ -o charmrun charmrun.o + $(CHARMC) -cp $(BIN) -seq -language c++ -o charmrun charmrun.o ck.o charmd: daemon.C daemon.h ../sockRoutines-seq.o $(CHARMC) -seq -c daemon.C -o daemon.o diff --git a/src/util/charmrun-src/charmrun.C b/src/util/charmrun-src/charmrun.C index 9403096137..79ecf20efa 100644 --- a/src/util/charmrun-src/charmrun.C +++ b/src/util/charmrun-src/charmrun.C @@ -1,4 +1,5 @@ 
#include "converse.h" +#include "ckrescale.h" #include "sockRoutines.h" #include "sockRoutines.C" @@ -25,6 +26,7 @@ #include #include +#include #include #include #include @@ -32,6 +34,12 @@ #include #include +#include +#include +#include +#include + + #if defined(_WIN32) /*Win32 has screwy names for the standard UNIX calls:*/ #define getcwd _getcwd @@ -360,6 +368,8 @@ static char *getenv_display_no_tamper() static unsigned int server_port; static char server_addr[1024]; /* IP address or hostname of charmrun*/ static SOCKET server_fd; + +static std::unordered_set node_set; /***************************************************************************** * * * PPARAM - obtaining "program parameters" from the user. * @@ -745,8 +755,8 @@ static char **saved_argv; static int saved_argc; static int arg_realloc_pes; static int arg_old_pes; -static int arg_shrinkexpand; static int arg_charmrun_port; +static int arg_shrinkexpand; static const char *arg_shrinkexpand_basedir; #endif @@ -779,7 +789,6 @@ static int arg_server_port = 0; static const char *arg_server_auth = NULL; static int replay_single = 0; - struct TopologyRequest { int host, socket, core, pu; @@ -817,6 +826,93 @@ TopologyRequest proc_per; TopologyRequest onewth_per; int auto_provision; +void print_nodelist(){ /* Echo the contents of /app/hostfile to stdout. */ + FILE *f=fopen("/app/hostfile","r"); + int c; /* int, not char: fgetc() returns int so EOF is distinguishable from every byte value */ + if (f == NULL) /* hostfile missing or unreadable: nothing to print */ + return; + while ((c = fgetc(f)) != EOF) { + putchar(c); + } + fclose(f); +} + +int count_num_slots() +{ + std::ifstream infile("/etc/mpi/hostfile"); + std::string sLine; + + std::regex rgx("host (.*)-worker-(\\d+)\\.(.*) \\+\\+cpus (\\d+)"); + std::smatch match; + int total_slots = 0; + + printf("Counting slots in hostfile\n"); + + while(getline(infile, sLine)) + { + if (std::regex_search(sLine, match, rgx)) + { + total_slots += std::stoi(match[4]); + } + else + { + printf("Error parsing hostfile regex\n"); + return 0; + } + } + printf("Total slots = %d\n", total_slots); + std::cout << std::flush; + return total_slots; +} + +void 
wait_hostfile(int numProcs) +{ + int i = 0; + while (count_num_slots() != numProcs) + { + sleep(1 << i++); + } +} + +void write_hostfile(int numProcesses) +{ + std::ifstream infile("/etc/mpi/hostfile"); + std::string sLine; + getline(infile, sLine); + printf("Line = %s\n", sLine.c_str()); + std::cout << std::flush; + std::regex rgx("host (.*)-worker-(\\d+)\\.(.*) \\+\\+cpus (\\d+)"); + std::smatch match; + char hostStr[200]; + + if (std::regex_search(sLine, match, rgx)) + { + std::string name = match[1]; + std::string suffix = match[3]; + int slots = std::stoi(match[4]); + + infile.close(); + + std::ofstream outfile("/app/hostfile"); + + for (int i = 0; i < numProcesses; i++) + { + /* Bounded write: name and suffix come from the hostfile and may not fit in hostStr. */ snprintf(hostStr, sizeof(hostStr), "host %s-worker-%i.%s ++cpus %i\n", name.c_str(), i, suffix.c_str(), slots); + printf("Writing: %s\n", hostStr); + outfile << hostStr; + } + + outfile.flush(); + outfile.close(); + + print_nodelist(); + } + else + { + printf("Error parsing hostfile regex\n"); + } +} + static void arg_init(int argc, const char **argv) { static char buf[1024]; @@ -867,6 +963,7 @@ static void arg_init(int argc, const char **argv) pparam_flag(&arg_child_charmrun, 0, "child-charmrun", "child charmrun"); #endif #if CMK_SHRINK_EXPAND + arg_shrinkexpand = 0; pparam_int(&arg_realloc_pes, 1, "newp", "New number of processes to create"); pparam_int(&arg_old_pes, 1, "oldp", "Old number of processes to create"); pparam_flag(&arg_shrinkexpand, 0, "shrinkexpand", "Enable shrink/expand support"); @@ -964,7 +1061,13 @@ static void arg_init(int argc, const char **argv) #if CMK_SHRINK_EXPAND if (arg_shrinkexpand) { arg_requested_pes = arg_realloc_pes; - printf("\n \nCharmrun> %d Reallocated pes\n \n", arg_requested_pes); + //arg_nodelist = "/etc/mpi/hostfileScaled"; + //write_hostfile(arg_requested_pes); + //printf("Waiting\n"); + //wait_hostfile(arg_requested_nodes); + //printf("\n \nCharmrun> %d Reallocated pes\n \n", arg_requested_pes); + //print_nodelist(); + //arg_nodelist = new_hostfile; } #endif @@ 
-1319,6 +1422,7 @@ struct nodetab_host skt_ip_t ip = _skt_invalid_ip; /*IP address of host*/ int cpus = 1; /* # of physical CPUs*/ + int remaining_cpus = 1; /* # of physical CPUs remaining for this host */ int nice = -100; /* process priority */ // int forks = 0; /* number of processes to fork on remote node */ @@ -1393,6 +1497,42 @@ static std::vector my_host_table; static std::vector my_process_table; static std::vector pe_to_process_map; +#if CMK_SHRINK_EXPAND + /*This little snippet creates a OLDNODENAMES + environment variable entry*/ + char *create_oldnodenames() + { + static char dest1[1024 * 1000]; + size_t i, used = 0; dest1[0] = '\0'; /* used = bytes of dest1 filled so far; the list is rebuilt on every call */ + for (i = 0; i < my_process_table.size() && used < sizeof(dest1) - 1; i++) + used += snprintf(dest1 + used, sizeof(dest1) - used, " %s", (my_process_table[i].host)->name); /* append in place; passing dest1 as both source and destination of sprintf is undefined */ + printf("Charmrun> Created oldnames %s \n", dest1); + return dest1; + } + + int isPresent(const char *names, char **listofnames) + { + int k; + for (k = 0; k < arg_old_pes; k++) { + if (strcmp(names, listofnames[k]) == 0) + return 1; + } + return 0; + } + void parse_oldnodenames(char **oldnodelist) + { + const char *ns; + ns = getenv("OLDNODENAMES"); + int i; + if (ns == NULL) ns = ""; /* variable not set: every entry becomes an empty string */ + for (i = 0; i < arg_old_pes; i++) { + oldnodelist[i] = (char *) calloc(100, sizeof(char)); + int nread = 0; /* characters consumed by this token, reported by %n */ + if (sscanf(ns, " %99s%n", oldnodelist[i], &nread) == 1) ns += nread; /* %99s keeps the token inside the 100-byte allocation */ + } + } + #endif + static const char *nodetab_args(const char *args, nodetab_host *h) { while (*args != 0) @@ -1438,7 +1578,7 @@ static const char *nodetab_args(const char *args, nodetab_host *h) args = skipblanks(e2); } - + h->remaining_cpus = h->cpus; return args; } @@ -1531,6 +1671,7 @@ static void nodetab_init_with_nodelist() host->name = strdup(hostname.c_str()); host->ip = nodetab_host::resolve(hostname.c_str()); host->hostno = hostno++; + printf("Adding host %s, %i\n", host->name, host->hostno); temp_hosts.insert({hostname, host}); nodetab_args(b3, host); } @@ -1595,6 +1736,8 @@ static void nodeinfo_add(const ChSingleNodeinfo *in, nodetab_process & p) fprintf(stderr, "Charmrun> Warning: 
Process #%d received ChSingleNodeInfo #%d\n", p.nodeno, node); p.info = in->info; + fprintf(stdout, "Charmrun> client %d added -> dataport = %d\n", node, ChMessageInt(p.info.dataport)); + fflush(stdout); p.num_pus = ChMessageInt(in->num_pus); p.num_cores = ChMessageInt(in->num_cores); p.num_sockets = ChMessageInt(in->num_sockets); @@ -1961,7 +2104,6 @@ static int req_handle_initnode(ChMessage *msg, nodetab_process & p) fprintf(stderr, "Charmrun: possibly because: %s.\n", msg->data); exit(1); } - nodeinfo_add((ChSingleNodeinfo *) msg->data, p); return REQ_OK; } @@ -2430,6 +2572,8 @@ static int req_handle_realloc(ChMessage *msg, SOCKET fd) ret[saved_argc + index++] = NULL; } + setenv("OLDNODENAMES", create_oldnodenames(), 1); + ChMessage ackmsg; ChMessage_new("realloc_ack", 0, &ackmsg); for (const nodetab_process & p : my_process_table) @@ -3181,6 +3325,8 @@ static SOCKET errorcheck_one_client_connect(void) const SOCKET req_client = skt_accept(server_fd, &clientIP, &clientPort); + //printf("clientPort = %d\n", clientPort); + /* FIXME: will this ever be triggered? It seems the skt_abort handler here is * 'client_connect_problem', which calls exit(1), so we'd exit * in skt_accept. 
*/ @@ -3314,6 +3460,13 @@ static void req_set_client_connect(std::vector & process_table, curclientend = 0; #endif + printf("Charmrun> Waiting for %d clients to connect.\n", count); + for (int i = 0; i < process_table.size(); i++) + { + nodetab_process & p = process_table[i]; + printf("Charmrun> process table nodeno %d, name %s\n", p.nodeno, p.host->name); + } + int finished = 0; while (finished < count) { @@ -3327,8 +3480,18 @@ static void req_set_client_connect(std::vector & process_table, curclientend++; } + //fprintf(stdout, "open_sockets.size() = %d, clientstart,end=%d, %d\n", open_sockets.size(), + // curclientstart, curclientend); + //fflush(stdout); #endif /* check appropriate clients for messages */ + + //for (int i = 0; i < process_table.size(); i++) + //{ + // nodetab_process & p = process_table[i]; + // printf("Charmrun> process table nodeno %d\n", p.nodeno); + //} + while (!open_sockets.empty()) { const SOCKET req_client = open_sockets.front(); @@ -3340,6 +3503,17 @@ static void req_set_client_connect(std::vector & process_table, ChMessage_recv(req_client, &msg); int nodeNo = ChMessageInt(((ChSingleNodeinfo *)msg.data)->nodeNo); + + printf("Charmrun> node %d is connecting\n", nodeNo); + + if (node_set.find(nodeNo) != node_set.end()) + { + printf("Charmrun> node %d is already in the node set\n", nodeNo); + continue; + } + + node_set.insert(nodeNo); + nodetab_process & p = get_process_for_nodeno(process_table, nodeNo); p.req_client = req_client; @@ -3578,24 +3752,71 @@ static void req_construct_phase2_processes(std::vector & phase2 for (nodetab_process & p : my_process_table) { - p.forkstart = active_host_count + p.nodeno * new_processes_per_host; + //p.forkstart = active_host_count + p.nodeno * new_processes_per_host; p.host->processes = 1; + p.host->remaining_cpus--; } - for (int i = 0; i < num_new_processes; ++i) + int i = 0; + //int curr_pe = active_host_count; + int num_forks = 0; + + // FIXME this will hang if total PEs requested > total PEs 
available + while (num_forks < num_new_processes) { - nodetab_process & src = my_process_table[i % active_host_count]; - phase2_processes.push_back(src); + nodetab_process & src = my_process_table[i++ % active_host_count]; - nodetab_process & p = phase2_processes.back(); - p.nodeno = src.forkstart + (src.host->processes++ - 1); + int prev_pe = src.nodeno; + while (src.host->remaining_cpus > 0) + { + if (num_forks >= num_new_processes) + break; + ++prev_pe; + if (src.forkstart == 0) + src.forkstart = prev_pe; + src.host->processes++; + src.host->remaining_cpus--; + + phase2_processes.push_back(src); + nodetab_process & p = phase2_processes.back(); + p.nodeno = prev_pe; + num_forks++; + } } + + printf("PHASE2> %d processes will be forked\n", phase2_processes.size()); } static void start_nodes_local(const std::vector &); static void start_nodes_ssh(std::vector &); static void finish_nodes(std::vector &); +static void req_client_reconnect(std::vector & process_table) +{ + skt_set_abort(client_connect_problem_skt); + + std::vector phase2_processes; + + req_construct_phase2_processes(phase2_processes); + printf("Phase2 reconnect: %d processes will be forked\n", phase2_processes.size()); + if (phase2_processes.size() > 0) + { + if (!arg_local) + { +#if CMK_SHRINK_EXPAND + if (arg_requested_pes > arg_old_pes) +#endif + { + assert(!arg_mpiexec); + start_nodes_ssh(phase2_processes); + } + } + } + req_add_phase2_processes(phase2_processes); + req_client_connect_table(process_table); + req_all_clients_connected(); +} + static void req_client_connect(std::vector & process_table) { skt_set_abort(client_connect_problem_skt); @@ -3641,23 +3862,28 @@ static void req_client_connect(std::vector & process_table) } else { - // send nodefork packets - ChMessageHeader hdr; - ChMessageInt_t mydata[ChInitNodeforkFields]; - ChMessageHeader_new("nodefork", sizeof(mydata), &hdr); - for (const nodetab_process & p : process_table) +#if CMK_SHRINK_EXPAND + if (!arg_shrinkexpand) +#endif { - int 
numforks = p.host->processes - 1; - if (numforks <= 0) - continue; + // send nodefork packets + ChMessageHeader hdr; + ChMessageInt_t mydata[ChInitNodeforkFields]; + ChMessageHeader_new("nodefork", sizeof(mydata), &hdr); + for (const nodetab_process & p : process_table) + { + int numforks = p.host->processes - 1; + if (numforks <= 0) + continue; - if (arg_verbose) - printf("Charmrun> Instructing host \"%s\" to fork() x %d\n", p.host->name, numforks); + if (arg_verbose) + printf("Charmrun> Instructing host \"%s\" to fork() x %d\n", p.host->name, numforks); - mydata[0] = ChMessageInt_new(numforks); - mydata[1] = ChMessageInt_new(p.forkstart); - skt_sendN(p.req_client, (const char *) &hdr, sizeof(hdr)); - skt_sendN(p.req_client, (const char *) mydata, sizeof(mydata)); + mydata[0] = ChMessageInt_new(numforks); + mydata[1] = ChMessageInt_new(p.forkstart); + skt_sendN(p.req_client, (const char *) &hdr, sizeof(hdr)); + skt_sendN(p.req_client, (const char *) mydata, sizeof(mydata)); + } } } @@ -4166,7 +4392,7 @@ int main(int argc, const char **argv, char **envp) for (const nodetab_host * h : host_table) { skt_print_ip(ips, sizeof(ips), h->ip); - printf("Charmrun> added host \"%s\", IP:%s\n", h->name, ips); + printf("Charmrun> added host \"%s\", hostno %d, IP:%s\n", h->name, h->hostno, ips); } } @@ -4195,12 +4421,14 @@ int main(int argc, const char **argv, char **envp) ? (arg_requested_nodes > 0 ? 
std::min(my_host_count, arg_requested_nodes) : my_host_count) : std::min(my_host_count, get_old_style_process_count()); my_process_table.resize(my_initial_process_count); + int curr_nodeno = 0; for (int i = 0; i < my_initial_process_count; ++i) { nodetab_host * h = my_host_table[i]; nodetab_process & p = my_process_table[i]; p.host = h; - p.nodeno = h->hostno; + p.nodeno = curr_nodeno; + curr_nodeno += h->cpus; } /* start the node processes */ @@ -4301,6 +4529,11 @@ int main(int argc, const char **argv, char **envp) finish_nodes(my_process_table); #endif if (!arg_batch_spawn) +#if CMK_SHRINK_EXPAND + if (arg_shrinkexpand) /* restarted after a rescale: take the reconnect path */ + req_client_reconnect(my_process_table); + else +#endif req_client_connect(my_process_table); } #if CMK_SSH_KILL @@ -5259,10 +5492,27 @@ static void start_one_node_ssh(nodetab_process & p, const char ** argv) static void start_nodes_ssh(std::vector & process_table) { + char **oldnodenames; +#if CMK_SHRINK_EXPAND + if (arg_shrinkexpand) + { + oldnodenames = (char **) malloc(arg_old_pes * sizeof(char *)); + parse_oldnodenames(oldnodenames); + } + for (nodetab_process & p : process_table) { + if (arg_shrinkexpand && !isPresent(p.host->name, oldnodenames)) + start_one_node_ssh(p); + else if (!arg_shrinkexpand) start_one_node_ssh(p); } +#else + for (nodetab_process & p : process_table) + { + start_one_node_ssh(p); + } +#endif } /* for mpiexec, for once calling mpiexec to start on all nodes */ diff --git a/src/util/ckrescale.C b/src/util/ckrescale.C new file mode 100644 index 0000000000..d1cb2c8c5c --- /dev/null +++ b/src/util/ckrescale.C @@ -0,0 +1,19 @@ +bool shrinkexpand_exit = false; // Flag to indicate if we are in the process of shrinking/expanding +bool in_restart = false; // Flag to indicate if we are in a restart process + + +void set_shrinkexpand_exit(bool value) { + shrinkexpand_exit = value; +} + +bool get_shrinkexpand_exit() { + return shrinkexpand_exit; +} + +void set_in_restart(bool value) { + in_restart = value; +} 
+ +bool get_in_restart() { + return in_restart; +} \ No newline at end of file diff --git a/src/util/ckrescale.h b/src/util/ckrescale.h new file mode 100644 index 0000000000..ed0c4a2a6e --- /dev/null +++ b/src/util/ckrescale.h @@ -0,0 +1,5 @@ +void set_shrinkexpand_exit(bool value); +bool get_shrinkexpand_exit(); + +void set_in_restart(bool value); +bool get_in_restart(); \ No newline at end of file diff --git a/src/util/cmirdmautils.h b/src/util/cmirdmautils.h index cbd622987a..62ccbaa6ac 100644 --- a/src/util/cmirdmautils.h +++ b/src/util/cmirdmautils.h @@ -6,7 +6,7 @@ #include #include -#if CMK_CUDA +#if CMK_CUDA || CMK_HIP enum DeviceRecvType { DEVICE_RECV_TYPE_CHARM, DEVICE_RECV_TYPE_AMPI, @@ -20,18 +20,21 @@ typedef struct DeviceRdmaInfo_ { } DeviceRdmaInfo; typedef struct DeviceRdmaOp_ { - int dest_pe; const void* dest_ptr; size_t size; DeviceRdmaInfo* info; void* src_cb; void* dst_cb; uint64_t tag; + int dest_pe; + int src_pe; + int src_mpi_rank; + int dest_mpi_rank; } DeviceRdmaOp; typedef struct DeviceRdmaOpMsg_ { char header[CmiMsgHeaderSizeBytes]; - DeviceRdmaOp op; + DeviceRdmaOp* op; } DeviceRdmaOpMsg; #endif // CMK_CUDA diff --git a/src/util/pup.h b/src/util/pup.h index 365c05876c..481d24b1d1 100644 --- a/src/util/pup.h +++ b/src/util/pup.h @@ -132,6 +132,11 @@ typedef enum { dataType_last //<- for setting table lengths, etc. } dataType; +enum class PUPMode { + HOST, // Host mode, no special handling + DEVICE +}; + static inline dataType getXlateDataType(signed char *a) { return Tchar; } #if CMK_SIGNEDCHAR_DIFF_CHAR static inline dataType getXlateDataType(char *a) { return Tchar; } @@ -208,10 +213,10 @@ class er { /// These state bits describe the PUP::er's direction. 
enum { - IS_SIZING = 0x0100, - IS_PACKING = 0x0200, - IS_UNPACKING = 0x0400, - TYPE_MASK = 0xFF00 + IS_SIZING = 0x0100, + IS_PACKING = 0x0200, + IS_UNPACKING = 0x0400, + TYPE_MASK = 0xFF00 }; public: virtual ~er();//<- does nothing, but might be needed by some child @@ -262,8 +267,19 @@ class er { //For arrays: template - void operator()(T *a,size_t nItems) { - bytes((void *)a,nItems, sizeof(T), getXlateDataType(a)); + void operator()(T *a, size_t nItems) { + bytes((void *)a, nItems, sizeof(T), getXlateDataType(a)); + } + + // Overload for T** (array of pointers) + template + void operator()(T **a, size_t nItems) { + bytes((void *)(*a), nItems, sizeof(T), getXlateDataType(*a)); + } + + template + void operator()(T *a,size_t nItems, PUPMode mode) { + bytes((void *)a,nItems, sizeof(T), getXlateDataType(a), mode); } // Standard pup_buffer API that calls malloc for allocation on isUnpacking and free for deallocation on isPacking @@ -323,6 +339,7 @@ class er { //Generic bottleneck: pack/unpack n items of size itemSize // and data type t from p. 
Desc describes the data item virtual void bytes(void *p,size_t n,size_t itemSize,dataType t) =0; + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) =0; virtual void object(able** a); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t) = 0; @@ -390,21 +407,25 @@ enum { class sizer : public er { protected: size_t nBytes; + size_t gpuBytes; //Generic bottleneck: n items of size itemSize virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); public: //Write data to the given buffer - sizer(const unsigned int purpose = 0) : er(IS_SIZING | purpose), nBytes(0) + sizer(const unsigned int purpose = 0) : er(IS_SIZING | purpose), nBytes(0), gpuBytes(0) { CmiAssert((purpose & TYPE_MASK) == 0); } //Return the current number of bytes to be packed size_t size(void) const {return nBytes;} + + size_t gpu_size(void) const {return gpuBytes;} }; template @@ -417,8 +438,13 @@ class mem : public er { //Memory-buffer packers and unpackers protected: myByte *origBuf;//Start of memory buffer myByte *buf;//Memory buffer (stuff gets packed into/out of here) - mem(const unsigned int type, myByte* Nbuf, const unsigned int purpose = 0) + myByte *gpuBuf; + myByte *gpuOrigBuf; + mem(const unsigned int type, myByte* Nbuf, + myByte* gpuNbuf, + const unsigned int purpose = 0) : er(type | purpose), origBuf(Nbuf), buf(Nbuf) + , gpuOrigBuf(gpuNbuf), gpuBuf(gpuNbuf) { CmiAssert((purpose & TYPE_MASK) == 0); } @@ -455,14 +481,26 @@ class toMem : public mem { protected: //Generic bottleneck: pack n items of size itemSize from p. 
virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); public: //Write data to the given buffer - toMem(void* Nbuf, const unsigned int purpose = 0) - : mem(IS_PACKING, (myByte*)Nbuf, purpose) + toMem(void* Nbuf, + void* gpuNbuf, + const unsigned int purpose = 0, int state = IS_PACKING) + : mem(state, (myByte*)Nbuf, + (myByte*)gpuNbuf, + purpose) + { + } + + toMem(void* Nbuf, const unsigned int purpose = 0, int state = IS_PACKING) + : mem(state, (myByte*)Nbuf, + nullptr, + purpose) { } }; @@ -479,6 +517,7 @@ class fromMem : public mem { protected: //Generic bottleneck: unpack n items of size itemSize from p. virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -486,9 +525,19 @@ class fromMem : public mem { void pup_buffer_generic(void *&p,size_t n, size_t itemSize, dataType t, std::function allocate, bool isMalloc); public: - //Read data from the given buffer - fromMem(const void* Nbuf, const unsigned int purpose = 0) - : mem(IS_UNPACKING, (myByte*)Nbuf, purpose) + fromMem(const void* Nbuf, + const void* gpuNbuf, + const unsigned int purpose = 0, int state = IS_UNPACKING) + : mem(state, (myByte*)Nbuf, + (myByte*)gpuNbuf, + purpose) + { + } + + fromMem(const void* Nbuf, const unsigned int purpose = 0, int state = IS_UNPACKING) + : mem(state, (myByte*)Nbuf, + nullptr, + purpose) { } }; @@ -524,6 +573,7 @@ class toDisk : public disk { protected: //Generic bottleneck: pack n items of size itemSize 
from p. virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -545,6 +595,7 @@ class fromDisk : public disk { protected: //Generic bottleneck: unpack n items of size itemSize from p. virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -576,6 +627,7 @@ class toTextUtil : public er { virtual void synchronize(unsigned int m); protected: virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -615,6 +667,7 @@ class toTextFile : public er { protected: FILE *f; virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -636,6 +689,7 @@ class fromTextFile : public er { double readDouble(void); virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void 
pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); @@ -724,6 +778,7 @@ class xlater : public wrap_er { //Generic bottleneck: unpack n items of size itemSize from p. virtual void bytes(void *p,size_t n,size_t itemSize,dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function allocate, std::function deallocate); diff --git a/src/util/pup_toNetwork.h b/src/util/pup_toNetwork.h index 7c5663af2e..ced89d7246 100644 --- a/src/util/pup_toNetwork.h +++ b/src/util/pup_toNetwork.h @@ -32,6 +32,8 @@ typedef CMK_NETWORK_INT4 CMK_POINTER_SIZED_INT; class PUP_toNetwork_sizer : public PUP::er { size_t nBytes; virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} + virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function allocate, std::function deallocate); @@ -70,6 +72,8 @@ class PUP_toNetwork_pack : public PUP::er { } virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t); + virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} + virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function allocate, std::function deallocate); @@ -118,6 +122,8 @@ class PUP_toNetwork_unpack : public PUP::er { } virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t); + virtual 
void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} + virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {} virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t); virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function allocate, std::function deallocate); diff --git a/src/util/pup_util.C b/src/util/pup_util.C index 561b7ecbad..a8fc92b9fc 100644 --- a/src/util/pup_util.C +++ b/src/util/pup_util.C @@ -22,6 +22,13 @@ virtual functions are defined here. #include "converse.h" #include "pup.h" #include "ckhashtable.h" +#include "conv-mach-cuda.h" +#include "conv-mach-hip.h" + +#if CMK_CUDA || CMK_HIP +#include "hapi_portable.h" +#include "hapi_impl.h" +#endif #include "conv-rdma.h" #if defined(_WIN32) @@ -145,9 +152,21 @@ void PUP::sizer::bytes(void * /*p*/,size_t n,size_t itemSize,dataType /*t*/) nBytes+=n*itemSize; } +void PUP::sizer::bytes(void * p,size_t n,size_t itemSize,dataType t, PUPMode mode) +{ +#ifdef CK_CHECK_PUP + nBytes+=sizeof(pupCheckRec); +#endif + if (mode == PUPMode::HOST) + nBytes+=n*itemSize; + else if (mode == PUPMode::DEVICE) + gpuBytes += n * itemSize; +} + /*Memory PUP::er's*/ void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t) { + //CmiPrintf("[%d] PUP::toMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d\n", CmiMyPe(), p, n, itemSize, t); #ifdef CK_CHECK_PUP ((pupCheckRec *)buf)->write(t,n); buf+=sizeof(pupCheckRec); @@ -156,8 +175,10 @@ void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t) memcpy((void *)buf,p,n); buf+=n; } + void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t) { + //CmiPrintf("[%d] PUP::fromMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d\n", CmiMyPe(), p, n, itemSize, t); #ifdef CK_CHECK_PUP ((pupCheckRec *)buf)->check(t,n); buf+=sizeof(pupCheckRec); @@ -167,6 +188,53 @@ void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t) 
buf+=n; } +void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode) +{ + //CmiPrintf("[%d] PUP::toMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d, mode=%d\n", CmiMyPe(), p, n, itemSize, t, mode); +#ifdef CK_CHECK_PUP + ((pupCheckRec *)buf)->write(t,n); + buf+=sizeof(pupCheckRec); +#endif + n*=itemSize; + if (mode == PUPMode::HOST) + { + memcpy((void *)buf,p,n); + buf+=n; + } + else + { + //CmiPrintf("[%d] Copying %zu bytes from p=%p to GPU buffer\n", CmiMyPe(), n, p); + // For GPU mode, we assume p is a device pointer and copy directly +#if CMK_CUDA || CMK_HIP + hapiMemcpy((void *)gpuBuf, p, n, hapiMemcpyDeviceToDevice); + gpuBuf += n; +#endif + } +} + +void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode) +{ + //CmiPrintf("[%d] PUP::fromMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d, mode=%d\n", CmiMyPe(), p, n, itemSize, t, mode); +#ifdef CK_CHECK_PUP + ((pupCheckRec *)buf)->check(t,n); + buf+=sizeof(pupCheckRec); +#endif + n*=itemSize; + if (mode == PUPMode::HOST) + { + memcpy(p,(const void *)buf,n); + buf+=n; + } + else + { + //CmiPrintf("[%d] Copying %zu bytes from GPU buffer to p=%p\n", CmiMyPe(), n, p); +#if CMK_CUDA || CMK_HIP + hapiMemcpy(p, (const void *)gpuBuf, n, hapiMemcpyDeviceToDevice); + gpuBuf += n; +#endif + } +} + void PUP::sizer::pup_buffer(void *&p,size_t n, size_t itemSize, dataType t) { #ifdef CK_CHECK_PUP nBytes+=sizeof(pupCheckRec); @@ -375,6 +443,23 @@ void PUP::toDisk::bytes(void *p,size_t n,size_t itemSize,dataType /*t*/) } } +void PUP::toDisk::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode) +{ + if (mode == PUPMode::HOST) { + bytes(p, n, itemSize, t); + } else if (mode == PUPMode::DEVICE) { +#if CMK_CUDA || CMK_HIP + // DEVICE mode: p is assumed to be a device pointer; pass it to hapiCheckpoint and write only the returned allocation id to the file + int allocId = hapiCheckpoint(p, itemSize * n); + //CmiPrintf("Alloc ID = %d\n", allocId); + if(CmiFwrite(&allocId,sizeof(int),1,F) != 1) + { + error = true; + } +#endif 
+ } +} + void PUP::toDisk::pup_buffer(void *&p,size_t n,size_t itemSize,dataType t) { bytes(p, n, itemSize, t); if(isDeleting()) free(p); @@ -385,8 +470,24 @@ void PUP::toDisk::pup_buffer(void *&p,size_t n, size_t itemSize, dataType t, std if(isDeleting()) deallocate(p); } -void PUP::fromDisk::bytes(void *p,size_t n,size_t itemSize,dataType /*t*/) -{/* CkPrintf("reading %d bytes\n",itemSize*n); */ CmiFread(p,itemSize,n,F);} +void PUP::fromDisk::bytes(void *p,size_t n,size_t itemSize,dataType t) +{ + CmiFread(p,itemSize,n,F); +} + +void PUP::fromDisk::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode) +{ + if (mode == PUPMode::HOST) { + bytes(p, n, itemSize, t); + } else if (mode == PUPMode::DEVICE) { +#if CMK_CUDA || CMK_HIP + // DEVICE mode: p is assumed to be a device pointer; read the saved allocation id from the file and pass it to hapiRestore (NOTE(review): the CmiFread result is not checked, so allocId may be uninitialized if the read fails) + int allocId; + CmiFread(&allocId,sizeof(int),1,F); + hapiRestore(p, itemSize * n, allocId); +#endif + } +} void PUP::fromDisk::pup_buffer(void *&p,size_t n,size_t itemSize,dataType t) { if(isUnpacking()) p = malloc(n * itemSize); diff --git a/tests/ampi/migration/Makefile b/tests/ampi/migration/Makefile index 860fc1adb2..aa8e0d6b96 100644 --- a/tests/ampi/migration/Makefile +++ b/tests/ampi/migration/Makefile @@ -4,10 +4,10 @@ CHARMC=../../../bin/ampicxx $(OPTS) all: migration migration: test.o - $(CHARMC) -o migration test.o + $(CHARMC) -pieglobals -o migration test.o test.o: test.C - $(CHARMC) -c test.C + $(CHARMC) -pieglobals -c test.C # #