diff --git a/CMakeLists.txt b/CMakeLists.txt
index 32544d8897..9dfa2cb79f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,6 +208,7 @@ option(BUILD_SHARED        "Build Charm++ dynamic libraries" OFF)
 
 # Other options
 option(BUILD_CUDA          "Build with CUDA support" OFF)
+option(BUILD_HIP           "Build with HIP support" OFF)  # BUILD_CUDA and BUILD_HIP each create the 'hybridapi' target, so enable at most one of them
 option(PXSHM               "Build with PXSHM" OFF)
 
 # LRTS PMI options
@@ -511,7 +512,7 @@ if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${NETWORK}/gdir_link)
   file(STRINGS src/arch/${NETWORK}/gdir_link GDIR)
 elseif(${NETWORK} MATCHES "gni-")
   set(GDIR "gni")
-elseif(${NETWORK} MATCHES "mpi-cray")
+elseif(${NETWORK} MATCHES "mpi-cray")
   set(GDIR "mpi")
 elseif(${NETWORK} MATCHES "ofi-cray")
   set(GDIR "ofi")
@@ -533,8 +534,12 @@ else()
   set(CMK_BUILD_CHARMRUN 1)
 endif()
 
+set(CHARMRUN_ELASTIC_DIR src/arch/common)  # source dir checked for an optional 'charmrun_elastic' file that is copied into bin/
+set(CHARMRUN_HAPI_DIR src/arch/common)  # source dir checked for an optional 'charmrun_hapi' file that is copied into bin/
+
 include(cmake/detect-features.cmake)
 include(cmake/ci-files.cmake)
+add_custom_target(ci-generated DEPENDS ${all-ci-outputs})  # hybridapi is ordered after this target; all-ci-outputs is presumably set by ci-files.cmake (TODO confirm)
 
 
 if(${TARGET} STREQUAL "all-test")
@@ -660,6 +665,8 @@ configure_file(src/arch/common/cc-msvc.sh                    include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-craype.sh           include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-cuda.sh             include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-cuda.h              include/ COPYONLY)
+configure_file(src/arch/common/conv-mach-hip.sh              include/ COPYONLY)
+configure_file(src/arch/common/conv-mach-hip.h               include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-darwin.sh           include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-flang.h             include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-flang.sh            include/ COPYONLY)
@@ -688,9 +695,13 @@ configure_file(src/arch/common/conv-mach-tsan.h              include/ COPYONLY)
 configure_file(src/arch/common/conv-mach-tsan.sh             include/ COPYONLY)
 configure_file(src/scripts/conv-config.sh                    include/ COPYONLY)
 configure_file(src/arch/${VDIR}/conv-mach.sh                 include/ COPYONLY)
+configure_file(src/util/ckrescale.h                         include/ COPYONLY)
+
+add_library(ckrescale src/util/ckrescale.C)  # its objects are also pulled into hybridapi (CUDA) and topomanager via $<TARGET_OBJECTS:ckrescale>
 
 set(CUDA_DIR "")
-if(BUILD_CUDA)
+set(HIP_DIR "")  # stays empty unless BUILD_HIP is on, in which case it is set to ROCM_PATH
+if(BUILD_CUDA OR BUILD_HIP)  # both backends share the hybridAPI sources under src/arch/cuda
 
   file(GLOB_RECURSE hybridAPI-h-sources ${CMAKE_SOURCE_DIR}/src/arch/cuda/*.h)
   file(GLOB_RECURSE hybridAPI-cxx-sources ${CMAKE_SOURCE_DIR}/src/arch/cuda/*.cpp)
@@ -698,19 +709,102 @@ if(BUILD_CUDA)
     configure_file(${file} include/ COPYONLY)
   endforeach()
 
-  if(CMAKE_VERSION VERSION_GREATER 3.17 OR CMAKE_VERSION VERSION_EQUAL 3.17)
-    find_package(CUDAToolkit REQUIRED)
-    set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}")
-    enable_language(CUDA)
-    set(CUDA_DIR "${CUDAToolkit_TARGET_DIR}")
-  else()
-    find_package(CUDA REQUIRED)
-    set(CUDA_DIR "${CUDA_TOOLKIT_ROOT_DIR}")
+  # The CUDA and HIP branches below each create a target named 'hybridapi';
+  # fail early with a clear message instead of a duplicate-target error.
+  if(BUILD_CUDA AND BUILD_HIP)
+    message(FATAL_ERROR "BUILD_CUDA and BUILD_HIP cannot both be enabled: each defines the 'hybridapi' target.")
+  endif()
+
+  if (BUILD_CUDA)
+    if(CMAKE_VERSION VERSION_GREATER 3.17 OR CMAKE_VERSION VERSION_EQUAL 3.17)
+      find_package(CUDAToolkit REQUIRED)
+      set(CMAKE_CUDA_COMPILER "${CUDAToolkit_NVCC_EXECUTABLE}")
+      enable_language(CUDA)
+      set(CUDA_DIR "${CUDAToolkit_TARGET_DIR}")
+    else()
+      find_package(CUDA REQUIRED)
+      set(CUDA_DIR "${CUDA_TOOLKIT_ROOT_DIR}")
+    endif()
+
+    # Find CUPTI library and include directory (CUPTI is optional: only a warning is issued when it is missing)
+    find_library(CUPTI_LIBRARY cupti
+      HINTS "${CUDA_DIR}/extras/CUPTI/lib64"
+            "${CUDA_DIR}/lib64"
+    )
+    set(CUPTI_INCLUDE_DIR "${CUDA_DIR}/extras/CUPTI/include")  # path is assumed rather than searched; used only if the library was found
+    if(NOT CUPTI_LIBRARY)
+      message(WARNING "CUPTI library not found. GPU load balancing will not be available.")
+    else()
+      message(STATUS "Found CUPTI: ${CUPTI_LIBRARY}")
+    endif()
+
+    add_library(hybridapi ${hybridAPI-cxx-sources} $<TARGET_OBJECTS:ckrescale>)  # also embeds the ckrescale object files
+    add_dependencies(hybridapi ci-generated)  # build-order only: wait for the ci-generated target
+
+    if(CUPTI_LIBRARY)
+      target_include_directories(hybridapi PRIVATE "${CUPTI_INCLUDE_DIR}")
+      target_link_libraries(hybridapi ${CUPTI_LIBRARY})
+    endif()
+
+    if(TRACING)
+      target_compile_definitions(hybridapi PRIVATE HAPI_TRACE)
+    endif()
   endif()
-  add_library(cudahybridapi ${hybridAPI-cxx-sources})
-  if(TRACING)
-    target_compile_definitions(cudahybridapi PRIVATE HAPI_TRACE)
+
+  if (BUILD_HIP)
+    add_compile_definitions(__HIP_PLATFORM_AMD__)  # directory-scoped definition, not limited to the hybridapi target
+    # Resolve ROCM_PATH: an already-defined ROCM_PATH wins, then the ROCM_PATH environment variable, then /opt/rocm
+    if(NOT DEFINED ROCM_PATH)
+      if(NOT DEFINED ENV{ROCM_PATH})
+        set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
+      else()
+        set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
+      endif()
+    endif()
+    
+    # Find hipcc wrapper for reference
+    find_program(HIP_HIPCC_EXECUTABLE
+      NAMES hipcc
+      PATHS "${ROCM_PATH}/bin" "${ROCM_PATH}/hip/bin"
+      NO_DEFAULT_PATH
+    )
+    
+    if(NOT HIP_HIPCC_EXECUTABLE)
+      message(FATAL_ERROR "Could not find hipcc. Please set ROCM_PATH to your ROCm installation directory.")
+    endif()
+    
+    # Find the actual clang compiler used by ROCm (required by CMake)
+    find_program(CMAKE_HIP_COMPILER
+      NAMES clang++
+      PATHS "${ROCM_PATH}/llvm/bin" "${ROCM_PATH}/bin"
+      NO_DEFAULT_PATH
+    )
+    
+    if(NOT CMAKE_HIP_COMPILER)
+      message(FATAL_ERROR "Could not find ROCm clang++ compiler in ${ROCM_PATH}")
+    endif()
+    
+    set(HIP_DIR "${ROCM_PATH}")  # written to conv-mach-opt.sh and conv-mach-opt.mak further down in this file
+    set(CMAKE_HIP_ARCHITECTURES "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100" CACHE STRING "HIP architectures")  # default only: without FORCE an existing cache entry is kept
+    
+    # Enable HIP language support
+    enable_language(HIP)
+    
+    add_library(hybridapi ${hybridAPI-cxx-sources})  # NOTE(review): the CUDA branch also adds $<TARGET_OBJECTS:ckrescale> - confirm the omission here is intended
+    add_dependencies(hybridapi ci-generated)  # build-order only: wait for the ci-generated target
+    target_include_directories(hybridapi PRIVATE "${ROCM_PATH}/include")
+
+    if(TRACING)
+      target_compile_definitions(hybridapi PRIVATE HAPI_TRACE)
+    endif()
   endif()
+
+  # hapi_memory_daemon - standalone executable for shrink/expand GPU memory management (defined for CUDA builds only)
+  if(BUILD_CUDA)
+    add_executable(hapi_memory_daemon src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp)
+    add_dependencies(hapi_memory_daemon hybridapi ck converse ckqt moduleNDMeshStreamer ckmain modulecompletion conv-static)  # build-order only; NOTE(review): no target_link_libraries call is visible here - confirm how this executable is linked
+  endif()
+
 endif()
 
 if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-cxi.sh)
@@ -911,6 +999,12 @@ if(${CMK_BUILD_CHARMRUN})
   add_dependencies(charmrun create_symlinks)
 else()
   configure_file(${CHARMRUN_DIR}/charmrun ${CMAKE_BINARY_DIR}/bin COPYONLY)
+  if(EXISTS ${CMAKE_SOURCE_DIR}/${CHARMRUN_ELASTIC_DIR}/charmrun_elastic)  # optional: copied next to charmrun only when present in the source tree
+    configure_file(${CHARMRUN_ELASTIC_DIR}/charmrun_elastic ${CMAKE_BINARY_DIR}/bin COPYONLY)
+  endif()
+  if(EXISTS ${CMAKE_SOURCE_DIR}/${CHARMRUN_HAPI_DIR}/charmrun_hapi)  # optional: copied next to charmrun only when present in the source tree
+    configure_file(${CHARMRUN_HAPI_DIR}/charmrun_hapi ${CMAKE_BINARY_DIR}/bin COPYONLY)
+  endif()
 endif()
 configure_file(src/scripts/testrun bin/ COPYONLY)
 
@@ -1000,7 +1094,11 @@ if(${TARGET} STREQUAL "charm4py")
   endif()
 
   if (${BUILD_CUDA})
-    target_link_libraries(charm cudart cudahybridapi)
+    target_link_libraries(charm cudart hybridapi)  # the library target is now 'hybridapi' (previously 'cudahybridapi')
+  endif()
+
+  if (${BUILD_HIP})
+    target_link_libraries(charm hiprtc hybridapi)  # NOTE(review): hiprtc is HIP's runtime-compilation library; confirm whether the HIP runtime library (amdhip64) must be linked as well
   endif()
 
   if(${TRACING})
@@ -1019,9 +1117,12 @@ else()
   if(RECONVERSE)
     target_link_libraries(ckhello PRIVATE reconverse)
   endif()
-  add_dependencies(ckhello ck ckqt conv-static
+  add_dependencies(ckhello ck ckqt ckrescale conv-static
     converse ckmain
     moduleNDMeshStreamer modulecompletion)
+  if(BUILD_CUDA OR BUILD_HIP)
+    add_dependencies(ckhello hybridapi)
+  endif()
 endif()
 
 # Create conv-mach-opt.sh
@@ -1067,7 +1168,7 @@ foreach(l BUILDOPTS CMK_AMPI_WITH_ROMIO CMK_BUILD_PYTHON CMK_CAN_LINK_FORTRAN
         CXX_NO_AS_NEEDED LDXX_WHOLE_ARCHIVE_PRE LDXX_WHOLE_ARCHIVE_POST
         CMK_MACOSX CMK_POST_EXE CMK_SHARED_SUF CMK_USER_SUFFIX OPTS_LD
         CMK_COMPILER_KNOWS_FVISIBILITY CMK_LINKER_KNOWS_UNDEFINED
-        CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR CMK_USER_DISABLED_TLS CMK_CXI)
+        CMK_SUPPORTS_MEMORY_ISOMALLOC CUDA_DIR HIP_DIR CMK_USER_DISABLED_TLS CMK_CXI)
     file(APPEND ${optfile_sh} "${l}=\"${${l}}\"\n" )
 endforeach(l)
 
@@ -1104,7 +1205,7 @@ endif()
 set(optfile_mak ${CMAKE_BINARY_DIR}/include/conv-mach-opt.mak)
 
 file(WRITE  ${optfile_mak} "# Build-time options header for Makefiles, automatically generated by cmake.\n")
-foreach(l CUDA_DIR BUILD_CUDA CMK_AMPI_WITH_ROMIO CMK_MACOSX CMK_BUILD_PYTHON
+foreach(l CUDA_DIR HIP_DIR BUILD_CUDA BUILD_HIP CMK_AMPI_WITH_ROMIO CMK_MACOSX CMK_BUILD_PYTHON
         CMK_CHARMDEBUG CMK_COMPILER CMK_GDIR CMK_HAS_MALLOC_HOOK CMK_HAS_MMAP CMK_LIBJPEG
         CMK_LUSTREAPI CMK_MULTICORE CMK_NO_BUILD_SHARED CMK_NO_PARTITIONS CMK_SHARED_SUF
         CMK_SMP CMK_SUPPORTS_FSGLOBALS CMK_SUPPORTS_PIPGLOBALS CMK_SUPPORTS_PIEGLOBALS
@@ -1117,7 +1218,8 @@ endforeach(l)
 
 # Add options
 set(CUDA ${BUILD_CUDA}) # need CUDA to match conv-mach file name
-foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA PAPI CXI)
+set(HIP ${BUILD_HIP})   # need HIP to match conv-mach file name
+foreach(opt SMP OMP TCP PTHREADS SYNCFT PXSHM PERSISTENT OOC CUDA HIP PAPI CXI)
     if(${opt})
         string(TOLOWER ${opt} optl)
         file(APPEND ${optfile_sh} ". ${CMAKE_BINARY_DIR}/include/conv-mach-${optl}.sh\n")
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000..1fe1b02e00
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,29 @@
+FROM mpioperator/openmpi
+
+# Install the build toolchain in one layer and drop the apt lists in that same layer.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        g++ \
+        gfortran \
+        git \
+        libopenmpi-dev \
+        zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+#RUN git clone https://github.com/charmplusplus/charm.git
+RUN mkdir /home/mpiuser/charm
+COPY . /home/mpiuser/charm
+RUN cd charm && git checkout shrinkexpand-mpi && ./build charm++ mpi-linux-x86_64 --enable-shrinkexpand -j8 --force --with-production
+
+RUN cd charm/examples/charm++/shrink_expand && make clean && make
+RUN cd charm/examples/charm++/shrink_expand/jacobi2d-iter && make clean && make
+RUN cd charm/examples/charm++/shrink_expand/startup && make clean && make
+RUN mkdir /app
+RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/charmrun /app/
+RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/charmrun_elastic /app/
+RUN cp charm/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d /app/
+RUN cp charm/examples/charm++/shrink_expand/startup/startup /app/
+RUN chmod 777 /app
diff --git a/benchmarks/charm++/cuda/gpudirect/latency/latency.C b/benchmarks/charm++/cuda/gpudirect/latency/latency.C
index 512e2c7edf..6945280745 100644
--- a/benchmarks/charm++/cuda/gpudirect/latency/latency.C
+++ b/benchmarks/charm++/cuda/gpudirect/latency/latency.C
@@ -125,6 +125,7 @@ public:
   int peer;
 
   double start_time;
+  double allocTime = 0.0; // subtracted from every latency sample, but assigned only on the zero-copy send path
   double* times;
 
   char* h_local_data;
@@ -211,6 +212,13 @@ public:
       cudaStreamSynchronize(stream);
       thisProxy[peer].receiveReg(size, h_local_data);
     } else {
+      double allocStart = CkWallTimer();  // time the per-send device buffer reallocation
+      char* d_local_data_new;
+      hapiCheck(cudaMalloc(&d_local_data_new, max_size));
+      hapiCheck(cudaFree(d_local_data));  // the old buffer is freed only after the new one has been allocated
+      d_local_data = d_local_data_new;
+      send_buffer = CkDeviceBuffer(d_local_data_new);  // re-wrap so the send uses the fresh pointer
+      allocTime = CkWallTimer() - allocStart;  // subtracted from the recorded latency when the pong arrives
       thisProxy[peer].receiveZC(size, send_buffer);
     }
   }
@@ -230,7 +238,7 @@ public:
     // Inform the runtime where the incoming data should be stored
     // and which CUDA stream should be used for the transfer
     data = d_remote_data;
-    devicePost[0].cuda_stream = stream; // Not used with UCX
+    devicePost[0].hapi_stream = stream; // Not used with UCX
   }
 
   // Second receive (regular entry method), invoked once the data transfers complete
@@ -247,7 +255,7 @@ public:
     } else {
       // PE 0: received pong
       if (iter > warmup_iters) {
-        times[iter-warmup_iters-1] = (CkWallTimer() - start_time) / 2.0;
+        times[iter-warmup_iters-1] = (CkWallTimer() - start_time) / 2.0 - allocTime;
       }
 
       // Start next iteration or end test for current size
diff --git a/buildcmake b/buildcmake
index 777c295172..170b9a5e2e 100755
--- a/buildcmake
+++ b/buildcmake
@@ -103,6 +103,7 @@ opt_ccs=0
 opt_charmdebug=0
 opt_controlpoint=0
 opt_cuda=0
+opt_hip=0
 opt_destination=""
 opt_disabletls=0
 opt_install_prefix=""
@@ -181,6 +182,9 @@ function parse_platform_compilers() {
     cuda)
       opt_cuda=1
       ;;
+    hip)
+      opt_hip=1
+      ;;
     cxi)
       opt_cxi=1
       ;;
@@ -681,6 +685,7 @@ CC=$opt_CC CXX=$opt_CXX FC=$opt_FC cmake "$my_srcdir" \
   -DCHARMDEBUG="$opt_charmdebug" \
   -DCONTROLPOINT="$opt_controlpoint" \
   -DBUILD_CUDA="$opt_cuda" \
+  -DBUILD_HIP="$opt_hip" \
   -DDISABLE_TLS="$opt_disabletls" \
   -DDRONE_MODE="$opt_drone_mode" \
   -DENABLE_FORTRAN=$opt_enable_fortran \
diff --git a/cmake/converse.cmake b/cmake/converse.cmake
index 2c0c4f34b8..badf4523fd 100644
--- a/cmake/converse.cmake
+++ b/cmake/converse.cmake
@@ -240,7 +240,8 @@ add_library(charm_cxx_utils STATIC
 
 add_library(topomanager STATIC
     ${tmgr-cxx-sources}
-    ${tmgr-h-sources})
+    ${tmgr-h-sources}
+    $<TARGET_OBJECTS:ckrescale>)
 
 target_include_directories(topomanager PUBLIC
     src/util/topomanager
@@ -253,7 +254,7 @@ target_include_directories(topomanager PUBLIC
 #     charm_cxx_utils
 # )
 add_custom_target(converse)
-add_dependencies(converse reconverse topomanager charm_cxx_utils)
+add_dependencies(converse reconverse topomanager charm_cxx_utils ckrescale)
 
 #file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/comm_backend)
 
diff --git a/cmake/fetch_reconverse/CMakeLists.txt b/cmake/fetch_reconverse/CMakeLists.txt
index c26ef0edb8..1586a5c41c 100644
--- a/cmake/fetch_reconverse/CMakeLists.txt
+++ b/cmake/fetch_reconverse/CMakeLists.txt
@@ -27,5 +27,6 @@ FetchContent_MakeAvailable(reconverse)
 set(BUILD_SHARED_LIBS ${_save_BUILD_SHARED_LIBS} CACHE INTERNAL "")
 
 configure_file(${reconverse_SOURCE_DIR}/include/converse.h ${CMAKE_BINARY_DIR}/include/ COPYONLY)
+configure_file(${reconverse_SOURCE_DIR}/include/conv-rdma.h ${CMAKE_BINARY_DIR}/include/ COPYONLY)
 configure_file(${reconverse_SOURCE_DIR}/src/cldb.h ${CMAKE_BINARY_DIR}/include/ COPYONLY)
 configure_file(${reconverse_SOURCE_DIR}/include/charm-config.h ${CMAKE_BINARY_DIR}/include/ COPYONLY)
diff --git a/doc/charm++/manual.rst b/doc/charm++/manual.rst
index 9a7e871430..e4db2af2ec 100644
--- a/doc/charm++/manual.rst
+++ b/doc/charm++/manual.rst
@@ -9314,7 +9314,8 @@ This entry method should be invoked on the sender by wrapping the
 source buffer with ``CkDeviceBuffer``, whose constructor takes a pointer
 to the source buffer, a Charm++ callback to be invoked once the transfer
 completes (optional), and a CUDA stream associated with the transfer
-(which is only used internally in the CUDA memcpy and IPC based implementation and is also optional):
+(which is only used internally in the CUDA memcpy and IPC based implementation and is also optional).
+The user must guarantee that the source GPU buffer is not modified until the callback has been invoked:
 
 .. code-block:: c++
 
diff --git a/examples/ampi/Cjacobi3D/Makefile b/examples/ampi/Cjacobi3D/Makefile
index 257f7ca208..277268a009 100644
--- a/examples/ampi/Cjacobi3D/Makefile
+++ b/examples/ampi/Cjacobi3D/Makefile
@@ -1,7 +1,7 @@
 -include ../../common.mk
 -include ../../../include/conv-mach-opt.mak
 CHARMBASE=../../../
-CHARMC=../../../bin/ampicxx $(OPTS)
+CHARMC=../../../bin/ampicxx $(OPTS)
 TOKENS=6
 
 -include $(CHARMBASE)/include/conv-mach-opt.mak
@@ -12,6 +12,11 @@ AMPI_TARGETS := \
   jacobi \
   jacobi.pup \
   jacobi-get \
 
+# Build the -pieglobals variant only where the toolchain supports it.
+ifeq (1,$(CMK_SUPPORTS_PIEGLOBALS))
+  AMPI_TARGETS += jacobi.pie
+endif
+
 ifeq (1,$(CMK_SUPPORTS_TLSGLOBALS))
   AMPI_TARGETS += jacobi.tls
@@ -47,6 +48,10 @@ jacobi.tls: jacobi.C
 	$(CHARMC) -c -tlsglobals jacobi.C -o jacobi.tls.o
 	$(CHARMC) -o jacobi.tls jacobi.tls.o -tlsglobals
 
+jacobi.pie: jacobi-pie.C
+	$(CHARMC) -c -pieglobals jacobi-pie.C -o jacobi.pie.o
+	$(CHARMC) -o jacobi.pie jacobi.pie.o -pieglobals
+
 jacobi.rose: jacobi.C
 	$(CHARMC) -roseomptlsglobals -o jacobi.rose.o -c $<
 	$(CHARMC) -roseomptlsglobals -o $@ jacobi.rose.o
@@ -93,5 +98,5 @@ endif
 
 
 clean:
-	rm -f *.o jacobi *~ moduleinit.C charmrun conv-host jacobi-cpp jacobi.iso jacobi-get jacobi.tls ampirun
+	rm -f *.o jacobi *~ moduleinit.C charmrun conv-host jacobi-cpp jacobi.iso jacobi-get jacobi.tls jacobi.pie ampirun
 	rm -rf 40 80 120
diff --git a/examples/ampi/Cjacobi3D/jacobi.C b/examples/ampi/Cjacobi3D/jacobi.C
index 37d0cc7e7a..b15e6f3e96 100644
--- a/examples/ampi/Cjacobi3D/jacobi.C
+++ b/examples/ampi/Cjacobi3D/jacobi.C
@@ -27,39 +27,7 @@ class chunk {
     double rbzp[DIMX*DIMY];
 };
 
-#ifdef AMPI
-void chunk_pup(pup_er p, void *d)
-{
-  chunk **cpp = (chunk **) d;
-  if(pup_isUnpacking(p))
-    *cpp = new chunk;
-  chunk *cp = *cpp;
-  pup_doubles(p, &cp->t[0][0][0], (DIMX+2)*(DIMY+2)*(DIMZ+2));
-  pup_int(p, &cp->xidx);
-  pup_int(p, &cp->yidx);
-  pup_int(p, &cp->zidx);
-  pup_int(p, &cp->xp);
-  pup_int(p, &cp->xm);
-  pup_int(p, &cp->yp);
-  pup_int(p, &cp->ym);
-  pup_int(p, &cp->zp);
-  pup_int(p, &cp->zm);
-  pup_doubles(p, cp->sbxm, (DIMY*DIMZ));
-  pup_doubles(p, cp->sbxp, (DIMY*DIMZ));
-  pup_doubles(p, cp->rbxm, (DIMY*DIMZ));
-  pup_doubles(p, cp->rbxp, (DIMY*DIMZ));
-  pup_doubles(p, cp->sbym, (DIMX*DIMZ));
-  pup_doubles(p, cp->sbyp, (DIMX*DIMZ));
-  pup_doubles(p, cp->rbym, (DIMX*DIMZ));
-  pup_doubles(p, cp->rbyp, (DIMX*DIMZ));
-  pup_doubles(p, cp->sbzm, (DIMX*DIMY));
-  pup_doubles(p, cp->sbzp, (DIMX*DIMY));
-  pup_doubles(p, cp->rbzm, (DIMX*DIMY));
-  pup_doubles(p, cp->rbzp, (DIMX*DIMY));
-  if(pup_isDeleting(p))
-    delete cp;
-}
-#endif
+__thread chunk cp; // thread-local chunk instance; replaces the heap-allocated chunk* that main() previously created and registered for PUP
 
 #define abs(x) ((x)<0.0 ? -(x) : (x))
 
@@ -102,7 +70,6 @@ int main(int ac, char** av)
   int i, j, k, m;
   int iter, niter, cp_idx;
   double maxerr, error, tval, starttime, itertime;
-  chunk *cp;
   int rank, size;
   MPI_Request req[12];
 
@@ -132,74 +99,64 @@ int main(int ac, char** av)
 
   MPI_Bcast(&niter, 1, MPI_INT, 0, MPI_COMM_WORLD);
 
-  cp = new chunk;
-#if defined(AMPI) && ! defined(NO_PUP)
-  AMPI_Register_pup((MPI_PupFn)chunk_pup, (void*)&cp, &cp_idx);
-#endif
-
-  index3d(rank, cp->xidx, cp->yidx, cp->zidx);
-  cp->xp = index1d((cp->xidx+1)%NX,cp->yidx,cp->zidx);
-  cp->xm = index1d((cp->xidx+NX-1)%NX,cp->yidx,cp->zidx);
-  cp->yp = index1d(cp->xidx,(cp->yidx+1)%NY,cp->zidx);
-  cp->ym = index1d(cp->xidx,(cp->yidx+NY-1)%NY,cp->zidx);
-  cp->zp = index1d(cp->xidx,cp->yidx,(cp->zidx+1)%NZ);
-  cp->zm = index1d(cp->xidx,cp->yidx,(cp->zidx+NZ-1)%NZ);
+  index3d(rank, cp.xidx, cp.yidx, cp.zidx);
+  cp.xp = index1d((cp.xidx+1)%NX,cp.yidx,cp.zidx);
+  cp.xm = index1d((cp.xidx+NX-1)%NX,cp.yidx,cp.zidx);
+  cp.yp = index1d(cp.xidx,(cp.yidx+1)%NY,cp.zidx);
+  cp.ym = index1d(cp.xidx,(cp.yidx+NY-1)%NY,cp.zidx);
+  cp.zp = index1d(cp.xidx,cp.yidx,(cp.zidx+1)%NZ);
+  cp.zm = index1d(cp.xidx,cp.yidx,(cp.zidx+NZ-1)%NZ);
   for(i=1; i<=DIMZ; i++)
     for(j=1; j<=DIMY; j++)
       for(k=1; k<=DIMX; k++)
-        cp->t[k][j][i] = DIMY*DIMX*(i-1) + DIMX*(j-2) + (k-1);
+        cp.t[k][j][i] = DIMY*DIMX*(i-1) + DIMX*(j-2) + (k-1);
 
   MPI_Barrier(MPI_COMM_WORLD);
   starttime = MPI_Wtime();
 
   for(iter=1; iter<=niter; iter++) {
     maxerr = 0.0;
-    copyout(cp->sbxm, cp->t, 1, 1, 1, DIMY, 1, DIMZ);
-    copyout(cp->sbxp, cp->t, DIMX, DIMX, 1, DIMY, 1, DIMZ);
-    copyout(cp->sbym, cp->t, 1, DIMX, 1, 1, 1, DIMZ);
-    copyout(cp->sbyp, cp->t, 1, DIMX, DIMY, DIMY, 1, DIMZ);
-    copyout(cp->sbzm, cp->t, 1, DIMX, 1, DIMY, 1, 1);
-    copyout(cp->sbzp, cp->t, 1, DIMX, 1, DIMY, DIMZ, DIMZ);
-
-    MPI_Irecv(cp->rbxp, DIMY*DIMZ, MPI_DOUBLE, cp->xp, 0, MPI_COMM_WORLD, &req[0]);
-    MPI_Irecv(cp->rbxm, DIMY*DIMZ, MPI_DOUBLE, cp->xm, 1, MPI_COMM_WORLD, &req[1]);
-    MPI_Irecv(cp->rbyp, DIMX*DIMZ, MPI_DOUBLE, cp->yp, 2, MPI_COMM_WORLD, &req[2]);
-    MPI_Irecv(cp->rbym, DIMX*DIMZ, MPI_DOUBLE, cp->ym, 3, MPI_COMM_WORLD, &req[3]);
-    MPI_Irecv(cp->rbzm, DIMX*DIMY, MPI_DOUBLE, cp->zm, 5, MPI_COMM_WORLD, &req[4]);
-    MPI_Irecv(cp->rbzp, DIMX*DIMY, MPI_DOUBLE, cp->zp, 4, MPI_COMM_WORLD, &req[5]);
-
-    MPI_Isend(cp->sbxm, DIMY*DIMZ, MPI_DOUBLE, cp->xm, 0, MPI_COMM_WORLD, &req[6]);
-    MPI_Isend(cp->sbxp, DIMY*DIMZ, MPI_DOUBLE, cp->xp, 1, MPI_COMM_WORLD, &req[7]);
-    MPI_Isend(cp->sbym, DIMX*DIMZ, MPI_DOUBLE, cp->ym, 2, MPI_COMM_WORLD, &req[8]);
-    MPI_Isend(cp->sbyp, DIMX*DIMZ, MPI_DOUBLE, cp->yp, 3, MPI_COMM_WORLD, &req[9]);
-    MPI_Isend(cp->sbzm, DIMX*DIMY, MPI_DOUBLE, cp->zm, 4, MPI_COMM_WORLD, &req[10]);
-    MPI_Isend(cp->sbzp, DIMX*DIMY, MPI_DOUBLE, cp->zp, 5, MPI_COMM_WORLD, &req[11]);
+    copyout(cp.sbxm, cp.t, 1, 1, 1, DIMY, 1, DIMZ);
+    copyout(cp.sbxp, cp.t, DIMX, DIMX, 1, DIMY, 1, DIMZ);
+    copyout(cp.sbym, cp.t, 1, DIMX, 1, 1, 1, DIMZ);
+    copyout(cp.sbyp, cp.t, 1, DIMX, DIMY, DIMY, 1, DIMZ);
+    copyout(cp.sbzm, cp.t, 1, DIMX, 1, DIMY, 1, 1);
+    copyout(cp.sbzp, cp.t, 1, DIMX, 1, DIMY, DIMZ, DIMZ);
+
+    MPI_Irecv(cp.rbxp, DIMY*DIMZ, MPI_DOUBLE, cp.xp, 0, MPI_COMM_WORLD, &req[0]);
+    MPI_Irecv(cp.rbxm, DIMY*DIMZ, MPI_DOUBLE, cp.xm, 1, MPI_COMM_WORLD, &req[1]);
+    MPI_Irecv(cp.rbyp, DIMX*DIMZ, MPI_DOUBLE, cp.yp, 2, MPI_COMM_WORLD, &req[2]);
+    MPI_Irecv(cp.rbym, DIMX*DIMZ, MPI_DOUBLE, cp.ym, 3, MPI_COMM_WORLD, &req[3]);
+    MPI_Irecv(cp.rbzm, DIMX*DIMY, MPI_DOUBLE, cp.zm, 5, MPI_COMM_WORLD, &req[4]);
+    MPI_Irecv(cp.rbzp, DIMX*DIMY, MPI_DOUBLE, cp.zp, 4, MPI_COMM_WORLD, &req[5]);
+
+    MPI_Isend(cp.sbxm, DIMY*DIMZ, MPI_DOUBLE, cp.xm, 0, MPI_COMM_WORLD, &req[6]);
+    MPI_Isend(cp.sbxp, DIMY*DIMZ, MPI_DOUBLE, cp.xp, 1, MPI_COMM_WORLD, &req[7]);
+    MPI_Isend(cp.sbym, DIMX*DIMZ, MPI_DOUBLE, cp.ym, 2, MPI_COMM_WORLD, &req[8]);
+    MPI_Isend(cp.sbyp, DIMX*DIMZ, MPI_DOUBLE, cp.yp, 3, MPI_COMM_WORLD, &req[9]);
+    MPI_Isend(cp.sbzm, DIMX*DIMY, MPI_DOUBLE, cp.zm, 4, MPI_COMM_WORLD, &req[10]);
+    MPI_Isend(cp.sbzp, DIMX*DIMY, MPI_DOUBLE, cp.zp, 5, MPI_COMM_WORLD, &req[11]);
 
     MPI_Waitall(12, req, MPI_STATUSES_IGNORE);
 
-    copyin(cp->sbxm, cp->t, 0, 0, 1, DIMY, 1, DIMZ);
-    copyin(cp->sbxp, cp->t, DIMX+1, DIMX+1, 1, DIMY, 1, DIMZ);
-    copyin(cp->sbym, cp->t, 1, DIMX, 0, 0, 1, DIMZ);
-    copyin(cp->sbyp, cp->t, 1, DIMX, DIMY+1, DIMY+1, 1, DIMZ);
-    copyin(cp->sbzm, cp->t, 1, DIMX, 1, DIMY, 0, 0);
-    copyin(cp->sbzp, cp->t, 1, DIMX, 1, DIMY, DIMZ+1, DIMZ+1);
-
-    if(iter > 25 && iter < 85 && rank == 35)
-      m = 9;
-    else
-      m = 1;
-    for(; m>0; m--)
-      for(i=1; i<=DIMZ; i++)
-        for(j=1; j<=DIMY; j++)
-          for(k=1; k<=DIMX; k++) {
-            tval = (cp->t[k][j][i]   + cp->t[k][j][i+1] +
-                    cp->t[k][j][i-1] + cp->t[k][j+1][i] +
-                    cp->t[k][j-1][i] + cp->t[k+1][j][i] +
-                    cp->t[k-1][j][i]) / 7.0;
-            error = abs(tval-cp->t[k][j][i]);
-            cp->t[k][j][i] = tval;
-            if (error > maxerr) maxerr = error;
-          }
+    copyin(cp.sbxm, cp.t, 0, 0, 1, DIMY, 1, DIMZ);
+    copyin(cp.sbxp, cp.t, DIMX+1, DIMX+1, 1, DIMY, 1, DIMZ);
+    copyin(cp.sbym, cp.t, 1, DIMX, 0, 0, 1, DIMZ);
+    copyin(cp.sbyp, cp.t, 1, DIMX, DIMY+1, DIMY+1, 1, DIMZ);
+    copyin(cp.sbzm, cp.t, 1, DIMX, 1, DIMY, 0, 0);
+    copyin(cp.sbzp, cp.t, 1, DIMX, 1, DIMY, DIMZ+1, DIMZ+1);
+
+    for(i=1; i<=DIMZ; i++)
+      for(j=1; j<=DIMY; j++)
+        for(k=1; k<=DIMX; k++) {
+          tval = (cp.t[k][j][i]   + cp.t[k][j][i+1] +
+                  cp.t[k][j][i-1] + cp.t[k][j+1][i] +
+                  cp.t[k][j-1][i] + cp.t[k+1][j][i] +
+                  cp.t[k-1][j][i]) / 7.0;  // mean of the cell and its six face neighbours
+          error = abs(tval-cp.t[k][j][i]);
+          cp.t[k][j][i] = tval;  // updated in place, so later cells read already-updated neighbours
+          if (error > maxerr) maxerr = error;  // track the largest per-cell change of this sweep
+        }
     MPI_Allreduce(MPI_IN_PLACE, &maxerr, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
 
     itertime = MPI_Wtime() - starttime;
@@ -207,11 +164,6 @@ int main(int ac, char** av)
     if (rank == 0)
       printf("iter %d time: %lf maxerr: %lf\n", iter, itertime / size, maxerr);
     starttime = MPI_Wtime();
-#ifdef AMPI
-    if(iter%10 == 5) {
-      AMPI_Migrate(AMPI_INFO_LB_SYNC);
-    }
-#endif
   }
   MPI_Finalize();
   return 0;
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile
new file mode 100644
index 0000000000..14e380e0fe
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/Makefile
@@ -0,0 +1,31 @@
+OPTS = -O3 -DHAPI_CUDA_CALLBACK
+
+CHARM_DIR = ../../../../..
+CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS)
+CHARM_INC = -I$(CHARM_DIR)/include
+
+NVCC = nvcc
+NVCC_FLAGS = -c -std=c++11 -use_fast_math $(OPTS)
+LD_LIBS = -module EveryLB
+
+TARGET = jacobi2d
+all: $(TARGET)
+
+OBJS = $(TARGET).o $(TARGET)CUDA.o
+
+$(TARGET): $(OBJS)
+	$(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS)
+
+$(TARGET).decl.h: $(TARGET).ci $(TARGET).h
+	$(CHARMC) $<
+
+$(TARGET).def.h: $(TARGET).ci $(TARGET).h  # no recipe of its own; presumably produced by the charmc run in the .decl.h rule (TODO confirm)
+
+$(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h $(TARGET).h
+	$(CHARMC) -c $<
+
+$(TARGET)CUDA.o: $(TARGET).cu $(TARGET).h
+	$(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $<
+
+clean:
+	rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C
new file mode 100644
index 0000000000..8a427e1508
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.C
@@ -0,0 +1,646 @@
+#include "hapi.h"
+#include "hapi_nvtx.h"
+#include "jacobi2d.decl.h"
+#include "jacobi2d.h"
+#include <utility>
+#include <sstream>
+
+#define COMM_ONLY 0
+#define CUDA_SYNC 0
+
+/* readonly */ CProxy_Main main_proxy;
+/* readonly */ CProxy_Block block_proxy;
+/* readonly */ int grid_width;
+/* readonly */ int grid_height;
+/* readonly */ int block_width;
+/* readonly */ int block_height;
+/* readonly */ int n_chares_x;
+/* readonly */ int n_chares_y;
+/* readonly */ int n_iters;
+/* readonly */ int warmup_iters;
+/* readonly */ bool sync_ver;
+/* readonly */ bool use_zerocopy;
+/* readonly */ bool print_elements;
+/* readonly */ int lb_freq;
+/* readonly */ int first_lb;
+/* readonly */ int imbalance;
+
+extern void invokeInitKernel(DataType* d_temperature, int block_width,
+    int block_height, cudaStream_t stream);
+extern void invokeBoundaryKernels(DataType* d_temperature, int block_width,
+    int block_height, bool left_bound, bool right_bound, bool top_bound,
+    bool bottom_bound, cudaStream_t stream);
+extern void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature,
+    int block_width, int block_height, int iter, cudaStream_t stream);
+extern void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost,
+    DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width,
+    int block_height, cudaStream_t stream);
+extern void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost,
+    bool is_left, int block_width, int block_height, cudaStream_t stream);
+
+enum Direction { LEFT = 1, RIGHT, TOP, BOTTOM };
+
+class Main : public CBase_Main {
+  int my_iter;
+  double init_start_time;
+  double start_time;
+  double comm_start_time;
+  double comm_agg_time;
+  double update_start_time;
+  double update_agg_time;
+
+public:
+  Main(CkArgMsg* m) {  // Parses options, validates the decomposition, creates the Block array and starts init
+    // Set default values (these are the readonly globals plus Main's own iteration counter)
+    main_proxy = thisProxy;
+    grid_width = 8192;
+    grid_height = 8192;
+    block_width = 2048;
+    block_height = 2048;
+    n_iters = 100;
+    warmup_iters = 10;
+    use_zerocopy = false;
+    print_elements = false;
+    sync_ver = false;
+    my_iter = 0;
+    first_lb = 10;
+    lb_freq = 100;
+    imbalance = 5;  // Max extra iterations for load imbalance
+
+    // Initialize aggregate timers
+    update_agg_time = 0.0;
+    comm_agg_time = 0.0;
+
+    // Process arguments (option letters followed by ':' take a value)
+    int c;
+    while ((c = getopt(m->argc, m->argv, "W:H:w:h:i:b:f:m:u:yzp")) != -1) {
+      switch (c) {
+        case 'W':
+          grid_width = atoi(optarg);
+          break;
+        case 'H':
+          grid_height = atoi(optarg);
+          break;
+        case 'w':
+          block_width = atoi(optarg);
+          break;
+        case 'h':
+          block_height = atoi(optarg);
+          break;
+        case 'i':
+          n_iters = atoi(optarg);
+          break;
+        case 'b':
+          lb_freq = atoi(optarg);
+          break;
+        case 'f':
+          first_lb = atoi(optarg);
+          break;
+        case 'm':
+          imbalance = atoi(optarg);
+          break;
+        case 'u':
+          warmup_iters = atoi(optarg);
+          break;
+        case 'y':
+          sync_ver = true;
+          break;
+        case 'z':
+          use_zerocopy = true;
+          break;
+        case 'p':
+          print_elements = true;
+          break;
+        default:
+          CkPrintf(
+              "Usage: %s -W [grid width] -H [grid height] -w [block width] -h [block height] "
+              "-b [lb frequency] -f [first lb] -m [max imbalance] "
+              "-i [iterations] -u [warmup] -y (use sync version) -z (use GPU zerocopy) -p (print blocks)\n",
+              m->argv[0]);
+          CkExit();
+      }
+    }
+    delete m;
+
+    if (block_width <= 0 || block_height <= 0 || grid_width % block_width != 0 || grid_height % block_height != 0) {  // size checks come first so the modulo never divides by zero
+      CkAbort("Invalid grid & block configuration\n");
+    }
+
+    // Number of chares per dimension
+    n_chares_x = grid_width / block_width;
+    n_chares_y = grid_height / block_height;
+
+    // Print configuration
+    CkPrintf("\n[CUDA 2D Jacobi example]\n");
+    CkPrintf("Grid: %d x %d, Block: %d x %d, Chares: %d x %d, Iterations: %d, "
+        "Warm-up: %d, Bulk-synchronous: %d, Zerocopy: %d, Print: %d\n\n",
+        grid_width, grid_height, block_width, block_height, n_chares_x, n_chares_y,
+        n_iters, warmup_iters, sync_ver, use_zerocopy, print_elements);
+
+    // Create the 2D block array, then time and trigger its initialization
+    block_proxy = CProxy_Block::ckNew(n_chares_x, n_chares_y);
+    init_start_time = CkWallTimer();
+    block_proxy.init();
+  }
+
+  void initDone() {
+    CkPrintf("Init time: %.3lf s\n", CkWallTimer() - init_start_time);
+
+    startIter();
+  }
+
+  void startIter() {  // Begins one iteration: bumps the counter, stamps the timers, broadcasts exchangeGhosts
+    if (my_iter++ == warmup_iters) start_time = CkWallTimer();  // overall timer starts on the first post-warm-up iteration
+    update_start_time = CkWallTimer();
+
+    block_proxy.exchangeGhosts();
+  }
+
+  void updateDone() {
+    if (my_iter > warmup_iters) update_agg_time += CkWallTimer() - update_start_time;
+    comm_start_time = CkWallTimer();
+
+    block_proxy.packGhosts();
+  }
+
+  void commDone() {  // Accumulates communication time, then either finishes or starts the next iteration
+    if (my_iter > warmup_iters) comm_agg_time += CkWallTimer() - comm_start_time;  // warm-up iterations are excluded from the aggregate
+
+    if (my_iter == warmup_iters + n_iters) {  // my_iter was already incremented in startIter, so this is true after n_iters timed iterations
+      allDone();
+    } else {
+      startIter();
+    }
+  }
+
+  void allDone() {  // Prints the timing summary, then either asks block (0,0) to print or exits
+    double total_time = CkWallTimer() - start_time;  // start_time was stamped in startIter once warm-up finished
+    CkPrintf("Total time: %.3lf s\nAverage iteration time: %.3lf us\n",
+        total_time, (total_time / n_iters) * 1e6);
+    if (sync_ver) {  // the comm/update breakdown is reported only for the bulk-synchronous version
+      CkPrintf("Comm time per iteration: %.3lf us\nUpdate time per iteration: %.3lf us\n",
+          (comm_agg_time / n_iters) * 1e6, (update_agg_time / n_iters) * 1e6);
+    }
+
+    if (print_elements) {
+      sleep(1);  // NOTE(review): presumably lets pending output drain before printing - confirm the intent
+      block_proxy(0,0).print();  // no CkExit on this path; presumably printDone() is invoked afterwards (TODO confirm)
+    } else {
+      CkExit();
+    }
+  }
+
+  void printDone() {
+    CkExit();
+  }
+};
+
+class Block : public CBase_Block {
+  Block_SDAG_CODE
+
+ public:
+  int my_iter;
+  int neighbors;
+  int remote_count;
+  int x, y;
+  int load_iters;
+
+  DataType* __restrict__ h_temperature;
+  DataType* __restrict__ d_temperature;
+  DataType* __restrict__ d_new_temperature;
+  DataType* __restrict__ h_left_ghost;
+  DataType* __restrict__ h_right_ghost;
+  DataType* __restrict__ h_top_ghost;
+  DataType* __restrict__ h_bottom_ghost;
+  DataType* __restrict__ d_left_ghost;
+  DataType* __restrict__ d_right_ghost;
+  DataType* __restrict__ d_send_left_ghost;
+  DataType* __restrict__ d_send_right_ghost;
+  DataType* __restrict__ d_send_top_ghost;
+  DataType* __restrict__ d_send_bottom_ghost;
+  DataType* __restrict__ d_recv_left_ghost;
+  DataType* __restrict__ d_recv_right_ghost;
+
+  cudaStream_t compute_stream;
+  cudaStream_t comm_stream;
+
+  cudaEvent_t compute_event;
+  cudaEvent_t comm_event;
+
+  bool left_bound, right_bound, top_bound, bottom_bound;
+
+  Block() {
+    usesAtSync = true;  // this element calls AtSync() from iterate()
+  }
+
+  Block(CkMigrateMessage* m) {
+    usesAtSync = true;
+    // pup() does not serialize the streams or events, so fresh ones are created here
+    hapiCheck(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, 0));
+    hapiCheck(cudaStreamCreateWithPriority(&comm_stream, cudaStreamDefault, -1));
+    hapiCheck(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming));
+    hapiCheck(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming));
+  }
+
+  ~Block() {  // NOTE(review): host buffers from init() are never freed (frees below are disabled)
+    // hapiCheck(cudaFreeHost(h_temperature));
+    hapiCheck(cudaFree(d_temperature));
+    hapiCheck(cudaFree(d_new_temperature));
+    // hapiCheck(cudaFreeHost(h_left_ghost));
+    // hapiCheck(cudaFreeHost(h_right_ghost));
+    // hapiCheck(cudaFreeHost(h_top_ghost));
+    // hapiCheck(cudaFreeHost(h_bottom_ghost));
+    if (!use_zerocopy) {  // mirrors the allocation split in init() and pup()
+      hapiCheck(cudaFree(d_left_ghost));
+      hapiCheck(cudaFree(d_right_ghost));
+    } else {
+      hapiCheck(cudaFree(d_send_left_ghost));
+      hapiCheck(cudaFree(d_send_right_ghost));
+      hapiCheck(cudaFree(d_send_top_ghost));
+      hapiCheck(cudaFree(d_send_bottom_ghost));
+      hapiCheck(cudaFree(d_recv_left_ghost));
+      hapiCheck(cudaFree(d_recv_right_ghost));
+    }
+
+    hapiCheck(cudaStreamDestroy(compute_stream));
+    hapiCheck(cudaStreamDestroy(comm_stream));
+
+    hapiCheck(cudaEventDestroy(compute_event));
+    hapiCheck(cudaEventDestroy(comm_event));
+  }
+
+  void pup(PUP::er& p) {  // (de)serializes the scalar state and both device temperature grids
+    p | my_iter;
+    p | neighbors;
+    p | remote_count;
+    p | x;
+    p | y;
+    p | left_bound;
+    p | right_bound;
+    p | top_bound;
+    p | bottom_bound;
+    p | load_iters;
+    // When unpacking, allocate every buffer init() allocates, including the host ones
+    if (p.isUnpacking()) {
+      hapiCheck(hapiMallocHost((void**)&h_temperature,  // read back by update() and print()
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMalloc((void**)&d_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMalloc((void**)&d_new_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height));  // host staging used by
+      hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height)); // packGhosts() and
+      hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width));    // processGhostsReg();
+      hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width)); // must exist after unpack
+      if (!use_zerocopy) {
+        hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height));
+      } else {
+        hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width));
+        hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width));
+        hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height));
+      }
+    }
+    // NOTE(review): PUPMode::DEVICE presumably transfers the grids from/to device memory - confirm
+    p(d_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE);
+    p(d_new_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE);
+  }
+
+  void init() {  // sets up indices, boundary flags, buffers, streams and the initial grid
+    // Initialize values
+    my_iter = 0;
+    neighbors = 0;
+    x = thisIndex.x;
+    y = thisIndex.y;
+
+    load_iters = (((float) (x + y)) / (n_chares_x + n_chares_y)) * imbalance;
+    if (load_iters < 1) load_iters = 1;  // jacobiKernel divides by this count; 0 would divide by zero
+
+    std::ostringstream os;
+    os << "Init (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::Turquoise);  // named: an unnamed temporary dies at the ';'
+
+    // Check bounds and set number of valid neighbors
+    left_bound = right_bound = top_bound = bottom_bound = false;
+    if (thisIndex.x == 0)
+      left_bound = true;
+    else
+      neighbors++;
+    if (thisIndex.x == n_chares_x - 1)
+      right_bound = true;
+    else
+      neighbors++;
+    if (thisIndex.y == 0)
+      top_bound = true;
+    else
+      neighbors++;
+    if (thisIndex.y == n_chares_y - 1)
+      bottom_bound = true;
+    else
+      neighbors++;
+
+    // Allocate memory and create CUDA entities
+    hapiCheck(hapiMallocHost((void**)&h_temperature,
+          sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+    hapiCheck(hapiMalloc((void**)&d_temperature,
+          sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+    hapiCheck(hapiMalloc((void**)&d_new_temperature,
+          sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+    hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height));
+    hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height));
+    hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width));
+    hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width));
+    if (!use_zerocopy) {
+      hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height));
+    } else {
+      hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height));
+    }
+
+    hapiCheck(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, 0));
+    hapiCheck(cudaStreamCreateWithPriority(&comm_stream, cudaStreamDefault, -1));
+
+    hapiCheck(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming));
+    hapiCheck(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming));
+
+    // Initialize temperature data
+    invokeInitKernel(d_temperature, block_width, block_height, compute_stream);
+    invokeInitKernel(d_new_temperature, block_width, block_height, compute_stream);
+
+    // Enforce boundary conditions
+    invokeBoundaryKernels(d_temperature, block_width, block_height, left_bound,
+        right_bound, top_bound, bottom_bound, compute_stream);
+    invokeBoundaryKernels(d_new_temperature, block_width, block_height, left_bound,
+        right_bound, top_bound, bottom_bound, compute_stream);
+
+#if CUDA_SYNC
+    hapiCheck(cudaStreamSynchronize(compute_stream));  // surface asynchronous kernel errors
+    thisProxy[thisIndex].initDone();
+#else
+    // TODO: Support reduction callback in hapiAddCallback
+    CkCallback* cb = new CkCallback(CkIndex_Block::initDone(), thisProxy[thisIndex]);
+    hapiAddCallback(compute_stream, cb);
+#endif
+  }
+
+  void initDone() {  // joins the reduction whose target is Main::initDone
+    contribute(CkCallback(CkReductionTarget(Main, initDone), main_proxy));
+  }
+
+  void iterate() {  // either enters a load-balancing step (AtSync) or starts the next iteration
+    if (my_iter == first_lb || (lb_freq != 0 && my_iter != 0 && my_iter % lb_freq == 0)) {  // lb_freq 0: no periodic LB
+      hapiCheck(cudaStreamSynchronize(comm_stream));     // drain both streams before AtSync()
+      hapiCheck(cudaStreamSynchronize(compute_stream));
+      AtSync();
+    } else {
+      thisProxy[thisIndex].exchangeGhosts();
+    }
+  }
+
+  void ResumeFromSync() {  // continues with the iteration that iterate() deferred via AtSync()
+    thisProxy[thisIndex].exchangeGhosts();
+  }
+
+  void update() {  // enqueues one Jacobi step on compute_stream, ordered against comm_stream
+    std::ostringstream os;
+    os << "update (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::WetAsphalt);  // named: an unnamed temporary dies at the ';'
+
+    // Operations in compute stream should only be executed when
+    // operations in communication stream (transfers and unpacking) complete
+    hapiCheck(cudaEventRecord(comm_event, comm_stream));
+    hapiCheck(cudaStreamWaitEvent(compute_stream, comm_event, 0));
+
+#if !COMM_ONLY
+    // Invoke GPU kernel for Jacobi computation
+    invokeJacobiKernel(d_temperature, d_new_temperature, block_width, block_height, load_iters,
+        compute_stream);
+#endif
+
+    // Operations in communication stream (packing and transfers) should
+    // only be executed when operations in compute stream complete
+    hapiCheck(cudaEventRecord(compute_event, compute_stream));
+    hapiCheck(cudaStreamWaitEvent(comm_stream, compute_event, 0));
+
+    // Copy final temperature data back to host
+    if (print_elements && (my_iter == warmup_iters + n_iters)) {
+      hapiCheck(hapiMemcpyAsync(h_temperature, d_new_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2),
+            cudaMemcpyDeviceToHost, comm_stream));
+    }
+
+    if (sync_ver) {
+#if CUDA_SYNC
+      hapiCheck(cudaStreamSynchronize(compute_stream));  // surface asynchronous kernel errors
+      thisProxy[thisIndex].updateDone();
+#else
+      CkCallback* cb = new CkCallback(CkIndex_Block::updateDone(), thisProxy[thisIndex]);
+      hapiAddCallback(compute_stream, cb);
+#endif
+    }
+  }
+
+  void updateDone() {  // joins the reduction whose target is Main::updateDone
+    contribute(CkCallback(CkReductionTarget(Main, updateDone), main_proxy));
+  }
+
+  void packGhosts() {  // enqueues ghost packing/staging on comm_stream, then triggers packGhostsDone
+    std::ostringstream os;
+    os << "packGhosts (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::Emerald);  // named: an unnamed temporary dies at the ';'
+
+    if (use_zerocopy) {
+#if !COMM_ONLY
+      // Pack non-contiguous ghosts to temporary contiguous buffers on device
+      invokePackingKernels(d_new_temperature, d_send_left_ghost, d_send_right_ghost,
+          left_bound, right_bound, block_width, block_height, comm_stream);
+#endif
+
+      // Copy top and bottom ghosts to send buffers
+      if (!top_bound)
+        hapiCheck(hapiMemcpyAsync(d_send_top_ghost, d_new_temperature + (block_width + 2) + 1,
+              block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream));
+      if (!bottom_bound)
+        hapiCheck(hapiMemcpyAsync(d_send_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
+              block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream));
+    } else {
+#if !COMM_ONLY
+      // Pack non-contiguous ghosts to temporary contiguous buffers on device
+      invokePackingKernels(d_new_temperature, d_left_ghost, d_right_ghost,
+          left_bound, right_bound, block_width, block_height, comm_stream);
+#endif
+
+      // Transfer ghosts from device to host
+      if (!left_bound)
+        hapiCheck(hapiMemcpyAsync(h_left_ghost, d_left_ghost, block_height * sizeof(DataType),
+              cudaMemcpyDeviceToHost, comm_stream));
+      if (!right_bound)
+        hapiCheck(hapiMemcpyAsync(h_right_ghost, d_right_ghost, block_height * sizeof(DataType),
+              cudaMemcpyDeviceToHost, comm_stream));
+      if (!top_bound)
+        hapiCheck(hapiMemcpyAsync(h_top_ghost, d_new_temperature + (block_width + 2) + 1,
+              block_width * sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream));
+      if (!bottom_bound)
+        hapiCheck(hapiMemcpyAsync(h_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
+              block_width * sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream));
+    }
+
+#if CUDA_SYNC
+    hapiCheck(cudaStreamSynchronize(comm_stream));  // surface asynchronous copy/kernel errors
+    thisProxy[thisIndex].packGhostsDone();
+#else
+    // Add asynchronous callback to be invoked when packing kernels and
+    // ghost transfers are complete
+    CkCallback* cb = new CkCallback(CkIndex_Block::packGhostsDone(), thisProxy[thisIndex]);
+    hapiAddCallback(comm_stream, cb);
+#endif
+  }
+
+  void sendGhosts() {  // sends each packed ghost to the neighbor on that side, tagged with my_iter
+    std::ostringstream os;
+    os << "sendGhosts (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::PeterRiver);  // named: an unnamed temporary dies at the ';'
+
+    // Send ghosts to neighboring chares
+    if (use_zerocopy) {
+      if (!left_bound)
+        thisProxy(x - 1, y).receiveGhostsZC(my_iter, RIGHT, block_height,
+            CkDeviceBuffer(d_send_left_ghost, comm_stream));
+      if (!right_bound)
+        thisProxy(x + 1, y).receiveGhostsZC(my_iter, LEFT, block_height,
+            CkDeviceBuffer(d_send_right_ghost, comm_stream));
+      if (!top_bound)
+        thisProxy(x, y - 1).receiveGhostsZC(my_iter, BOTTOM, block_width,
+            CkDeviceBuffer(d_send_top_ghost, comm_stream));
+      if (!bottom_bound)
+        thisProxy(x, y + 1).receiveGhostsZC(my_iter, TOP, block_width,
+            CkDeviceBuffer(d_send_bottom_ghost, comm_stream));
+    } else {
+      if (!left_bound)
+        thisProxy(x - 1, y).receiveGhostsReg(my_iter, RIGHT, block_height, h_left_ghost);
+      if (!right_bound)
+        thisProxy(x + 1, y).receiveGhostsReg(my_iter, LEFT, block_height, h_right_ghost);
+      if (!top_bound)
+        thisProxy(x, y - 1).receiveGhostsReg(my_iter, BOTTOM, block_width, h_top_ghost);
+      if (!bottom_bound)
+        thisProxy(x, y + 1).receiveGhostsReg(my_iter, TOP, block_width, h_bottom_ghost);
+    }
+  }
+
+  // Post entry method for receiveGhostsZC (the regular entry method is the SDAG
+  // `when` clause in the .ci file): picks the device address the incoming ghost lands in
+  void receiveGhostsZC(int ref, int dir, int &size, DataType *&buf, CkDeviceBufferPost *devicePost) {
+    if (dir == LEFT) {
+      // Left/right ghosts go to staging buffers; processGhostsZC() unpacks them later
+      buf = d_recv_left_ghost;
+    } else if (dir == RIGHT) {
+      buf = d_recv_right_ghost;
+    } else if (dir == TOP) {
+      // Rows are contiguous: receive straight into ghost row 0, starting at column 1
+      buf = d_temperature + 1;
+    } else if (dir == BOTTOM) {
+      // Ghost row block_height + 1, starting at column 1
+      buf = d_temperature + (block_width + 2) * (block_height + 1) + 1;
+    } else {
+      CkAbort("Error: invalid direction");
+    }
+
+    // The transfer is associated with the communication stream
+    devicePost[0].hapi_stream = comm_stream;
+  }
+
+  void processGhostsZC(int dir, int size, DataType* gh) {  // unpacks a zero-copy ghost where needed
+    std::ostringstream os;
+    os << "processGhostsZC (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::Amethyst);  // named: an unnamed temporary dies at the ';'
+
+    switch (dir) {
+      case LEFT:
+        invokeUnpackingKernel(d_temperature, d_recv_left_ghost, true, block_width,
+            block_height, comm_stream);
+        break;
+      case RIGHT:
+        invokeUnpackingKernel(d_temperature, d_recv_right_ghost, false, block_width,
+            block_height, comm_stream);
+        break;
+      case TOP:
+      case BOTTOM:  // rows were received directly into d_temperature by the post entry method
+        break;
+      default:
+        CkAbort("Error: invalid direction");
+    }
+  }
+
+  void processGhostsReg(int dir, int size, DataType* gh) {  // stages a host ghost and copies it to the device
+    std::ostringstream os;
+    os << "processGhostsReg (" << std::to_string(x) << "," << std::to_string(y) << ")";
+    NVTXTracer nvtx_range(os.str(), NVTXColor::Amethyst);  // named: an unnamed temporary dies at the ';'
+
+    switch (dir) {
+      case LEFT:
+        memcpy(h_left_ghost, gh, size * sizeof(DataType));
+        hapiCheck(hapiMemcpyAsync(d_left_ghost, h_left_ghost,
+              block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+#if !COMM_ONLY
+        invokeUnpackingKernel(d_temperature, d_left_ghost, true, block_width,
+            block_height, comm_stream);
+#endif
+        break;
+      case RIGHT:
+        memcpy(h_right_ghost, gh, size * sizeof(DataType));
+        hapiCheck(hapiMemcpyAsync(d_right_ghost, h_right_ghost,
+              block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+#if !COMM_ONLY
+        invokeUnpackingKernel(d_temperature, d_right_ghost, false, block_width,
+            block_height, comm_stream);
+#endif
+        break;
+      case TOP:  // rows are contiguous, so they are copied straight into ghost row 0
+        memcpy(h_top_ghost, gh, size * sizeof(DataType));
+        hapiCheck(hapiMemcpyAsync(d_temperature + 1, h_top_ghost,
+              block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        break;
+      case BOTTOM:  // ghost row block_height + 1
+        memcpy(h_bottom_ghost, gh, size * sizeof(DataType));
+        hapiCheck(hapiMemcpyAsync(d_temperature + (block_width + 2) * (block_height + 1) + 1,
+              h_bottom_ghost, block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        break;
+      default:
+        CkAbort("Error: invalid direction");
+    }
+  }
+
+  void print() {  // prints this block's host grid, then passes the turn on in x-then-y order
+    CkPrintf("[%d,%d]\n", thisIndex.x, thisIndex.y);
+    const int row_stride = block_width + 2;
+    for (int row = 0; row < block_height + 2; row++) {
+      for (int col = 0; col < row_stride; col++) {
+#ifdef TEST_CORRECTNESS
+        CkPrintf("%d ", h_temperature[row_stride * row + col]);
+#else
+        CkPrintf("%.6lf ", h_temperature[row_stride * row + col]);
+#endif
+      }
+      CkPrintf("\n");
+    }
+    const bool last_in_x = (thisIndex.x == n_chares_x-1);
+    const bool last_in_y = (thisIndex.y == n_chares_y-1);
+    if (last_in_x && last_in_y) {
+      main_proxy.printDone();
+    } else if (last_in_x) {
+      thisProxy(0,thisIndex.y+1).print();
+    } else {
+      thisProxy(thisIndex.x+1,thisIndex.y).print();
+    }
+  }
+};
+
+#include "jacobi2d.def.h"
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci
new file mode 100644
index 0000000000..3957f23bfd
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.ci
@@ -0,0 +1,90 @@
+mainmodule jacobi2d {
+  include "jacobi2d.h";
+  // Readonly variables shared by Main and all Block elements
+  readonly CProxy_Main main_proxy;
+  readonly CProxy_Block block_proxy;
+  readonly int grid_width;
+  readonly int grid_height;
+  readonly int block_width;
+  readonly int block_height;
+  readonly int n_chares_x;
+  readonly int n_chares_y;
+  readonly int n_iters;
+  readonly int warmup_iters;
+  readonly bool sync_ver;
+  readonly bool use_zerocopy;
+  readonly bool print_elements;
+  readonly int lb_freq;
+  readonly int first_lb;
+  readonly int imbalance;
+  // Main times the run and, while it is in control, starts each phase on all Blocks
+  mainchare Main {
+    entry Main(CkArgMsg* m);
+    entry [reductiontarget] void initDone();
+    entry void startIter();
+    entry [reductiontarget] void updateDone();
+    entry [reductiontarget] void commDone();
+    entry [reductiontarget] void allDone();
+    entry void printDone();
+  };
+  // 2D chare array; Main creates it with n_chares_x by n_chares_y elements
+  array [2D] Block {
+    entry Block(void);
+    entry void init();
+    entry void initDone();
+    entry void update();
+    entry void updateDone();
+    entry void packGhosts();
+    entry void packGhostsDone();
+    entry void receiveGhostsZC(int ref, int dir, int w, nocopydevice DataType gh[w]);
+    entry void receiveGhostsReg(int ref, int dir, int w, DataType gh[w]);
+    entry void iterate();
+    // One full iteration: update, pack and send ghosts, then receive from every neighbor
+    entry void exchangeGhosts() {
+      serial {
+        my_iter++;
+        update();
+        if (!sync_ver) packGhosts();  // with sync_ver, Main::updateDone() starts the packing
+      }
+
+      when packGhostsDone() {
+        serial {
+          // When packing is done, we know that the new temperatures have been updated
+          // (because the host doesn't separately detect when the Jacobi kernel completes)
+          std::swap(d_temperature, d_new_temperature);
+          sendGhosts();
+        }
+      }
+      // Wait for one ghost message per neighbor, matched on the current my_iter
+      for (remote_count = 0; remote_count < neighbors; remote_count++) {
+        if (use_zerocopy) {
+          when receiveGhostsZC[my_iter](int ref, int dir, int w, nocopydevice DataType buf[w]) {
+            serial {
+              processGhostsZC(dir, w, buf);
+            }
+          }
+        } else {
+          when receiveGhostsReg[my_iter](int ref, int dir, int w, DataType buf[w]) {
+            serial {
+              processGhostsReg(dir, w, buf);
+            }
+          }
+        }
+      }
+      // Report to Main (sync mode or warm-up), start the next iteration, or finish
+      serial {
+        if (sync_ver || my_iter <= warmup_iters) {
+          contribute(CkCallback(CkReductionTarget(Main, commDone), main_proxy));
+        } else {
+          if (my_iter < warmup_iters + n_iters) {
+            thisProxy[thisIndex].iterate();
+          } else {
+            contribute(CkCallback(CkReductionTarget(Main, allDone), main_proxy));
+          }
+        }
+      }
+    }
+
+    entry void print();
+  };
+};
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu
new file mode 100644
index 0000000000..de1001d1c6
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.cu
@@ -0,0 +1,195 @@
+#include "hapi.h"
+#include "jacobi2d.h"
+
+#define TILE_SIZE 16
+#define DIVIDEBY5 0.2
+
+__global__ void initKernel(DataType* temperature, int block_width,
+    int block_height) {
+  const int col = blockDim.x * blockIdx.x + threadIdx.x;
+  const int row = blockDim.y * blockIdx.y + threadIdx.y;
+  // Zero every cell of the padded (block_width+2) x (block_height+2) grid
+  if (col >= block_width + 2 || row >= block_height + 2) return;
+  temperature[IDX(col,row)] = 0;
+}
+
+__global__ void leftBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // One thread per interior row: write 1 into column 0
+  if (row >= block_height) return;
+  temperature[IDX(0,1+row)] = 1;
+}
+
+__global__ void rightBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // One thread per interior row: write 1 into column block_width + 1
+  if (row >= block_height) return;
+  temperature[IDX(block_width+1,1+row)] = 1;
+}
+
+__global__ void topBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  const int col = blockDim.x * blockIdx.x + threadIdx.x;
+  // One thread per interior column: write 1 into row 0
+  if (col >= block_width) return;
+  temperature[IDX(1+col,0)] = 1;
+}
+
+__global__ void bottomBoundaryKernel(DataType* temperature, int block_width,
+    int block_height) {
+  const int col = blockDim.x * blockIdx.x + threadIdx.x;
+  // One thread per interior column: write 1 into row block_height + 1
+  if (col >= block_width) return;
+  temperature[IDX(1+col,block_height+1)] = 1;
+}
+
+__global__ void jacobiKernel(DataType* temperature, DataType* new_temperature,
+    int block_width, int block_height, int iter) {  // one thread per interior cell (i,j)
+  int i = (blockDim.x * blockIdx.x + threadIdx.x) + 1;
+  int j = (blockDim.y * blockIdx.y + threadIdx.y) + 1;
+  if (iter < 1) iter = 1;  // the average below divides by iter; never divide by zero
+  if (i <= block_width && j <= block_height) {
+#ifdef TEST_CORRECTNESS
+    new_temperature[IDX(i,j)] = (temperature[IDX(i-1,j)] + temperature[IDX(i+1,j)] +
+      temperature[IDX(i,j-1)] + temperature[IDX(i,j+1)] + temperature[IDX(i,j)]) %
+      100000;  // integer modulus: '%' does not accept the double literal 1e5
+#else
+    DataType temp = 0;
+    // The same 5-point average is accumulated iter times, presumably to scale the per-block work
+    for (int it = 0; it < iter; it++)
+      temp += (temperature[IDX(i-1,j)] + temperature[IDX(i+1,j)] +
+      temperature[IDX(i,j-1)] + temperature[IDX(i,j+1)] + temperature[IDX(i,j)]) *
+      DIVIDEBY5;
+
+    new_temperature[IDX(i,j)] = temp / iter;
+#endif
+  }
+}
+
+__global__ void leftPackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // Gather column 1 (first interior column) into the contiguous ghost buffer
+  if (row >= block_height) return;
+  ghost[row] = temperature[IDX(1,1+row)];
+}
+
+__global__ void rightPackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // Gather column block_width (last interior column) into the contiguous ghost buffer
+  if (row >= block_height) return;
+  ghost[row] = temperature[IDX(block_width,1+row)];
+}
+
+__global__ void leftUnpackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // Scatter the contiguous ghost buffer into column 0
+  if (row >= block_height) return;
+  temperature[IDX(0,1+row)] = ghost[row];
+}
+
+__global__ void rightUnpackingKernel(DataType* temperature, DataType* ghost,
+    int block_width, int block_height) {
+  const int row = blockDim.x * blockIdx.x + threadIdx.x;
+  // Scatter the contiguous ghost buffer into column block_width + 1
+  if (row >= block_height) return;
+  temperature[IDX(block_width+1,1+row)] = ghost[row];
+}
+
+void invokeInitKernel(DataType* d_temperature, int block_width, int block_height,
+    cudaStream_t stream) {  // launches initKernel over the padded grid on `stream`
+  dim3 block_dim(TILE_SIZE, TILE_SIZE);
+  dim3 grid_dim(((block_width + 2) + (block_dim.x - 1)) / block_dim.x,
+      ((block_height + 2) + (block_dim.y - 1)) / block_dim.y);
+
+  HAPI_LAUNCH_KERNEL_WRAPPER((initKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, block_width, block_height)), stream)
+  // NOTE(review): HAPI_LAUNCH_KERNEL_WRAPPER is defined outside this file; it receives the
+  // launch statement and the stream as written above. Launch errors are checked next.
+  hapiCheck(cudaPeekAtLastError());
+}
+
+void invokeBoundaryKernels(DataType* d_temperature, int block_width,
+    int block_height, bool left_bound, bool right_bound, bool top_bound,
+    bool bottom_bound, cudaStream_t stream) {
+  // Writes the value 1 into the outermost cells of every side whose *_bound
+  // flag is set. All launches go to `stream`, one thread per written cell.
+  const dim3 threads(TILE_SIZE * TILE_SIZE);
+
+  // Left/right columns hold block_height cells, top/bottom rows hold
+  // block_width cells, so two different 1D grid sizes are needed
+  const dim3 column_blocks((block_height + (threads.x - 1)) / threads.x);
+  const dim3 row_blocks((block_width + (threads.x - 1)) / threads.x);
+
+  if (left_bound) {
+    leftBoundaryKernel<<<column_blocks, threads, 0, stream>>>(d_temperature,
+       block_width, block_height);
+  }
+
+  if (right_bound) {
+    rightBoundaryKernel<<<column_blocks, threads, 0, stream>>>(d_temperature,
+       block_width, block_height);
+  }
+
+  if (top_bound) {
+    topBoundaryKernel<<<row_blocks, threads, 0, stream>>>(d_temperature,
+       block_width, block_height);
+  }
+
+  if (bottom_bound) {
+    bottomBoundaryKernel<<<row_blocks, threads, 0, stream>>>(d_temperature,
+       block_width, block_height);
+  }
+
+  // Report a failed launch right away rather than at a later, unrelated call
+  hapiCheck(cudaPeekAtLastError());
+}
+
+void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature,
+    int block_width, int block_height, int iter, cudaStream_t stream) {  // one thread per interior cell
+  dim3 block_dim(TILE_SIZE, TILE_SIZE);
+  dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x,
+      (block_height + (block_dim.y - 1)) / block_dim.y);
+
+  HAPI_LAUNCH_KERNEL_WRAPPER((jacobiKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_new_temperature, block_width, block_height, iter)), stream)
+  // NOTE(review): HAPI_LAUNCH_KERNEL_WRAPPER is defined outside this file; it receives the
+  // launch statement and the stream as written above. Launch errors are checked next.
+  hapiCheck(cudaPeekAtLastError());
+}
+
+void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost,
+    DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width,
+    int block_height, cudaStream_t stream) {
+  // One thread per interior row; both kernels share the same 1D launch shape
+  const dim3 threads(TILE_SIZE * TILE_SIZE);
+  const dim3 blocks((block_height + (threads.x - 1)) / threads.x);
+
+  // Nothing is packed for a side whose *_bound flag is set
+  if (!left_bound) {
+    leftPackingKernel<<<blocks, threads, 0, stream>>>(d_temperature, d_left_ghost, block_width, block_height);
+  }
+  if (!right_bound) {
+    rightPackingKernel<<<blocks, threads, 0, stream>>>(d_temperature, d_right_ghost, block_width, block_height);
+  }
+  // Report a failed launch right away rather than at a later, unrelated call
+  hapiCheck(cudaPeekAtLastError());
+}
+
+void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, bool is_left,
+    int block_width, int block_height, cudaStream_t stream) {
+  // One thread per interior row of the column being filled
+  const dim3 threads(TILE_SIZE * TILE_SIZE);
+  const dim3 blocks((block_height + (threads.x - 1)) / threads.x);
+
+  // is_left fills column 0; otherwise column block_width + 1 is filled
+  if (!is_left) {
+    rightUnpackingKernel<<<blocks, threads, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+  } else {
+    leftUnpackingKernel<<<blocks, threads, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+  }
+  // Report a failed launch right away rather than at a later, unrelated call
+  hapiCheck(cudaPeekAtLastError());
+}
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h
new file mode 100644
index 0000000000..56c3aa7662
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/jacobi2d-imbalance/jacobi2d.h
@@ -0,0 +1,12 @@
+#ifndef __CUDA_GPUDIRECT_JACOBI2D_H_
+#define __CUDA_GPUDIRECT_JACOBI2D_H_
+// Element type of the temperature grid: int when TEST_CORRECTNESS is defined, float otherwise
+#ifdef TEST_CORRECTNESS
+typedef int DataType;
+#else
+typedef float DataType;
+#endif
+// Row-major index of cell (x,y); expects a variable named block_width in scope at the use site
+#define IDX(x,y) ((block_width+2)*(y)+(x))
+
+#endif // __CUDA_GPUDIRECT_JACOBI2D_H_
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/Makefile b/examples/charm++/cuda/gpudirect/jacobi2d/Makefile
index 72d1f7cb44..9e38264283 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/Makefile
+++ b/examples/charm++/cuda/gpudirect/jacobi2d/Makefile
@@ -1,31 +1,37 @@
-OPTS = -O3
+OPTS = -O3 -DHAPI_CUDA_CALLBACK
 
-CHARM_DIR = ../../../../..
+CHARM_DIR ?= ../../../../..
 CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS)
 CHARM_INC = -I$(CHARM_DIR)/include
+CHARMC_FLAGS = -D__HIP_PLATFORM_AMD__=ON
 
-NVCC = nvcc
-NVCC_FLAGS = -c -std=c++11 -use_fast_math $(OPTS)
-LD_LIBS =
+HAPITOOLKIT_HOME ?= /opt/rocm
+HAPICC = hipcc
+HAPICC_FLAGS = -c -std=c++11
+HAPICC_INC = -I$(HAPITOOLKIT_HOME)/include
+HAPICC_LIB = -L$(HAPITOOLKIT_HOME)/lib
+
+LD_LIBS = -module EveryLB
 
 TARGET = jacobi2d
 all: $(TARGET)
 
-OBJS = $(TARGET).o $(TARGET)CUDA.o
+OBJS = $(TARGET).o $(TARGET)HAPI.o
 
 $(TARGET): $(OBJS)
-	$(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS)
-
-$(TARGET).decl.h: $(TARGET).ci $(TARGET).h
-	$(CHARMC) $<
+	$(CHARMC) $(CHARMC_FLAGS) -language charm++ -o $@ $(OBJS) $(LD_LIBS)
 
-$(TARGET).def.h: $(TARGET).ci $(TARGET).h
+$(TARGET).decl.h: $(TARGET).ci
+	$(CHARMC) $(CHARMC_FLAGS) $<
 
-$(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h $(TARGET).h
-	$(CHARMC) -c $<
+$(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).h
+	$(CHARMC) $(CHARMC_FLAGS) -c $<
 
-$(TARGET)CUDA.o: $(TARGET).cu $(TARGET).h
-	$(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $<
+$(TARGET)HAPI.o: $(TARGET).cu $(TARGET).h
+	$(HAPICC) $(CHARMC_FLAGS) -o $@ $(HAPICC_FLAGS) $(HAPICC_INC) $(CHARM_INC) $<
 
 clean:
 	rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun
+
+test: all
+	$(call run, ./$(TARGET) +p2)
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C
index 6e10917377..62a461ea27 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C
+++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.C
@@ -21,19 +21,21 @@
 /* readonly */ bool sync_ver;
 /* readonly */ bool use_zerocopy;
 /* readonly */ bool print_elements;
+/* readonly */ int lb_freq;
+/* readonly */ int first_lb;
 
 extern void invokeInitKernel(DataType* d_temperature, int block_width,
-    int block_height, cudaStream_t stream);
+    int block_height, hapiStream_t stream);
 extern void invokeBoundaryKernels(DataType* d_temperature, int block_width,
     int block_height, bool left_bound, bool right_bound, bool top_bound,
-    bool bottom_bound, cudaStream_t stream);
+    bool bottom_bound, hapiStream_t stream);
 extern void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature,
-    int block_width, int block_height, cudaStream_t stream);
+    int block_width, int block_height, hapiStream_t stream);
 extern void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost,
     DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width,
-    int block_height, cudaStream_t stream);
+    int block_height, hapiStream_t stream);
 extern void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost,
-    bool is_left, int block_width, int block_height, cudaStream_t stream);
+    bool is_left, int block_width, int block_height, hapiStream_t stream);
 
 enum Direction { LEFT = 1, RIGHT, TOP, BOTTOM };
 
@@ -60,6 +62,8 @@ public:
     print_elements = false;
     sync_ver = false;
     my_iter = 0;
+    first_lb = 10;
+    lb_freq = 100;
 
     // Initialize aggregate timers
     update_agg_time = 0.0;
@@ -120,7 +124,7 @@ public:
         "Warm-up: %d, Bulk-synchronous: %d, Zerocopy: %d, Print: %d\n\n",
         grid_width, grid_height, block_width, block_height, n_chares_x, n_chares_y,
         n_iters, warmup_iters, sync_ver, use_zerocopy, print_elements);
-
+fflush(stdout);
     // Create blocks and start iteration
     block_proxy = CProxy_Block::ckNew(n_chares_x, n_chares_y);
     init_start_time = CkWallTimer();
@@ -129,7 +133,7 @@ public:
 
   void initDone() {
     CkPrintf("Init time: %.3lf s\n", CkWallTimer() - init_start_time);
-
+    fflush(stdout);
     startIter();
   }
 
@@ -161,9 +165,11 @@ public:
     double total_time = CkWallTimer() - start_time;
     CkPrintf("Total time: %.3lf s\nAverage iteration time: %.3lf us\n",
         total_time, (total_time / n_iters) * 1e6);
+  fflush(stdout);
     if (sync_ver) {
       CkPrintf("Comm time per iteration: %.3lf us\nUpdate time per iteration: %.3lf us\n",
           (comm_agg_time / n_iters) * 1e6, (update_agg_time / n_iters) * 1e6);
+fflush(stdout);
     }
 
     if (print_elements) {
@@ -185,6 +191,7 @@ class Block : public CBase_Block {
  public:
   int my_iter;
   int neighbors;
+  int send_done_idx;
   int remote_count;
   int x, y;
 
@@ -203,42 +210,102 @@ class Block : public CBase_Block {
   DataType* __restrict__ d_send_bottom_ghost;
   DataType* __restrict__ d_recv_left_ghost;
   DataType* __restrict__ d_recv_right_ghost;
+  DataType* __restrict__ d_recv_top_ghost;
+  DataType* __restrict__ d_recv_bottom_ghost;
 
-  cudaStream_t compute_stream;
-  cudaStream_t comm_stream;
+  hapiStream_t compute_stream;
+  hapiStream_t comm_stream;
 
-  cudaEvent_t compute_event;
-  cudaEvent_t comm_event;
+  hapiEvent_t compute_event;
+  hapiEvent_t comm_event;
 
   bool left_bound, right_bound, top_bound, bottom_bound;
 
-  Block() {}
+  Block() {
+    usesAtSync = true;
+    ckout << "[" << thisIndex.x << "," << thisIndex.y << "]" << CkMyPe() << endl;
+  }
+
+  // Migration constructor: recreates the streams and events on the new PE;
+  // the host/device buffers are reallocated in pup() while unpacking.
+  Block(CkMigrateMessage* m) {
+    usesAtSync = true;
+    ckout << "[" << thisIndex.x << "," << thisIndex.y << "]" << CkMyPe() << endl;
+    hapiCheck(hapiStreamCreateWithPriority(&compute_stream, hapiStreamDefault, 0));
+    hapiCheck(hapiStreamCreateWithPriority(&comm_stream, hapiStreamDefault, -1));
+    hapiCheck(hapiEventCreateWithFlags(&compute_event, hapiEventDisableTiming));
+    hapiCheck(hapiEventCreateWithFlags(&comm_event, hapiEventDisableTiming));
+  }
 
   ~Block() {
-    hapiCheck(cudaFreeHost(h_temperature));
-    hapiCheck(cudaFree(d_temperature));
-    hapiCheck(cudaFree(d_new_temperature));
-    hapiCheck(cudaFreeHost(h_left_ghost));
-    hapiCheck(cudaFreeHost(h_right_ghost));
-    hapiCheck(cudaFreeHost(h_top_ghost));
-    hapiCheck(cudaFreeHost(h_bottom_ghost));
+    //hapiStreamSynchronize(compute_stream);
+    //hapiStreamSynchronize(comm_stream);
+    hapiCheck(hapiFreeHost(h_temperature));
+    hapiCheck(hapiFree(d_temperature));
+    hapiCheck(hapiFree(d_new_temperature));
+    hapiCheck(hapiFreeHost(h_left_ghost));
+    hapiCheck(hapiFreeHost(h_right_ghost));
+    hapiCheck(hapiFreeHost(h_top_ghost));
+    hapiCheck(hapiFreeHost(h_bottom_ghost));
     if (!use_zerocopy) {
-      hapiCheck(cudaFree(d_left_ghost));
-      hapiCheck(cudaFree(d_right_ghost));
+      hapiCheck(hapiFree(d_left_ghost));
+      hapiCheck(hapiFree(d_right_ghost));
     } else {
-      hapiCheck(cudaFree(d_send_left_ghost));
-      hapiCheck(cudaFree(d_send_right_ghost));
-      hapiCheck(cudaFree(d_send_top_ghost));
-      hapiCheck(cudaFree(d_send_bottom_ghost));
-      hapiCheck(cudaFree(d_recv_left_ghost));
-      hapiCheck(cudaFree(d_recv_right_ghost));
+      hapiCheck(hapiFree(d_send_left_ghost));
+      hapiCheck(hapiFree(d_send_right_ghost));
+      hapiCheck(hapiFree(d_send_top_ghost));
+      hapiCheck(hapiFree(d_send_bottom_ghost));
+      hapiCheck(hapiFree(d_recv_left_ghost));
+      hapiCheck(hapiFree(d_recv_right_ghost));
+      hapiCheck(hapiFree(d_recv_top_ghost));
+      hapiCheck(hapiFree(d_recv_bottom_ghost));
     }
 
-    hapiCheck(cudaStreamDestroy(compute_stream));
-    hapiCheck(cudaStreamDestroy(comm_stream));
+    hapiCheck(hapiStreamDestroy(compute_stream));
+    hapiCheck(hapiStreamDestroy(comm_stream));
 
-    hapiCheck(cudaEventDestroy(compute_event));
-    hapiCheck(cudaEventDestroy(comm_event));
+    hapiCheck(hapiEventDestroy(compute_event));
+    hapiCheck(hapiEventDestroy(comm_event));
+  }
+
+  void pup(PUP::er& p) {
+    p | my_iter;
+    p | neighbors;
+    p | remote_count;
+    p | x;
+    p | y;
+    p | left_bound;
+    p | right_bound;
+    p | top_bound;
+    p | bottom_bound;
+
+    if (p.isUnpacking()) {
+      hapiCheck(hapiMallocHost((void**)&h_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMalloc((void**)&d_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMalloc((void**)&d_new_temperature,
+            sizeof(DataType) * (block_width + 2) * (block_height + 2)));
+      hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width));
+      if (!use_zerocopy) {
+        hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height));
+      } else {
+        hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width));
+        hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width));
+        hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height));
+        hapiCheck(hapiMalloc((void**)&d_recv_top_ghost, sizeof(DataType) * block_width));
+        hapiCheck(hapiMalloc((void**)&d_recv_bottom_ghost, sizeof(DataType) * block_width));
+      }
+    }
+      
+    p(d_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE);
+    p(d_new_temperature, (block_width + 2) * (block_height + 2), PUP::PUPMode::DEVICE);
   }
 
   void init() {
@@ -272,33 +339,35 @@ class Block : public CBase_Block {
       neighbors++;
 
     // Allocate memory and create CUDA entities
-    hapiCheck(cudaMallocHost((void**)&h_temperature,
+    hapiCheck(hapiMallocHost((void**)&h_temperature,
           sizeof(DataType) * (block_width + 2) * (block_height + 2)));
-    hapiCheck(cudaMalloc((void**)&d_temperature,
+    hapiCheck(hapiMalloc((void**)&d_temperature,
           sizeof(DataType) * (block_width + 2) * (block_height + 2)));
-    hapiCheck(cudaMalloc((void**)&d_new_temperature,
+    hapiCheck(hapiMalloc((void**)&d_new_temperature,
           sizeof(DataType) * (block_width + 2) * (block_height + 2)));
-    hapiCheck(cudaMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height));
-    hapiCheck(cudaMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height));
-    hapiCheck(cudaMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width));
-    hapiCheck(cudaMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width));
+    hapiCheck(hapiMallocHost((void**)&h_left_ghost, sizeof(DataType) * block_height));
+    hapiCheck(hapiMallocHost((void**)&h_right_ghost, sizeof(DataType) * block_height));
+    hapiCheck(hapiMallocHost((void**)&h_top_ghost, sizeof(DataType) * block_width));
+    hapiCheck(hapiMallocHost((void**)&h_bottom_ghost, sizeof(DataType) * block_width));
     if (!use_zerocopy) {
-      hapiCheck(cudaMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height));
-      hapiCheck(cudaMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_right_ghost, sizeof(DataType) * block_height));
     } else {
-      hapiCheck(cudaMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height));
-      hapiCheck(cudaMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height));
-      hapiCheck(cudaMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width));
-      hapiCheck(cudaMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width));
-      hapiCheck(cudaMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height));
-      hapiCheck(cudaMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_send_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_send_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_send_top_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMalloc((void**)&d_send_bottom_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMalloc((void**)&d_recv_left_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_recv_right_ghost, sizeof(DataType) * block_height));
+      hapiCheck(hapiMalloc((void**)&d_recv_top_ghost, sizeof(DataType) * block_width));
+      hapiCheck(hapiMalloc((void**)&d_recv_bottom_ghost, sizeof(DataType) * block_width));
     }
 
-    hapiCheck(cudaStreamCreateWithPriority(&compute_stream, cudaStreamDefault, 0));
-    hapiCheck(cudaStreamCreateWithPriority(&comm_stream, cudaStreamDefault, -1));
+    hapiCheck(hapiStreamCreateWithPriority(&compute_stream, hapiStreamDefault, 0));
+    hapiCheck(hapiStreamCreateWithPriority(&comm_stream, hapiStreamDefault, -1));
 
-    hapiCheck(cudaEventCreateWithFlags(&compute_event, cudaEventDisableTiming));
-    hapiCheck(cudaEventCreateWithFlags(&comm_event, cudaEventDisableTiming));
+    hapiCheck(hapiEventCreateWithFlags(&compute_event, hapiEventDisableTiming));
+    hapiCheck(hapiEventCreateWithFlags(&comm_event, hapiEventDisableTiming));
 
     // Initialize temperature data
     invokeInitKernel(d_temperature, block_width, block_height, compute_stream);
@@ -311,7 +380,7 @@ class Block : public CBase_Block {
         right_bound, top_bound, bottom_bound, compute_stream);
 
 #if CUDA_SYNC
-    cudaStreamSynchronize(compute_stream);
+    hapiStreamSynchronize(compute_stream);
     thisProxy[thisIndex].initDone();
 #else
     // TODO: Support reduction callback in hapiAddCallback
@@ -324,15 +393,32 @@ class Block : public CBase_Block {
     contribute(CkCallback(CkReductionTarget(Main, initDone), main_proxy));
   }
 
+  void iterate() {  // either enters the AtSync() path or starts the next ghost exchange
+    if (my_iter != 0 && my_iter % 10 == 0) {  // every 10th iteration after the first
+      hapiCheck(hapiStreamSynchronize(comm_stream));  // drain both streams before AtSync()
+      hapiCheck(hapiStreamSynchronize(compute_stream));
+      AtSync();
+      ckout<<"called at sync"<<endl;
+    } else {
+      thisProxy[thisIndex].exchangeGhosts();
+    }
+  }
+
+  void ResumeFromSync() {
+    thisProxy[thisIndex].exchangeGhosts();
+  }
+
   void update() {
+    printf("[ITER] %d updating on Process: %d\n", my_iter, CmiMyNode());
+    fflush(stdout);
     std::ostringstream os;
     os << "update (" << std::to_string(x) << "," << std::to_string(y) << ")";
     NVTXTracer(os.str(), NVTXColor::WetAsphalt);
 
     // Operations in compute stream should only be executed when
     // operations in communication stream (transfers and unpacking) complete
-    hapiCheck(cudaEventRecord(comm_event, comm_stream));
-    hapiCheck(cudaStreamWaitEvent(compute_stream, comm_event, 0));
+    hapiCheck(hapiEventRecord(comm_event, comm_stream));
+    hapiCheck(hapiStreamWaitEvent(compute_stream, comm_event, 0));
 
 #if !COMM_ONLY
     // Invoke GPU kernel for Jacobi computation
@@ -342,19 +428,19 @@ class Block : public CBase_Block {
 
     // Operations in communication stream (packing and transfers) should
     // only be executed when operations in compute stream complete
-    hapiCheck(cudaEventRecord(compute_event, compute_stream));
-    hapiCheck(cudaStreamWaitEvent(comm_stream, compute_event, 0));
+    hapiCheck(hapiEventRecord(compute_event, compute_stream));
+    hapiCheck(hapiStreamWaitEvent(comm_stream, compute_event, 0));
 
     // Copy final temperature data back to host
     if (print_elements && (my_iter == warmup_iters + n_iters)) {
-      hapiCheck(cudaMemcpyAsync(h_temperature, d_new_temperature,
+      hapiCheck(hapiMemcpyAsync(h_temperature, d_new_temperature,
             sizeof(DataType) * (block_width + 2) * (block_height + 2),
-            cudaMemcpyDeviceToHost, comm_stream));
+            hapiMemcpyDeviceToHost, comm_stream));
     }
 
     if (sync_ver) {
 #if CUDA_SYNC
-      cudaStreamSynchronize(compute_stream);
+      hapiStreamSynchronize(compute_stream);
       thisProxy[thisIndex].updateDone();
 #else
       CkCallback* cb = new CkCallback(CkIndex_Block::updateDone(), thisProxy[thisIndex]);
@@ -381,11 +467,11 @@ class Block : public CBase_Block {
 
       // Copy top and bottom ghosts to send buffers
       if (!top_bound)
-        hapiCheck(cudaMemcpyAsync(d_send_top_ghost, d_new_temperature + (block_width + 2) + 1,
-              block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_send_top_ghost, d_new_temperature + (block_width + 2) + 1,
+              block_width * sizeof(DataType), hapiMemcpyDeviceToDevice, comm_stream));
       if (!bottom_bound)
-        hapiCheck(cudaMemcpyAsync(d_send_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
-              block_width * sizeof(DataType), cudaMemcpyDeviceToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_send_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
+              block_width * sizeof(DataType), hapiMemcpyDeviceToDevice, comm_stream));
     } else {
 #if !COMM_ONLY
       // Pack non-contiguous ghosts to temporary contiguous buffers on device
@@ -395,21 +481,21 @@ class Block : public CBase_Block {
 
       // Transfer ghosts from device to host
       if (!left_bound)
-        hapiCheck(cudaMemcpyAsync(h_left_ghost, d_left_ghost, block_height * sizeof(DataType),
-              cudaMemcpyDeviceToHost, comm_stream));
+        hapiCheck(hapiMemcpyAsync(h_left_ghost, d_left_ghost, block_height * sizeof(DataType),
+              hapiMemcpyDeviceToHost, comm_stream));
       if (!right_bound)
-        hapiCheck(cudaMemcpyAsync(h_right_ghost, d_right_ghost, block_height * sizeof(DataType),
-              cudaMemcpyDeviceToHost, comm_stream));
+        hapiCheck(hapiMemcpyAsync(h_right_ghost, d_right_ghost, block_height * sizeof(DataType),
+              hapiMemcpyDeviceToHost, comm_stream));
       if (!top_bound)
-        hapiCheck(cudaMemcpyAsync(h_top_ghost, d_new_temperature + (block_width + 2) + 1,
-              block_width * sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream));
+        hapiCheck(hapiMemcpyAsync(h_top_ghost, d_new_temperature + (block_width + 2) + 1,
+              block_width * sizeof(DataType), hapiMemcpyDeviceToHost, comm_stream));
       if (!bottom_bound)
-        hapiCheck(cudaMemcpyAsync(h_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
-              block_width * sizeof(DataType), cudaMemcpyDeviceToHost, comm_stream));
+        hapiCheck(hapiMemcpyAsync(h_bottom_ghost, d_new_temperature + (block_width + 2) * block_height + 1,
+              block_width * sizeof(DataType), hapiMemcpyDeviceToHost, comm_stream));
     }
 
 #if CUDA_SYNC
-    cudaStreamSynchronize(comm_stream);
+    hapiStreamSynchronize(comm_stream);
     thisProxy[thisIndex].packGhostsDone();
 #else
     // Add asynchronous callback to be invoked when packing kernels and
@@ -428,16 +514,16 @@ class Block : public CBase_Block {
     if (use_zerocopy) {
       if (!left_bound)
         thisProxy(x - 1, y).receiveGhostsZC(my_iter, RIGHT, block_height,
-            CkDeviceBuffer(d_send_left_ghost, comm_stream));
+            CkDeviceBuffer(d_send_left_ghost,CkCallback(CkIndex_Block::d_send_left_ghost_done(), thisProxy[thisIndex]), comm_stream));
       if (!right_bound)
         thisProxy(x + 1, y).receiveGhostsZC(my_iter, LEFT, block_height,
-            CkDeviceBuffer(d_send_right_ghost, comm_stream));
+            CkDeviceBuffer(d_send_right_ghost,  CkCallback(CkIndex_Block::d_send_right_ghost_done(), thisProxy[thisIndex]), comm_stream));
       if (!top_bound)
         thisProxy(x, y - 1).receiveGhostsZC(my_iter, BOTTOM, block_width,
-            CkDeviceBuffer(d_send_top_ghost, comm_stream));
+            CkDeviceBuffer(d_send_top_ghost, CkCallback(CkIndex_Block::d_send_top_ghost_done(), thisProxy[thisIndex]), comm_stream));
       if (!bottom_bound)
         thisProxy(x, y + 1).receiveGhostsZC(my_iter, TOP, block_width,
-            CkDeviceBuffer(d_send_bottom_ghost, comm_stream));
+            CkDeviceBuffer(d_send_bottom_ghost,CkCallback(CkIndex_Block::d_send_bottom_ghost_done(), thisProxy[thisIndex]), comm_stream));
     } else {
       if (!left_bound)
         thisProxy(x - 1, y).receiveGhostsReg(my_iter, RIGHT, block_height, h_left_ghost);
@@ -461,15 +547,15 @@ class Block : public CBase_Block {
         buf = d_recv_right_ghost;
         break;
       case TOP:
-        buf = d_temperature + 1;
+        buf = d_recv_top_ghost;
         break;
       case BOTTOM:
-        buf = d_temperature + (block_width + 2) * (block_height + 1) + 1;
+        buf = d_recv_bottom_ghost;
         break;
       default:
         CkAbort("Error: invalid direction");
     }
-    devicePost[0].cuda_stream = comm_stream;
+    devicePost[0].hapi_stream = comm_stream;
   }
 
   void processGhostsZC(int dir, int size, DataType* gh) {
@@ -487,7 +573,12 @@ class Block : public CBase_Block {
             block_height, comm_stream);
         break;
       case TOP:
+          hapiCheck(hapiMemcpyAsync(d_temperature + 1, d_recv_top_ghost,
+              block_width * sizeof(DataType), hapiMemcpyDeviceToDevice, comm_stream));
+          break;
       case BOTTOM:
+          hapiCheck(hapiMemcpyAsync(d_temperature + (block_width + 2) * (block_height + 1) + 1,
+              d_recv_bottom_ghost, block_width * sizeof(DataType), hapiMemcpyDeviceToDevice, comm_stream));
         break;
       default:
         CkAbort("Error: invalid direction");
@@ -502,8 +593,8 @@ class Block : public CBase_Block {
     switch (dir) {
       case LEFT:
         memcpy(h_left_ghost, gh, size * sizeof(DataType));
-        hapiCheck(cudaMemcpyAsync(d_left_ghost, h_left_ghost,
-              block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_left_ghost, h_left_ghost,
+              block_height * sizeof(DataType), hapiMemcpyHostToDevice, comm_stream));
 #if !COMM_ONLY
         invokeUnpackingKernel(d_temperature, d_left_ghost, true, block_width,
             block_height, comm_stream);
@@ -511,8 +602,8 @@ class Block : public CBase_Block {
         break;
       case RIGHT:
         memcpy(h_right_ghost, gh, size * sizeof(DataType));
-        hapiCheck(cudaMemcpyAsync(d_right_ghost, h_right_ghost,
-              block_height * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_right_ghost, h_right_ghost,
+              block_height * sizeof(DataType), hapiMemcpyHostToDevice, comm_stream));
 #if !COMM_ONLY
         invokeUnpackingKernel(d_temperature, d_right_ghost, false, block_width,
             block_height, comm_stream);
@@ -520,13 +611,13 @@ class Block : public CBase_Block {
         break;
       case TOP:
         memcpy(h_top_ghost, gh, size * sizeof(DataType));
-        hapiCheck(cudaMemcpyAsync(d_temperature + 1, h_top_ghost,
-              block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_temperature + 1, h_top_ghost,
+              block_width * sizeof(DataType), hapiMemcpyHostToDevice, comm_stream));
         break;
       case BOTTOM:
         memcpy(h_bottom_ghost, gh, size * sizeof(DataType));
-        hapiCheck(cudaMemcpyAsync(d_temperature + (block_width + 2) * (block_height + 1) + 1,
-              h_bottom_ghost, block_width * sizeof(DataType), cudaMemcpyHostToDevice, comm_stream));
+        hapiCheck(hapiMemcpyAsync(d_temperature + (block_width + 2) * (block_height + 1) + 1,
+              h_bottom_ghost, block_width * sizeof(DataType), hapiMemcpyHostToDevice, comm_stream));
         break;
       default:
         CkAbort("Error: invalid direction");
@@ -555,6 +646,7 @@ class Block : public CBase_Block {
     } else {
       main_proxy.printDone();
     }
+    fflush(stdout);
   }
 };
 
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.ci b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.ci
index 3a7d8ceab4..177d6c1453 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.ci
+++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.ci
@@ -14,6 +14,8 @@ mainmodule jacobi2d {
   readonly bool sync_ver;
   readonly bool use_zerocopy;
   readonly bool print_elements;
+  readonly int lb_freq;
+  readonly int first_lb;
 
   mainchare Main {
     entry Main(CkArgMsg* m);
@@ -35,6 +37,12 @@ mainmodule jacobi2d {
     entry void packGhostsDone();
     entry void receiveGhostsZC(int ref, int dir, int w, nocopydevice DataType gh[w]);
     entry void receiveGhostsReg(int ref, int dir, int w, DataType gh[w]);
+    entry void d_send_left_ghost_done();
+    entry void d_send_right_ghost_done();
+    entry void d_send_top_ghost_done();
+    entry void d_send_bottom_ghost_done();
+
+    entry void iterate();
 
     entry void exchangeGhosts() {
       serial {
@@ -68,12 +76,29 @@ mainmodule jacobi2d {
         }
       }
 
+      for (send_done_idx = 0; send_done_idx < 1; send_done_idx++) {
+        if (use_zerocopy) { // completion callbacks are only attached to the zero-copy sends
+          if(!left_bound) {
+            when d_send_left_ghost_done() {}
+          }
+          if(!right_bound) {
+            when d_send_right_ghost_done() {}
+          }
+          if(!top_bound) {
+            when d_send_top_ghost_done() {}
+          }
+          if(!bottom_bound) {
+            when d_send_bottom_ghost_done() {}
+          }
+        }
+      }
+
       serial {
         if (sync_ver || my_iter <= warmup_iters) {
           contribute(CkCallback(CkReductionTarget(Main, commDone), main_proxy));
         } else {
           if (my_iter < warmup_iters + n_iters) {
-            thisProxy[thisIndex].exchangeGhosts();
+            thisProxy[thisIndex].iterate();
           } else {
             contribute(CkCallback(CkReductionTarget(Main, allDone), main_proxy));
           }
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.cu b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.cu
index 1374dd0def..8a838616e4 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.cu
+++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.cu
@@ -1,3 +1,4 @@
+
 #include "hapi.h"
 #include "jacobi2d.h"
 
@@ -96,75 +97,95 @@ __global__ void rightUnpackingKernel(DataType* temperature, DataType* ghost,
 }
 
 void invokeInitKernel(DataType* d_temperature, int block_width, int block_height,
-    cudaStream_t stream) {
+    hapiStream_t stream) {
   dim3 block_dim(TILE_SIZE, TILE_SIZE);
   dim3 grid_dim(((block_width + 2) + (block_dim.x - 1)) / block_dim.x,
       ((block_height + 2) + (block_dim.y - 1)) / block_dim.y);
 
   initKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, block_width, block_height);
-  hapiCheck(cudaPeekAtLastError());
+  // hapiLaunchKernelWrapper(initKernel, grid_dim, block_dim, 0, stream,
+  //     d_temperature, block_width, block_height);
+  // hapiCheck(hapiPeekAtLastError());
 }
 
 void invokeBoundaryKernels(DataType* d_temperature, int block_width,
     int block_height, bool left_bound, bool right_bound, bool top_bound,
-    bool bottom_bound, cudaStream_t stream) {
+    bool bottom_bound, hapiStream_t stream) {
   dim3 block_dim(TILE_SIZE * TILE_SIZE);
 
   if (left_bound) {
     dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
     leftBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
-        block_width, block_height);
+       block_width, block_height);
+    // hapiLaunchKernelWrapper(leftBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
   }
   if (right_bound) {
     dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
     rightBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
-        block_width, block_height);
+       block_width, block_height);
+    // hapiLaunchKernelWrapper(rightBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
   }
   if (top_bound) {
     dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x);
     topBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
-        block_width, block_height);
+       block_width, block_height);
+    // hapiLaunchKernelWrapper(topBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
   }
   if (bottom_bound) {
     dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x);
     bottomBoundaryKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature,
-        block_width, block_height);
+       block_width, block_height);
+    // hapiLaunchKernelWrapper(bottomBoundaryKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, block_width, block_height);
   }
-  hapiCheck(cudaPeekAtLastError());
+  // hapiCheck(hapiPeekAtLastError());
 }
 
 void invokeJacobiKernel(DataType* d_temperature, DataType* d_new_temperature,
-    int block_width, int block_height, cudaStream_t stream) {
+    int block_width, int block_height, hapiStream_t stream) {
   dim3 block_dim(TILE_SIZE, TILE_SIZE);
   dim3 grid_dim((block_width + (block_dim.x - 1)) / block_dim.x,
       (block_height + (block_dim.y - 1)) / block_dim.y);
 
   jacobiKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_new_temperature, block_width, block_height);
-  hapiCheck(cudaPeekAtLastError());
+  // hapiLaunchKernelWrapper(jacobiKernel, grid_dim, block_dim, 0, stream,
+  //     d_temperature, d_new_temperature, block_width, block_height);
+  // hapiCheck(hapiPeekAtLastError());
 }
 
 void invokePackingKernels(DataType* d_temperature, DataType* d_left_ghost,
     DataType* d_right_ghost, bool left_bound, bool right_bound, int block_width,
-    int block_height, cudaStream_t stream) {
+    int block_height, hapiStream_t stream) {
   dim3 block_dim(TILE_SIZE * TILE_SIZE);
   dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
   if (!left_bound) {
     leftPackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_left_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(leftPackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_left_ghost, block_width, block_height);
   }
   if (!right_bound) {
     rightPackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_right_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(rightPackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_right_ghost, block_width, block_height);
   }
-  hapiCheck(cudaPeekAtLastError());
+  // hapiCheck(hapiPeekAtLastError());
 }
 
 void invokeUnpackingKernel(DataType* d_temperature, DataType* d_ghost, bool is_left,
-    int block_width, int block_height, cudaStream_t stream) {
+    int block_width, int block_height, hapiStream_t stream) {
   dim3 block_dim(TILE_SIZE * TILE_SIZE);
   dim3 grid_dim((block_height + (block_dim.x - 1)) / block_dim.x);
   if (is_left) {
     leftUnpackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(leftUnpackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_ghost, block_width, block_height);
   } else {
     rightUnpackingKernel<<<grid_dim, block_dim, 0, stream>>>(d_temperature, d_ghost, block_width, block_height);
+    // hapiLaunchKernelWrapper(rightUnpackingKernel, grid_dim, block_dim, 0, stream,
+    //     d_temperature, d_ghost, block_width, block_height);
   }
-  hapiCheck(cudaPeekAtLastError());
+  // hapiCheck(hapiPeekAtLastError());
 }
diff --git a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h
index ed628a4f59..56c3aa7662 100644
--- a/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h
+++ b/examples/charm++/cuda/gpudirect/jacobi2d/jacobi2d.h
@@ -4,7 +4,7 @@
 #ifdef TEST_CORRECTNESS
 typedef int DataType;
 #else
-typedef double DataType;
+typedef float DataType;
 #endif
 
 #define IDX(x,y) ((block_width+2)*(y)+(x))
diff --git a/examples/charm++/cuda/gpudirect/sdag/sdag.C b/examples/charm++/cuda/gpudirect/sdag/sdag.C
index c0da070c30..d4093f897c 100644
--- a/examples/charm++/cuda/gpudirect/sdag/sdag.C
+++ b/examples/charm++/cuda/gpudirect/sdag/sdag.C
@@ -9,7 +9,7 @@
 /* readonly */ int block_size;
 /* readonly */ int n_iters;
 
-extern void invokeInitKernel(double*, int, double, cudaStream_t);
+extern void invokeInitKernel(double*, int, double, hapiStream_t);
 
 class Main : public CBase_Main {
   double start_time;
@@ -71,19 +71,19 @@ class Block : public CBase_Block {
   int* reg_local_data;
   int* reg_remote_data;
 
-  cudaStream_t stream;
+  hapiStream_t stream;
 
   Block() {}
 
   ~Block() {
-    // Free memory and destroy CUDA stream
-    hapiCheck(cudaFreeHost(h_local_data));
-    hapiCheck(cudaFreeHost(h_remote_data));
-    hapiCheck(cudaFree(d_local_data));
-    hapiCheck(cudaFree(d_remote_data));
+    // Free host/device buffers and destroy the HAPI stream (all calls checked)
+    hapiCheck(hapiFreeHost(h_local_data));
+    hapiCheck(hapiFreeHost(h_remote_data));
+    hapiCheck(hapiFree(d_local_data));
+    hapiCheck(hapiFree(d_remote_data));
     free(reg_local_data);
     free(reg_remote_data);
-    cudaStreamDestroy(stream);
+    hapiCheck(hapiStreamDestroy(stream));
   }
 
   void init() {
@@ -93,14 +93,14 @@ class Block : public CBase_Block {
     peer = (thisIndex < CkNumPes() / 2) ? (thisIndex + CkNumPes() / 2) :
       (thisIndex - CkNumPes() / 2);
 
-    // Allocate memory and create CUDA stream
-    hapiCheck(cudaMallocHost(&h_local_data, sizeof(double) * block_size));
-    hapiCheck(cudaMallocHost(&h_remote_data, sizeof(double) * block_size));
-    hapiCheck(cudaMalloc(&d_local_data, sizeof(double) * block_size));
-    hapiCheck(cudaMalloc(&d_remote_data, sizeof(double) * block_size));
+    // Allocate memory and create hapi stream
+    hapiCheck(hapiMallocHost(&h_local_data, sizeof(double) * block_size));
+    hapiCheck(hapiMallocHost(&h_remote_data, sizeof(double) * block_size));
+    hapiCheck(hapiMalloc(&d_local_data, sizeof(double) * block_size));
+    hapiCheck(hapiMalloc(&d_remote_data, sizeof(double) * block_size));
     reg_local_data = (int*)malloc(sizeof(int) * block_size);
     reg_remote_data = (int*)malloc(sizeof(int) * block_size);
-    cudaStreamCreate(&stream);
+    hapiStreamCreate(&stream);
 
     // Initialize data
     invokeInitKernel(d_local_data, block_size, (double)thisIndex, stream);
@@ -115,9 +115,9 @@ class Block : public CBase_Block {
   void receive(int ref, int &size1, double *&arr1, int size2, int *arr2,
       CkDeviceBufferPost *devicePost) {
     // Inform the runtime where the incoming data should be stored
-    // and which CUDA stream should be used for the transfer
+    // and which hapi stream should be used for the transfer
     arr1 = d_remote_data;
-    devicePost[0].cuda_stream = stream;
+    devicePost[0].hapi_stream = stream;
 
     // Last array should be available here as it is not RDMA
     // Copy it over for validation
@@ -127,9 +127,9 @@ class Block : public CBase_Block {
 
   void validateData() {
     // Move the data to the host for validation
-    hapiCheck(cudaMemcpyAsync(h_remote_data, d_remote_data,
-          sizeof(double) * block_size, cudaMemcpyDeviceToHost, stream));
-    hapiCheck(cudaStreamSynchronize(stream));
+    hapiCheck(hapiMemcpyAsync(h_remote_data, d_remote_data,
+          sizeof(double) * block_size, hapiMemcpyDeviceToHost, stream));
+    hapiCheck(hapiStreamSynchronize(stream));
 
     // Validate data
     bool validated = true;
diff --git a/examples/charm++/cuda/gpudirect/sdag/sdag.cu b/examples/charm++/cuda/gpudirect/sdag/sdag.cu
index 43740102eb..16143dbe0e 100644
--- a/examples/charm++/cuda/gpudirect/sdag/sdag.cu
+++ b/examples/charm++/cuda/gpudirect/sdag/sdag.cu
@@ -10,11 +10,11 @@ __global__ void initKernel(double* data, int count, double val) {
   }
 }
 
-void invokeInitKernel(double* data, int count, double val, cudaStream_t stream) {
+void invokeInitKernel(double* data, int count, double val, hapiStream_t stream) {  // launches initKernel on `stream` with enough TB_SIZE-thread blocks to cover `count` elements
   dim3 block_dim(TB_SIZE);
   dim3 grid_dim((count + block_dim.x - 1) / block_dim.x);
 
   initKernel<<<grid_dim, block_dim, 0, stream>>>(data, count, val);
 
-  hapiCheck(cudaPeekAtLastError());
+  hapiCheck(hapiPeekAtLastError());  // hand the status left by the launch above to hapiCheck
 }
diff --git a/examples/charm++/cuda/gpudirect/verify/Makefile b/examples/charm++/cuda/gpudirect/verify/Makefile
index 2ea467f546..d323fe387f 100644
--- a/examples/charm++/cuda/gpudirect/verify/Makefile
+++ b/examples/charm++/cuda/gpudirect/verify/Makefile
@@ -1,11 +1,16 @@
 OPTS = -O0 -g
 
-CHARM_DIR = ../../../../..
+CHARM_DIR ?= ../../../../..
 CHARMC = $(CHARM_DIR)/bin/charmc $(OPTS)
 CHARM_INC = -I$(CHARM_DIR)/include
 
-NVCC = nvcc
-NVCC_FLAGS = -O3 -c -std=c++11 -use_fast_math
+AMD_FLAGS = -D__HIP_PLATFORM_AMD__=1
+# Toolchain defaults below (and CHARM_DIR above) use ?= so they can be overridden from the environment or the make command line.
+HAPICC ?= hipcc
+HAPITOOLKIT_HOME ?= /opt/rocm
+HAPICC_INC = -I$(HAPITOOLKIT_HOME)/include
+HAPICC_LIB = -L$(HAPITOOLKIT_HOME)/lib
+HAPI_FLAGS = -O3 -g -c -std=c++11 -use_fast_math
 LD_LIBS =
 
 TARGET = verify
@@ -14,18 +19,18 @@ all: $(TARGET)
 OBJS = $(TARGET).o $(TARGET)CUDA.o
 
 $(TARGET): $(OBJS)
-	$(CHARMC) -language charm++ -module CommonLBs -o $@ $(OBJS) $(LD_LIBS)
+	$(CHARMC) $(AMD_FLAGS) -language charm++ -module CommonLBs -o $@ $(OBJS) $(LD_LIBS)
 
 $(TARGET).decl.h: $(TARGET).ci
-	$(CHARMC) $<
+	$(CHARMC) $(AMD_FLAGS) $<
 
 $(TARGET).def.h: $(TARGET).ci
 
 $(TARGET).o: $(TARGET).C $(TARGET).decl.h $(TARGET).def.h
-	$(CHARMC) -c $<
+	$(CHARMC) $(AMD_FLAGS) -c $<
 
 $(TARGET)CUDA.o: $(TARGET).cu
-	$(NVCC) -o $@ $(NVCC_FLAGS) $(CHARM_INC) $<
+	$(HAPICC) $(AMD_FLAGS) -o $@ $(HAPI_FLAGS) $(HAPICC_INC) $(CHARM_INC) $(HAPICC_LIB) $<
 
 clean:
 	rm -f *.decl.h *.def.h conv-host *.o $(TARGET) charmrun
diff --git a/examples/charm++/cuda/gpudirect/verify/charmrun_hapi b/examples/charm++/cuda/gpudirect/verify/charmrun_hapi
new file mode 100755
index 0000000000..2064a05b7c
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/verify/charmrun_hapi
@@ -0,0 +1,292 @@
+#!/bin/bash
+# Wrapper around charmrun: starts hapi_memory_daemon on every node, then reruns charmrun for as long as it exits with code 100.
+is_restart=false
+original_args=("$@")
+pes_file="/dev/shm/numRestartProcs.txt"
+original_nodelist_file="/tmp/hapi_original_nodelist.txt"
+
+# --- Pre-parse to find the nodelist for daemon startup ---
+machinefile=""
+for ((i=0; i<${#original_args[@]}; ++i)); do
+    if [[ "${original_args[i]}" == "++nodelist" ]]; then
+        machinefile="${original_args[i+1]}"
+        break
+    fi
+done
+
+num_nodes=0
+if [[ -n "$machinefile" ]]; then
+    if [[ ! -f "$machinefile" ]]; then
+        echo "Charmrun> Error: nodelist file not found: $machinefile" >&2
+        exit 1
+    fi
+    num_nodes=$(grep -cvE '^[[:space:]]*(#|$)' "$machinefile")  # count usable entries: ignores blank/comment lines, works without a trailing newline
+else
+    echo "Charmrun> Warning: ++nodelist not found. Assuming 1 node for HAPI daemon."
+    num_nodes=1
+fi
+
+# --- Clean up and start the memory daemon in the background ---
+# Read IP addresses and slots from nodelist file (format: ipaddress slots=X)
+declare -A node_slots
+node_ips=()
+if [[ -n "$machinefile" ]]; then
+    while IFS= read -r line || [[ -n "$line" ]]; do  # the || clause keeps a final line that has no trailing newline
+        [[ "$line" =~ ^[[:space:]]*# ]] && continue  # skip comment lines; blank lines produce an empty ip and are dropped below
+        ip=$(echo "$line" | awk '{print $1}')
+        slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+        if [[ -n "$ip" ]]; then
+            node_ips+=("$ip")
+            # Default to 1 slot if not specified
+            node_slots["$ip"]=${slots:-1}
+        fi
+    done < "$machinefile"
+else
+    # Default to localhost if no nodelist
+    node_ips=("localhost")
+    node_slots["localhost"]=1
+fi
+
+# Save the original nodelist for restart comparison. NOTE(review): an existing file (e.g. left by a crashed run) is reused as-is - confirm intended.
+if [[ ! -f "$original_nodelist_file" ]]; then
+    if [[ -n "$machinefile" ]]; then
+        cp "$machinefile" "$original_nodelist_file"
+        echo "Charmrun> Saved original nodelist to $original_nodelist_file"
+    else
+        echo "localhost slots=1" > "$original_nodelist_file"
+        echo "Charmrun> Created default nodelist file at $original_nodelist_file"
+    fi
+fi
+
+# Parse the nodelist file given as $1 (lines of the form "host [slots=N]") into original_nodes / original_node_slots.
+get_nodes_from_file() {
+    local file="$1" line ip slots   # loop scratch variables stay local instead of overwriting same-named globals
+    local -A nodes_map
+    local -a nodes_list
+
+    if [[ -f "$file" && -s "$file" ]]; then
+        while IFS= read -r line || [[ -n "$line" ]]; do   # also accept a final line with no trailing newline
+            # Skip empty lines and comments
+            [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
+            ip=$(echo "$line" | awk '{print $1}')
+            slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+            if [[ -n "$ip" ]]; then
+                nodes_map["$ip"]=${slots:-1}
+                nodes_list+=("$ip")
+            fi
+        done < "$file"
+    fi
+
+    # Publish the results in the caller's original_nodes (ordered hosts) and original_node_slots (host -> slots, default 1).
+    original_nodes=("${nodes_list[@]}")   # plain assignment instead of eval: host names are never re-parsed by the shell
+    for ip in "${nodes_list[@]}"; do
+        original_node_slots["$ip"]=${nodes_map["$ip"]}
+    done
+}
+
+# Compare the current node_ips with the saved original nodelist; results go to detected_new_nodes / detected_new_node_slots.
+find_new_nodes() {
+    declare -A original_node_slots
+    declare -a original_nodes
+    local orig_ip ip   # keep loop variables out of the global scope
+    # Get original nodes (fills original_nodes and original_node_slots declared above)
+    get_nodes_from_file "$original_nodelist_file"
+
+    declare -A original_nodes_map
+    for orig_ip in "${original_nodes[@]}"; do
+        original_nodes_map["$orig_ip"]=1
+    done
+
+    local -a new_nodes
+    local -A new_node_slots
+
+    # Compare current nodes with original nodes using hashmap lookup
+    for ip in "${node_ips[@]}"; do
+        # Check if node exists in original nodes hashmap (O(1) lookup)
+        if [[ -z "${original_nodes_map[$ip]}" ]]; then
+            echo "Charmrun> New node detected: $ip"
+            new_nodes+=("$ip")
+            new_node_slots["$ip"]=${node_slots["$ip"]}
+        fi
+    done
+
+    # Hand the result back through the caller's variables; plain assignment replaces eval so host names are not re-parsed.
+    detected_new_nodes=("${new_nodes[@]}")
+    for ip in "${new_nodes[@]}"; do
+        detected_new_node_slots["$ip"]=${new_node_slots["$ip"]}
+    done
+}
+
+# Clean up on all nodes via SSH (async): remove stale state, then recreate one daemon_ready FIFO per slot
+cleanup_pids=()
+echo "Charmrun> Initial cleanup on ${#node_ips[@]} node(s): ${node_ips[*]}"
+for ip in "${node_ips[@]}"; do
+    slots=${node_slots["$ip"]}
+    fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_pipe_* /tmp/client_pipe_* /tmp/server_fifo_* /tmp/client_fifo_* /tmp/daemon_ready_*"  # NOTE(review): both *_pipe_* and *_fifo_* spellings occur in this script; removing both until the daemon's naming is confirmed
+    for ((slot=0; slot<slots; slot++)); do
+        fifo_cmd="$fifo_cmd; mkfifo -m 0666 /tmp/daemon_ready_$slot"
+    done
+    ssh "$ip" "$fifo_cmd" &
+    cleanup_pids+=($!)
+done
+
+# Wait for cleanup to complete on all nodes before starting daemons
+for pid in "${cleanup_pids[@]}"; do
+    wait "$pid" || echo "Charmrun> Warning: cleanup command failed on at least one node" >&2
+done
+
+# Start memory daemons on all nodes via SSH (async) - one daemon per slot
+daemon_pids=()
+echo "Charmrun> Starting memory daemons on all nodes..."
+for ip in "${node_ips[@]}"; do
+    slots=${node_slots["$ip"]}
+    echo "Charmrun> Starting $slots daemon(s) on node $ip"
+    for ((slot=0; slot<slots; slot++)); do
+        ssh "$ip" "nohup $(cd "$(dirname "$0")" && pwd)/hapi_memory_daemon $slot > /dev/null 2>&1 &" &  # absolute path: a relative one would be resolved in the remote login directory
+        daemon_pids+=($!)
+    done
+done
+
+# Optional: Wait a brief moment for SSH connections to establish (non-blocking)
+# sleep 1
+
+# --- Main execution loop: run charmrun, and run it again with restart arguments whenever it exits with code 100 ---
+while true; do
+    # Reset and parse arguments for each run
+    args=()
+    pes_arg=""
+    restart_arg=""
+
+    temp_args=("${original_args[@]}")
+    i=0
+    while [ $i -lt ${#temp_args[@]} ]; do
+        arg="${temp_args[$i]}"
+        case "$arg" in
+        +p|++p)
+            i=$((i+1))
+            pes_arg="$arg ${temp_args[$i]}"
+            ;;
+        +p[0-9]*)
+            pes_arg="$arg"
+            ;;
+        ++p[0-9]*)
+            pes_arg="$arg"
+            ;;
+        *)
+            args+=("$arg")
+            ;;
+        esac
+        i=$((i+1))
+    done
+
+    # Check the flag. If it's a restart, prepare the extra arguments.
+    if [ "$is_restart" = true ]; then
+        restart_arg="+shrinkexpand +restart /dev/shm"
+        if [ -f "$pes_file" ]; then
+            num_pes=$(cat "$pes_file")
+            pes_arg="+p $num_pes"
+        fi
+
+        echo "Charmrun> Restart detected - checking for new nodes..."
+
+        # Re-read current nodelist to check for new nodes
+        declare -A current_node_slots
+        current_node_ips=()
+        if [[ -n "$machinefile" ]]; then
+            while IFS= read -r line || [[ -n "$line" ]]; do  # keep a final line that has no trailing newline
+                ip=$(echo "$line" | awk '!/^[ \t]*#/ {print $1}')  # comment lines yield an empty ip and are skipped below
+                slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+                if [[ -n "$ip" ]]; then
+                    current_node_ips+=("$ip")
+                    current_node_slots["$ip"]=${slots:-1}
+                fi
+            done < "$machinefile"
+        else
+            current_node_ips=("localhost")
+            current_node_slots["localhost"]=1
+        fi
+
+        # Update global variables with current state
+        node_ips=("${current_node_ips[@]}")
+        for ip in "${current_node_ips[@]}"; do
+            node_slots["$ip"]=${current_node_slots["$ip"]}
+        done
+
+        # Find new nodes
+        declare -a detected_new_nodes
+        declare -A detected_new_node_slots
+        find_new_nodes
+
+        if [[ ${#detected_new_nodes[@]} -gt 0 ]]; then
+            echo "Charmrun> Found ${#detected_new_nodes[@]} new node(s): ${detected_new_nodes[*]}"
+
+            # Clean up new nodes
+            echo "Charmrun> Cleaning up new nodes..."
+            cleanup_pids=()
+            for ip in "${detected_new_nodes[@]}"; do
+                echo "Charmrun> Cleaning up node: $ip"
+                slots=${detected_new_node_slots["$ip"]}
+                fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_pipe_* /tmp/client_pipe_* /tmp/server_fifo_* /tmp/client_fifo_* /tmp/daemon_ready_*"  # NOTE(review): both *_pipe_* and *_fifo_* spellings occur in this script; removing both until the daemon's naming is confirmed
+                for ((slot=0; slot<slots; slot++)); do
+                    fifo_cmd="$fifo_cmd; mkfifo -m 0666 /tmp/daemon_ready_$slot"
+                done
+                ssh "$ip" "$fifo_cmd" &
+                cleanup_pids+=($!)
+            done
+
+            # Wait for cleanup to complete on new nodes
+            for pid in "${cleanup_pids[@]}"; do
+                wait "$pid" || echo "Charmrun> Warning: cleanup command failed on at least one new node" >&2
+            done
+
+            # Start memory daemons on new nodes
+            echo "Charmrun> Starting memory daemons on new nodes..."
+            daemon_pids=()
+            for ip in "${detected_new_nodes[@]}"; do
+                slots=${detected_new_node_slots["$ip"]}
+                echo "Charmrun> Starting $slots daemon(s) on new node $ip"
+                for ((slot=0; slot<slots; slot++)); do
+                    ssh "$ip" "nohup $(cd "$(dirname "$0")" && pwd)/hapi_memory_daemon $slot > /dev/null 2>&1 &" &  # absolute path: a relative one would be resolved in the remote login directory
+                    daemon_pids+=($!)
+                done
+            done
+
+            # Update the original nodelist to include new nodes for future restarts
+            if [[ -n "$machinefile" ]]; then
+                cp "$machinefile" "$original_nodelist_file"
+                echo "Charmrun> Updated original nodelist with new nodes"
+            fi
+
+            echo "Charmrun> New nodes setup completed"
+        else
+            echo "Charmrun> No new nodes detected"
+        fi
+    fi
+
+    # Pass all script arguments to the executable ($pes_arg and $restart_arg are left unquoted on purpose so they split into words)
+    "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg
+
+    EXIT_CODE=$?
+
+    if [ "$EXIT_CODE" -eq 100 ]; then
+        is_restart=true
+        echo "Restart signal (code 100) received. Looping again."
+        echo "----------------------------------------"
+    else
+        echo "Final exit signal (code $EXIT_CODE) received. Exiting loop."
+        # Clean up the background daemon processes on all nodes
+        for ip in "${node_ips[@]}"; do
+            ssh "$ip" "pkill -f hapi_memory_daemon" &
+        done
+        # Also kill any remaining SSH connection PIDs
+        for pid in "${daemon_pids[@]}"; do
+            kill "$pid" 2>/dev/null
+        done
+        # Clean up temporary files
+        rm -f "$original_nodelist_file"
+        echo "Charmrun> Cleaned up temporary nodelist file"
+        break
+    fi
+done
+
+echo "Control loop finished."
\ No newline at end of file
diff --git a/examples/charm++/cuda/gpudirect/verify/helper.sh b/examples/charm++/cuda/gpudirect/verify/helper.sh
new file mode 100755
index 0000000000..6eef9fb132
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/verify/helper.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Rebuilds the verify example from scratch; the commented lines below are sample launch commands.
+make clean
+make verify
+# srun --mpi=cray_shasta -n 2 ./verify +gpushm +gpuipceventpool 256 +allgpus +gpucommbuffer 256 +ppn 2
+# ./charmrun ++local ++p 2 ./verify +gpushm +gpuipceventpool 512 +allgpus +gpucommbuffer 128
+# srun -n 2 ./verify +ppn 2
+# srun -n 4 ./verify
diff --git a/examples/charm++/cuda/gpudirect/verify/verbose b/examples/charm++/cuda/gpudirect/verify/verbose
new file mode 100644
index 0000000000..77bdcd5018
--- /dev/null
+++ b/examples/charm++/cuda/gpudirect/verify/verbose
@@ -0,0 +1 @@
+[2026-04-14T10:22:34.557] error: *** STEP 17600030.21 ON gpub023 CANCELLED AT 2026-04-14T10:22:34 DUE to SIGNAL Killed ***
diff --git a/examples/charm++/cuda/gpudirect/verify/verify.C b/examples/charm++/cuda/gpudirect/verify/verify.C
index 9ab66369d8..260c13b03e 100644
--- a/examples/charm++/cuda/gpudirect/verify/verify.C
+++ b/examples/charm++/cuda/gpudirect/verify/verify.C
@@ -10,34 +10,35 @@
 /* readonly */ CProxy_VerifyNodeGroup nodegroup_proxy;
 /* readonly */ int block_size;
 /* readonly */ int n_iters;
+/* readonly */ int n_warpup_iters;
 /* readonly */ bool lb_test;
 
-extern void invokeInitKernel(double*, int, double, cudaStream_t);
+extern void invokeInitKernel(double*, int, double, hapiStream_t);
 
 struct Container {
   double* h_local_data;
   double* h_remote_data;
   double* d_local_data;
   double* d_remote_data;
-  cudaStream_t stream;
+  hapiStream_t stream;
 
   Container() : h_local_data(nullptr), h_remote_data(nullptr),
     d_local_data(nullptr), d_remote_data(nullptr) {}
 
   ~Container() {
-    hapiCheck(cudaFreeHost(h_local_data));
-    hapiCheck(cudaFreeHost(h_remote_data));
-    hapiCheck(cudaFree(d_local_data));
-    hapiCheck(cudaFree(d_remote_data));
-    hapiCheck(cudaStreamDestroy(stream));
+    hapiCheck(hapiFreeHost(h_local_data));
+    hapiCheck(hapiFreeHost(h_remote_data));
+    hapiCheck(hapiFree(d_local_data));
+    hapiCheck(hapiFree(d_remote_data));
+    hapiCheck(hapiStreamDestroy(stream));
   }
 
   void init(double val) {
-    hapiCheck(cudaMallocHost(&h_local_data, sizeof(double) * block_size));
-    hapiCheck(cudaMallocHost(&h_remote_data, sizeof(double) * block_size));
-    hapiCheck(cudaMalloc(&d_local_data, sizeof(double) * block_size));
-    hapiCheck(cudaMalloc(&d_remote_data, sizeof(double) * block_size));
-    hapiCheck(cudaStreamCreate(&stream));
+    hapiCheck(hapiMallocHost(&h_local_data, sizeof(double) * block_size));
+    hapiCheck(hapiMallocHost(&h_remote_data, sizeof(double) * block_size));
+    hapiCheck(hapiMalloc(&d_local_data, sizeof(double) * block_size));
+    hapiCheck(hapiMalloc(&d_remote_data, sizeof(double) * block_size));
+    hapiCheck(hapiStreamCreate(&stream));
 
     for (int i = 0; i < block_size; i++) {
       h_local_data[i] = val;
@@ -45,13 +46,13 @@ struct Container {
     invokeInitKernel(d_local_data, block_size, val, stream);
     invokeInitKernel(d_remote_data, block_size, val, stream);
 
-    hapiCheck(cudaStreamSynchronize(stream));
+    hapiCheck(hapiStreamSynchronize(stream));
   }
 
   void verify(double val) {
-    hapiCheck(cudaMemcpyAsync(h_remote_data, d_remote_data,
-          sizeof(double) * block_size, cudaMemcpyDeviceToHost, stream));
-    hapiCheck(cudaStreamSynchronize(stream));
+    hapiCheck(hapiMemcpyAsync(h_remote_data, d_remote_data,
+          sizeof(double) * block_size, hapiMemcpyDeviceToHost, stream));
+    hapiCheck(hapiStreamSynchronize(stream));
 
     for (int i = 0; i < block_size; i++) {
       if (fabs(h_remote_data[i] - val) > ERROR_TOLERANCE) {
@@ -59,6 +60,8 @@ struct Container {
             i, val, h_remote_data[i]);
       }
     }
+
+    CmiPrintf("Data verified, looks OK!\n");
   }
 };
 
@@ -69,15 +72,16 @@ class Main : public CBase_Main {
 public:
   Main(CkArgMsg* m) {
     main_proxy = thisProxy;
-    block_size = 128;
-    n_iters = 100;
+    block_size = 1024 * 128;
+    n_iters = 150;
+    n_warpup_iters = 3;
     test_nodegroup = true;
     lb_test = false;
 
     // Check if there are 2 PEs
-    if (CkNumPes() != 2) {
-      CkAbort("Should be run with 2 PEs");
-    }
+    // if (CkNumPes() != 2) {
+    //   CkAbort("Should be run with 2 PEs");
+    // }
 
     // Don't do nodegroup test if run with 1 process
     if (CmiNumNodes() == 1) {
@@ -104,7 +108,7 @@ public:
     delete m;
 
     // Print info
-    CkPrintf("[CUDA Zerocopy Verification Test]\n"
+    CkPrintf("[hapi Zerocopy Verification Test]\n"
         "Block size: %d, Iters: %d, Nodegroup: %s, LB test: %s\n",
         block_size, n_iters, test_nodegroup ? "true" : "false",
         lb_test ? "true" : "false");
@@ -119,32 +123,44 @@ public:
   }
 
   void test() {
+    // warm up: untimed sends; NOTE(review): nothing waits for them before start_time is taken below
+    for (int i = 0; i < n_warpup_iters; i++) {
+      array_proxy[0].send();
+      // CkWaitQD();
+      printf("[ITER] %d DONE!\n", i);  // printed once the send has been issued; newline keeps one record per line
+      fflush(stdout);
+    }
     start_time = CkWallTimer();
-
-    CkPrintf("Testing chare array... ");
+
+    CkPrintf("Testing chare array... \n");
     for (int i = 0; i < n_iters; i++) {
       array_proxy[0].send();
-      CkWaitQD();
+      printf("[ITER] %d DONE!\n", i);  // printed once the send has been issued; newline keeps one record per line
+      fflush(stdout);
     }
+    CkWaitQD();  // single wait after all sends have been issued
     CkPrintf("PASS\n");
 
-    CkPrintf("Testing chare group... ");
-    for (int i = 0; i < n_iters; i++) {
-      group_proxy[0].send();
-      CkWaitQD();
-    }
-    CkPrintf("PASS\n");
+    // CkPrintf("Testing chare group... \n");
+    // for (int i = 0; i < n_iters; i++) {
+    //   group_proxy[0].send();
+    // }
+    // CkWaitQD();
+    // CkPrintf("PASS\n");
 
-    if (test_nodegroup) {
-      CkPrintf("Testing chare nodegroup... ");
-      for (int i = 0; i < n_iters; i++) {
-        nodegroup_proxy[0].send();
-        CkWaitQD();
-      }
-      CkPrintf("PASS\n");
-    }
+    // if (test_nodegroup) {
+    //   CkPrintf("Testing chare nodegroup... \n");
+    //   for (int i = 0; i < n_iters; i++) {
+    //     nodegroup_proxy[0].send();
+    //   }
+    //   CkWaitQD();
+    //   CkPrintf("PASS\n");
+    // }
+
+    // sleep(3);
 
     CkPrintf("Elapsed: %.6lf s\n", CkWallTimer() - start_time);
+    fflush(stdout);
     CkExit();
   }
 };
@@ -168,7 +184,7 @@ public:
   }
 
   void send() {
-    thisProxy[1].recv(block_size, CkDeviceBuffer(container.d_local_data,
+    thisProxy[2].recv(block_size, CkDeviceBuffer(container.d_local_data,
           CkCallback(CkIndex_VerifyArray::reuse(), thisProxy[thisIndex]),
           container.stream));
     if (lb_test) {
@@ -179,7 +195,7 @@ public:
 
   void recv(int& size, double*& data, CkDeviceBufferPost* post) {
     data = container.d_remote_data;
-    post[0].cuda_stream = container.stream;
+    post[0].hapi_stream = container.stream;
   }
 
   void recv(int size, double* data) {
@@ -188,9 +204,14 @@ public:
       pe = CkMyPe();
       AtSync();
     }
+    printf("[VERIFY] data received on PE: %d, Process: %d\n", CmiMyPe(), CmiMyNode());
+    fflush(stdout);
   }
 
-  void reuse() {}
+  void reuse() {
+    printf("[VERIFY] source callback called on PE: %d, Process: %d\n", CmiMyPe(), CmiMyNode());
+    fflush(stdout);
+  }
 
   void ResumeFromSync() {}
 };
@@ -204,12 +225,12 @@ public:
   }
 
   void send() {
-    thisProxy[1].recv(block_size, CkDeviceBuffer(container.d_local_data, container.stream));
+    thisProxy[2].recv(block_size, CkDeviceBuffer(container.d_local_data, container.stream));
   }
 
   void recv(int& size, double*& data, CkDeviceBufferPost* post) {
     data = container.d_remote_data;
-    post[0].cuda_stream = container.stream;
+    post[0].hapi_stream = container.stream;
   }
 
   void recv(int size, double* data) {
@@ -231,7 +252,7 @@ public:
 
   void recv(int& size, double*& data, CkDeviceBufferPost* post) {
     data = container.d_remote_data;
-    post[0].cuda_stream = container.stream;
+    post[0].hapi_stream = container.stream;
   }
 
   void recv(int size, double* data) {
diff --git a/examples/charm++/cuda/gpudirect/verify/verify.cu b/examples/charm++/cuda/gpudirect/verify/verify.cu
index 987c5aed26..6b2819f7e8 100644
--- a/examples/charm++/cuda/gpudirect/verify/verify.cu
+++ b/examples/charm++/cuda/gpudirect/verify/verify.cu
@@ -10,11 +10,11 @@ __global__ void initKernel(double* data, int count, double val) {
   }
 }
 
-void invokeInitKernel(double* data, int count, double val, cudaStream_t stream) {
+void invokeInitKernel(double* data, int count, double val, hapiStream_t stream) {  // launches initKernel on `stream` with enough BLOCK_SIZE-thread blocks to cover `count` elements
   dim3 block_dim(BLOCK_SIZE);
   dim3 grid_dim((count + block_dim.x - 1) / block_dim.x);
 
   initKernel<<<grid_dim, block_dim, 0, stream>>>(data, count, val);
 
-  hapiCheck(cudaPeekAtLastError());
+  hapiCheck(hapiPeekAtLastError());  // keep the launch check so a failed kernel launch is reported instead of ignored
 }
diff --git a/examples/charm++/cuda/stencil2d/Makefile b/examples/charm++/cuda/stencil2d/Makefile
index 6bde15412c..afa7284a61 100644
--- a/examples/charm++/cuda/stencil2d/Makefile
+++ b/examples/charm++/cuda/stencil2d/Makefile
@@ -1,11 +1,11 @@
 -include ../../../../common.mk
 CHARMC = ../../../../bin/charmc $(OPTS)
-OPTS = -O3 -fopt-info-vec-optimized #-DUSE_NVTX
+OPTS = -O3 -fopt-info-vec-optimized -g #-DHAPI_CUDA_CALLBACK #-DUSE_NVTX
 
 # set CUDATOOLKIT_HOME to the CUDA toolkit directory
 CUDATOOLKIT_HOME ?= /usr/local/cuda
 NVCC = $(CUDATOOLKIT_HOME)/bin/nvcc
-NVCC_FLAGS = -O3 -c -std=c++11 -DTILE_SIZE=16 -use_fast_math
+NVCC_FLAGS = -O3 -c -std=c++11 -DTILE_SIZE=16 -use_fast_math -g
 NVCC_INC = -I$(CUDATOOLKIT_HOME)/include
 CHARM_INC = -I../../../../include
 LD_LIBS = #-lnvToolsExt
@@ -16,7 +16,7 @@ all: $(TARGET)
 OBJS = $(TARGET).o $(TARGET)CUDA.o
 
 $(TARGET): $(OBJS)
-	$(CHARMC) -language charm++ -o $@ $(OBJS) $(LD_LIBS)
+	$(CHARMC) -language charm++ -module EveryLB -o $@ $(OBJS) $(LD_LIBS)
 
 $(TARGET).decl.h: $(TARGET).ci
 	$(CHARMC) $<
diff --git a/examples/charm++/cuda/stencil2d/stencil2d.C b/examples/charm++/cuda/stencil2d/stencil2d.C
index 7e1cbce385..bcd061206c 100644
--- a/examples/charm++/cuda/stencil2d/stencil2d.C
+++ b/examples/charm++/cuda/stencil2d/stencil2d.C
@@ -15,7 +15,7 @@
 #define BOTTOM 4
 #define DIVIDEBY5 0.2
 
-#define USE_CUSTOM_MAP 1 // Should be set to 1 to use GPU handler PEs
+#define USE_CUSTOM_MAP 0 // Should be set to 1 to use GPU handler PEs
 
 /* readonly */ CProxy_Main mainProxy;
 /* readonly */ int grid_x;
@@ -31,8 +31,8 @@
 /* readonly */ bool gpu_prio;
 /* readonly */ int gpu_pes;
 
-extern void invokeKernel(cudaStream_t stream, double* d_temperature,
-                         double* d_new_temperature, int block_x, int block_y,
+extern void invokeKernel(cudaStream_t stream, float* d_temperature,
+                         float* d_new_temperature, int block_x, int block_y,
                          int thread_size);
 
 // Calculate the number of digits.
@@ -85,15 +85,9 @@ class CustomMap : public CkArrayMap {
     }
 };
 
-// Used to specify LIFO ordering on callbacks.
-class CallbackMsg : public CMessage_CallbackMsg {
- public:
-  CallbackMsg() {}
-};
-
 class Main : public CBase_Main {
-  double init_start_time;
-  double start_time;
+  float init_start_time;
+  float start_time;
 
  public:
   CProxy_Stencil stencils;
@@ -214,6 +208,21 @@ class Main : public CBase_Main {
     stencils.init();
   }
 
+  void pup(PUP::er& p) {  // serializes the run parameters, the stencils proxy and init_start_time through `p`
+    p | grid_x;  // grid_x, gpu_prio and gpu_pes are file-level /* readonly */ variables; the other parameters presumably are too - TODO confirm
+    p | grid_y;
+    p | block_x;
+    p | block_y;
+    p | num_iters;
+    p | global_exec_mode;
+    p | thread_size;
+    p | offload_ratio;
+    p | gpu_prio;
+    p | gpu_pes;
+    p | stencils;
+    p | init_start_time;  // NOTE(review): start_time, which done() reads, is not serialized here - confirm intended
+  }
+
   void initDone() {
 #ifdef USE_NVTX
     NVTXTracer nvtx_range("Main::initDone", NVTXColor::Emerald);
@@ -225,16 +234,23 @@ class Main : public CBase_Main {
     start_time = CkWallTimer();
 
     // Start stencil iterations
-    CallbackMsg* m = new CallbackMsg();
-    stencils.iterate(m);
+    stencils.iterate();
   }
 
-  void done(double time) {
+  void done(float *times, int size) {
 #ifdef USE_NVTX
     NVTXTracer nvtx_range("Main::done", NVTXColor::Emerald);
 #endif
-    CkPrintf("\nAverage time per iteration: %lf\n",
-             time / ((num_chares_x * num_chares_y) * num_iters));
+    if (size != 2) {
+      CkAbort("Received reduction of incorrect size!");
+    }
+    float agg_time = times[0];
+    float gpu_time = times[1];
+    CkPrintf("Total times are: %lf CPU time, %lf GPU time\n",
+             agg_time, gpu_time);
+    CkPrintf("\nAverage time per iteration: %lf CPU time, %lf GPU time\n",
+             agg_time / ((num_chares_x * num_chares_y) * num_iters),
+             gpu_time / ((num_chares_x * num_chares_y) * num_iters));
     CkPrintf("Finished due to max iterations %d, total time %lf seconds\n",
              num_iters, CkWallTimer() - start_time);
     CkExit();
@@ -250,14 +266,14 @@ class Stencil : public CBase_Stencil {
   int neighbors;
   int remote_count;
 
-  double* __restrict__ temperature;
-  double* __restrict__ new_temperature;
-  double* __restrict__ d_temperature;
-  double* __restrict__ d_new_temperature;
-  double* __restrict__ left_ghost;
-  double* __restrict__ right_ghost;
-  double* __restrict__ bottom_ghost;
-  double* __restrict__ top_ghost;
+  float*  temperature;
+  float*  new_temperature;
+  float*  d_temperature;
+  float*  d_new_temperature;
+  float*  left_ghost;
+  float*  right_ghost;
+  float*  bottom_ghost;
+  float*  top_ghost;
 
   cudaStream_t stream;
 
@@ -265,20 +281,36 @@ class Stencil : public CBase_Stencil {
   int local_exec_mode;
 
   bool left_bound, right_bound, top_bound, bottom_bound;
-  double iter_start_time;
-  double agg_time;
+  float iter_start_time;
+  float agg_time;
 
-  Stencil() {}
+  Stencil() {
+    usesAtSync = true;
+  }
+
+  Stencil(CkMigrateMessage* msg) : CBase_Stencil(msg) {  // migration constructor: creates the stream and the pinned host temperature grid
+    cudaStreamCreate(&stream);  // return value is not checked
+    hapiCheck(  // NOTE(review): local_exec_mode is not consulted here, so this runs for every mode - confirm CPU_MODE elements never migrate
+          hapiMallocHost((void**)&temperature,
+                         sizeof(float) * (block_x + 2) * (block_y + 2)));
+    //hapiCheck(
+    //    hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y));
+    //hapiCheck(
+    //    hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y));
+    //hapiCheck(
+    //    hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x));
+    //hapiCheck(hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x));
+  }  // the ghost buffers and device grids are allocated in pup() when unpacking
 
   ~Stencil() {
     if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
-      hapiCheck(cudaFreeHost(temperature));
-      hapiCheck(cudaFree(d_temperature));
-      hapiCheck(cudaFree(d_new_temperature));
-      hapiCheck(cudaFreeHost(left_ghost));
-      hapiCheck(cudaFreeHost(right_ghost));
-      hapiCheck(cudaFreeHost(top_ghost));
-      hapiCheck(cudaFreeHost(bottom_ghost));
+      hapiCheck(hapiFreeHost(temperature));
+      hapiCheck(hapiFree(d_temperature));
+      hapiCheck(hapiFree(d_new_temperature));
+      hapiCheck(hapiFreeHost(left_ghost));
+      hapiCheck(hapiFreeHost(right_ghost));
+      hapiCheck(hapiFreeHost(top_ghost));
+      hapiCheck(hapiFreeHost(bottom_ghost));
 
       cudaStreamDestroy(stream);
     } else { // CPU_MODE
@@ -291,6 +323,40 @@ class Stencil : public CBase_Stencil {
     }
   }
 
+  void pup(PUP::er& p) {  // serializes scalar state, both device grids and the four host ghost buffers through `p`
+    p | n_digits;
+    p | my_iter;
+    p | neighbors;
+    p | remote_count;
+    p | iter_start_time;
+    p | agg_time;
+    p | thisFlatIndex;
+    p | left_bound;
+    p | right_bound;
+    p | top_bound;
+    p | bottom_bound;
+    p | local_exec_mode;
+
+    if (p.isUnpacking()) {
+      // The host temperature grid is allocated by the migration constructor; the remaining buffers are created here.
+      hapiCheck(hapiMalloc((void**)&d_temperature, sizeof(float) * (block_x + 2) * (block_y + 2)));
+      hapiCheck(hapiMalloc((void**)&d_new_temperature, sizeof(float) * (block_x + 2) * (block_y + 2)));
+      hapiCheck(hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y));
+      hapiCheck(hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y));
+      hapiCheck(hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x));
+      hapiCheck(hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x));
+    }
+
+    // NOTE(review): the host-side temperature / new_temperature contents are not serialized, only the device grids -
+    // confirm that is intended, and that these device allocations are never reached with local_exec_mode == CPU_MODE.
+    p(d_temperature, (block_x + 2) * (block_y + 2), PUP::PUPMode::DEVICE);
+    p(d_new_temperature, (block_x + 2) * (block_y + 2), PUP::PUPMode::DEVICE);
+    p(left_ghost, block_y);
+    p(right_ghost, block_y);
+    p(bottom_ghost, block_x);
+    p(top_ghost, block_x);
+  }
+
   void init() {
     thisFlatIndex = num_chares_y * thisIndex.x + thisIndex.y;
 
@@ -336,7 +402,7 @@ class Stencil : public CBase_Stencil {
         mode_string = "HAPI";
         break;
     }
-    CkPrintf("[%*d] Mode: %s, PE: %d\n", n_digits, thisFlatIndex, mode_string.c_str(), CkMyPe());
+   // CkPrintf("[%*d] Mode: %s, PE: %d\n", n_digits, thisFlatIndex, mode_string.c_str(), CkMyPe());
 
     // Initialize values
     my_iter = 0;
@@ -365,28 +431,28 @@ class Stencil : public CBase_Stencil {
     // Allocate memory and create CUDA stream
     if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
       hapiCheck(
-          cudaMallocHost((void**)&temperature,
-                         sizeof(double) * (block_x + 2) * (block_y + 2)));
-      hapiCheck(cudaMalloc((void**)&d_temperature,
-                           sizeof(double) * (block_x + 2) * (block_y + 2)));
-      hapiCheck(cudaMalloc((void**)&d_new_temperature,
-                           sizeof(double) * (block_x + 2) * (block_y + 2)));
+          hapiMallocHost((void**)&temperature,
+                         sizeof(float) * (block_x + 2) * (block_y + 2)));
+      hapiCheck(hapiMalloc((void**)&d_temperature,
+                           sizeof(float) * (block_x + 2) * (block_y + 2)));
+      hapiCheck(hapiMalloc((void**)&d_new_temperature,
+                           sizeof(float) * (block_x + 2) * (block_y + 2)));
       hapiCheck(
-          cudaMallocHost((void**)&left_ghost, sizeof(double) * block_y));
+          hapiMallocHost((void**)&left_ghost, sizeof(float) * block_y));
       hapiCheck(
-          cudaMallocHost((void**)&right_ghost, sizeof(double) * block_y));
+          hapiMallocHost((void**)&right_ghost, sizeof(float) * block_y));
       hapiCheck(
-          cudaMallocHost((void**)&bottom_ghost, sizeof(double) * block_x));
-      hapiCheck(cudaMallocHost((void**)&top_ghost, sizeof(double) * block_x));
+          hapiMallocHost((void**)&bottom_ghost, sizeof(float) * block_x));
+      hapiCheck(hapiMallocHost((void**)&top_ghost, sizeof(float) * block_x));
 
       cudaStreamCreate(&stream);
     } else {  // CPU_MODE
-      temperature = new double[(block_x + 2) * (block_y + 2)];
-      new_temperature = new double[(block_x + 2) * (block_y + 2)];
-      left_ghost = new double[block_y];
-      right_ghost = new double[block_y];
-      top_ghost = new double[block_x];
-      bottom_ghost = new double[block_x];
+      temperature = new float[(block_x + 2) * (block_y + 2)];
+      new_temperature = new float[(block_x + 2) * (block_y + 2)];
+      left_ghost = new float[block_y];
+      right_ghost = new float[block_y];
+      top_ghost = new float[block_x];
+      bottom_ghost = new float[block_x];
     }
 
     // Initialize temperature data
@@ -414,8 +480,8 @@ class Stencil : public CBase_Stencil {
     if ((local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) &&
         my_iter == 0) {
       hapiCheck(
-          cudaMemcpyAsync(d_temperature, temperature,
-                          sizeof(double) * (block_x + 2) * (block_y + 2),
+          hapiMemcpyAsync(d_temperature, temperature,
+                          sizeof(float) * (block_x + 2) * (block_y + 2),
                           cudaMemcpyHostToDevice, stream));
     }
 
@@ -445,17 +511,17 @@ class Stencil : public CBase_Stencil {
       thisProxy(x, y - 1).receiveGhosts(my_iter, TOP, block_x, bottom_ghost);
   }
 
-  void processGhosts(int dir, int width, double* gh) {
+  void processGhosts(int dir, int width, float* gh) {
 #ifdef USE_NVTX
     NVTXTracer nvtx_range(std::to_string(thisFlatIndex) + " Stencil::processGhosts", NVTXColor::WetAsphalt);
 #endif
     switch (dir) {
       case LEFT:
         if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
-          memcpy(left_ghost, gh, width * sizeof(double));
-          hapiCheck(cudaMemcpy2DAsync(
-              d_temperature + (block_x + 2), (block_x + 2) * sizeof(double),
-              left_ghost, sizeof(double), sizeof(double), block_y,
+          memcpy(left_ghost, gh, width * sizeof(float));
+          hapiCheck(hapiMemcpy2DAsync(
+              d_temperature + (block_x + 2), (block_x + 2) * sizeof(float),
+              left_ghost, sizeof(float), sizeof(float), block_y,
               cudaMemcpyHostToDevice, stream));
         } else {
           for (int j = 0; j < width; j++) {
@@ -465,11 +531,11 @@ class Stencil : public CBase_Stencil {
         break;
       case RIGHT:
         if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
-          memcpy(right_ghost, gh, width * sizeof(double));
-          hapiCheck(cudaMemcpy2DAsync(
+          memcpy(right_ghost, gh, width * sizeof(float));
+          hapiCheck(hapiMemcpy2DAsync(
               d_temperature + (block_x + 2) + (block_x + 1),
-              (block_x + 2) * sizeof(double), right_ghost, sizeof(double),
-              sizeof(double), block_y, cudaMemcpyHostToDevice, stream));
+              (block_x + 2) * sizeof(float), right_ghost, sizeof(float),
+              sizeof(float), block_y, cudaMemcpyHostToDevice, stream));
         } else {
           for (int j = 0; j < width; j++) {
             temperature[(block_x + 2) * (1 + j) + (block_x + 1)] = gh[j];
@@ -478,9 +544,9 @@ class Stencil : public CBase_Stencil {
         break;
       case BOTTOM:
         if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
-          memcpy(bottom_ghost, gh, width * sizeof(double));
-          hapiCheck(cudaMemcpyAsync(d_temperature + 1, bottom_ghost,
-                                    block_x * sizeof(double),
+          memcpy(bottom_ghost, gh, width * sizeof(float));
+          hapiCheck(hapiMemcpyAsync(d_temperature + 1, bottom_ghost,
+                                    block_x * sizeof(float),
                                     cudaMemcpyHostToDevice, stream));
         } else {
           for (int j = 0; j < width; j++) {
@@ -490,10 +556,10 @@ class Stencil : public CBase_Stencil {
         break;
       case TOP:
         if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
-          memcpy(top_ghost, gh, width * sizeof(double));
-          hapiCheck(cudaMemcpyAsync(
+          memcpy(top_ghost, gh, width * sizeof(float));
+          hapiCheck(hapiMemcpyAsync(
               d_temperature + (block_x + 2) * (block_y + 1) + 1, top_ghost,
-              block_x * sizeof(double), cudaMemcpyHostToDevice, stream));
+              block_x * sizeof(float), cudaMemcpyHostToDevice, stream));
         } else {
           for (int j = 0; j < width; j++) {
             temperature[(block_x + 2) * (block_y + 1) + (1 + j)] = gh[j];
@@ -505,60 +571,82 @@ class Stencil : public CBase_Stencil {
     }
   }
 
+  void ResumeFromSync()
+  {
+    CkCallback cb(CkReductionTarget(Stencil, compute), thisProxy);
+    contribute(cb);
+  }
+
+  // Runs after each update(): every 1000th iteration calls AtSync(), else compute().
+  void iterate()
+  {
+    if (my_iter > 0 && my_iter < num_iters && my_iter % 1000 == 0) {
+      // The stream is only created in the GPU modes; never touch it in CPU mode.
+      if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE)
+        hapiCheck(cudaStreamSynchronize(stream));
+      CkPrintf("Load balancing: %d/%d, iteration %d. GPU Load = %f\n",
+               thisFlatIndex, num_chares_x * num_chares_y, my_iter, getObjGPUTime());
+      AtSync();
+    } else {
+      if (thisFlatIndex == 0 && my_iter % 100 == 0)
+        CkPrintf("[%*d] Iteration %d\n", n_digits, thisFlatIndex, my_iter);
+      thisProxy(thisIndex.x, thisIndex.y).compute();
+    }
+  }
+
   // Updates local data with stencil computation.
   void update() {
 #ifdef USE_NVTX
     NVTXTracer nvtx_range(std::to_string(thisFlatIndex) + " Stencil::update", NVTXColor::Amethyst);
 #endif
 
-    CallbackMsg* m = new CallbackMsg();
     if (local_exec_mode == CUDA_MODE || local_exec_mode == HAPI_MODE) {
       // Invoke 2D stencil kernel
       invokeKernel(stream, d_temperature, d_new_temperature, block_x, block_y,
                    thread_size);
 
       // Transfer left ghost
-      hapiCheck(cudaMemcpy2DAsync(left_ghost, sizeof(double),
+      hapiCheck(hapiMemcpy2DAsync(left_ghost, sizeof(float),
             d_new_temperature + (block_x + 2),
-            (block_x + 2) * sizeof(double), sizeof(double),
+            (block_x + 2) * sizeof(float), sizeof(float),
             block_y, cudaMemcpyDeviceToHost, stream));
 
       // Transfer right ghost
       hapiCheck(
-          cudaMemcpy2DAsync(right_ghost, sizeof(double),
+          hapiMemcpy2DAsync(right_ghost, sizeof(float),
             d_new_temperature + (block_x + 2) + (block_x + 1),
-            (block_x + 2) * sizeof(double), sizeof(double),
+            (block_x + 2) * sizeof(float), sizeof(float),
             block_y, cudaMemcpyDeviceToHost, stream));
 
       // Transfer bottom ghost
-      hapiCheck(cudaMemcpyAsync(bottom_ghost, d_new_temperature + 1,
-            block_x * sizeof(double), cudaMemcpyDeviceToHost,
+      hapiCheck(hapiMemcpyAsync(bottom_ghost, d_new_temperature + 1,
+            block_x * sizeof(float), cudaMemcpyDeviceToHost,
             stream));
 
       // Transfer top ghost
-      hapiCheck(cudaMemcpyAsync(
+      hapiCheck(hapiMemcpyAsync(
             top_ghost, d_new_temperature + (block_x + 2) * (block_y + 1) + 1,
-            block_x * sizeof(double), cudaMemcpyDeviceToHost, stream));
+            block_x * sizeof(float), cudaMemcpyDeviceToHost, stream));
 
       // Copy final temperature data back to host (on last iteration)
       if (my_iter == num_iters - 1) {
         hapiCheck(
-            cudaMemcpyAsync(temperature, d_new_temperature,
-                            sizeof(double) * (block_x + 2) * (block_y + 2),
+            hapiMemcpyAsync(temperature, d_new_temperature,
+                            sizeof(float) * (block_x + 2) * (block_y + 2),
                             cudaMemcpyDeviceToHost, stream));
       }
 
       if (local_exec_mode == CUDA_MODE) {
         cudaStreamSynchronize(stream);
 
-        thisProxy(thisIndex.x, thisIndex.y).iterate(m);
+        thisProxy(thisIndex.x, thisIndex.y).iterate();
       } else {
         CkArrayIndex2D myIndex = CkArrayIndex2D(thisIndex);
         CkCallback* cb =
-            new CkCallback(CkIndex_Stencil::iterate(NULL), myIndex, thisProxy);
-        if (gpu_prio)
-          CkSetQueueing(m, CK_QUEUEING_LIFO);
-        hapiAddCallback(stream, cb, m);
+            new CkCallback(CkIndex_Stencil::iterate(), myIndex, thisProxy);
+        //if (gpu_prio)
+        //  CkSetQueueing(m, CK_QUEUEING_LIFO);
+        hapiAddCallback(stream, cb);
       }
     } else {  // CPU_MODE
       for (int i = 1; i <= block_x; ++i) {
@@ -573,12 +661,12 @@ class Stencil : public CBase_Stencil {
               DIVIDEBY5;
         }
       }
-      double* tmp;
+      float* tmp;
       tmp = temperature;
       temperature = new_temperature;
       new_temperature = tmp;
 
-      thisProxy(thisIndex.x, thisIndex.y).iterate(m);
+      thisProxy(thisIndex.x, thisIndex.y).iterate();
     }
   }
 
diff --git a/examples/charm++/cuda/stencil2d/stencil2d.ci b/examples/charm++/cuda/stencil2d/stencil2d.ci
index e39009f78b..467596bb2f 100644
--- a/examples/charm++/cuda/stencil2d/stencil2d.ci
+++ b/examples/charm++/cuda/stencil2d/stencil2d.ci
@@ -17,23 +17,21 @@ mainmodule stencil2d {
     entry CustomMap();
   };
 
-  message CallbackMsg;
 
   mainchare Main {
     entry Main(CkArgMsg* m);
     entry [reductiontarget] void initDone();
-    entry [reductiontarget] void done(double time);
+    entry [reductiontarget] void done(float times[n], int n);
   };
 
   array [2D] Stencil {
     entry Stencil(void);
     entry void init();
-    entry void receiveGhosts(int ref, int dir, int w, double gh[w]);
+    entry void receiveGhosts(int ref, int dir, int w, float gh[w]);
+    entry void iterate();
 
-    entry void iterate(CallbackMsg* m) {
+    entry [reductiontarget] void compute() {
       serial {
-        delete m;
-
         // Measure iteration time
         if (my_iter > 0) {
           agg_time += CkWallTimer() - iter_start_time;
@@ -42,10 +40,9 @@ mainmodule stencil2d {
 
         // Terminate if all iterations are complete
         if (my_iter >= num_iters) {
-          CkPrintf("[%*d] Average time per iteration: %lf\n", n_digits,
-                   thisFlatIndex, agg_time / num_iters);
+          float times[2] = {agg_time, getObjGPUTime()};
           CkCallback cb(CkReductionTarget(Main, done), mainProxy);
-          contribute(sizeof(double), &agg_time, CkReduction::sum_double, cb);
+          contribute(2 * sizeof(float), times, CkReduction::sum_float, cb);
         }
 
         // Send ghost data to neighbors
@@ -54,7 +51,7 @@ mainmodule stencil2d {
 
       // Receive ghost data from neighbors
       for (remote_count = 0; remote_count < neighbors; remote_count++) {
-        when receiveGhosts[my_iter](int ref, int dir, int w, double buf[w]) serial {
+        when receiveGhosts[my_iter](int ref, int dir, int w, float buf[w]) serial {
           processGhosts(dir, w, buf);
         }
       }
diff --git a/examples/charm++/cuda/stencil2d/stencil2d.cu b/examples/charm++/cuda/stencil2d/stencil2d.cu
index aff99c88dc..051e26d72c 100644
--- a/examples/charm++/cuda/stencil2d/stencil2d.cu
+++ b/examples/charm++/cuda/stencil2d/stencil2d.cu
@@ -4,7 +4,7 @@
 #define DIVIDEBY5 0.2
 #endif
 
-__global__ void stencil2DKernel(double* temperature, double* new_temperature,
+__global__ void stencil2DKernel(float* temperature, float* new_temperature,
                                 int block_x, int block_y, int thread_size) {
   int i_start = (blockDim.x * blockIdx.x + threadIdx.x) * thread_size + 1;
   int i_finish =
@@ -32,8 +32,8 @@ __global__ void stencil2DKernel(double* temperature, double* new_temperature,
   int j = jstart + threadIdx.y + blockDim.y*blockIdx.y;
 
   if (i < ifinish && j < jfinish) {
-    __shared__ double shared_temperature[TILE_SIZE][TILE_SIZE];
-    double center = temperature[j*(block_x+2)+i];
+    __shared__ float shared_temperature[TILE_SIZE][TILE_SIZE];
+    float center = temperature[j*(block_x+2)+i];
 
     shared_temperature[threadIdx.x][threadIdx.y] = center;
     __syncthreads();
@@ -55,8 +55,8 @@ __global__ void stencil2DKernel(double* temperature, double* new_temperature,
   */
 }
 
-void invokeKernel(cudaStream_t stream, double* d_temperature,
-                  double* d_new_temperature, int block_x, int block_y,
+void invokeKernel(cudaStream_t stream, float* d_temperature,
+                  float* d_new_temperature, int block_x, int block_y,
                   int thread_size) {
   dim3 block_dim(TILE_SIZE, TILE_SIZE);
   dim3 grid_dim(
@@ -64,7 +64,10 @@ void invokeKernel(cudaStream_t stream, double* d_temperature,
       (block_y + (block_dim.y * thread_size - 1)) /
           (block_dim.y * thread_size));
 
-  stencil2DKernel<<<grid_dim, block_dim, 0, stream>>>(
-      d_temperature, d_new_temperature, block_x, block_y, thread_size);
-  hapiCheck(cudaPeekAtLastError());
+  // stencil2DKernel<<<grid_dim, block_dim, 0, stream>>>(
+  //     d_temperature, d_new_temperature, block_x, block_y, thread_size);
+  // hapiCheck(cudaPeekAtLastError());
+
+    hapiCheck(hapiLaunchKernelWrapper(stencil2DKernel, grid_dim, block_dim, 0, stream,
+      d_temperature, d_new_temperature, block_x, block_y, thread_size));
 }
diff --git a/examples/charm++/cuda/vecadd/Makefile b/examples/charm++/cuda/vecadd/Makefile
index 8ac6ee0487..c4dfd1967b 100644
--- a/examples/charm++/cuda/vecadd/Makefile
+++ b/examples/charm++/cuda/vecadd/Makefile
@@ -6,12 +6,12 @@ CHARMC = ../../../../bin/charmc $(DEFS) $(OPTS)
 DEFS = #-DUSE_WR -USE_NVTX
 
 # set CUDATOOLKIT_HOME to the CUDA toolkit directory
-CUDATOOLKIT_HOME ?= /usr/local/cuda
+CUDATOOLKIT_HOME ?= /usr/
 NVCC = $(CUDATOOLKIT_HOME)/bin/nvcc
-NVCC_FLAGS = -c -std=c++11 $(DEFS)
+NVCC_FLAGS = -c -std=c++11 -lcuda -lnccl $(DEFS)
 NVCC_INC = -I$(CUDATOOLKIT_HOME)/include
 CHARM_INC = -I../../../../include
-LD_LIBS = #-lnvToolsExt
+LD_LIBS = -lcuda -lnccl #-lnvToolsExt
 
 TARGET = vecadd
 all: $(TARGET)
diff --git a/examples/charm++/cuda/vecadd/vecadd.C b/examples/charm++/cuda/vecadd/vecadd.C
index c32cf79a0f..8ccd0ff057 100644
--- a/examples/charm++/cuda/vecadd/vecadd.C
+++ b/examples/charm++/cuda/vecadd/vecadd.C
@@ -4,23 +4,21 @@
 #ifdef USE_NVTX
 #include "hapi_nvtx.h"
 #endif
+#include "nccl.h"
 
 /* readonly */ CProxy_Main mainProxy;
-/* readonly */ int vectorSize;
+/* readonly */ CProxy_NCCLManager ncclManagerProxy;
 
-#ifdef USE_WR
-extern void cudaVecAdd(int, float*, float*, float*, cudaStream_t, void*);
-#else
-extern void cudaVecAdd(int, float*, float*, float*, float*, float*, float*,
-                       cudaStream_t, void*);
-#endif
+
+extern void cudaVecAdd(int vectorSize, float* h_A, float* d_A);
+extern void localReduce(float* A, float* result, int n);
 
 void randomInit(float* data, int size) {
 #ifdef USE_NVTX
   NVTXTracer nvtx_range("randomInit", NVTXColor::PeterRiver);
 #endif
   for (int i = 0; i < size; ++i) {
-    data[i] = rand() / (float)RAND_MAX;
+    data[i] = 10;
   }
 }
 
@@ -39,7 +37,6 @@ class Main : public CBase_Main {
     // default values
     mainProxy = thisProxy;
     numChares = 4;
-    vectorSize = 1024;
 
     // handle arguments
     int c;
@@ -48,9 +45,6 @@ class Main : public CBase_Main {
         case 'c':
           numChares = atoi(optarg);
           break;
-        case 's':
-          vectorSize = atoi(optarg);
-          break;
         default:
           CkPrintf("Usage: %s -c [chares] -s [vector size]\n", m->argv[0]);
           CkExit();
@@ -60,17 +54,23 @@ class Main : public CBase_Main {
 
     // print configuration
     CkPrintf("\n[CUDA vecadd example]\n");
-    CkPrintf("Chares: %d\n", numChares);
-    CkPrintf("Vector size: %d\n", vectorSize);
+    CkPrintf("Chares: %d\n", 1);
 
     // create 1D chare array
-    workers = CProxy_Workers::ckNew(numChares);
+    workers = CProxy_Workers::ckNew(1024, 4 * CkNumPes());
+
+    ncclManagerProxy = CProxy_NCCLManager::ckNew();
 
     // start measuring execution time
     startTime = CkWallTimer();
 
     // fire off all chares in array
+    //workers.begin();
+  }
+
+  void nccl_done() {
     workers.begin();
+    //ncclManagerProxy.ckLocalBranch()->localChares++;
   }
 
   void done() {
@@ -83,103 +83,123 @@ class Main : public CBase_Main {
   }
 };
 
+// Group (one branch per PE) holding the NCCL communicator and the device
+// buffers used to sum the local chares' vectors before the NCCL reduction.
+class NCCLManager : public CBase_NCCLManager {
+ private:
+  int localChares = 0;         // chares on this PE that called registerChare()
+  ncclUniqueId id;
+  ncclComm_t comm;
+  float* localRed = nullptr;   // device accumulator for this PE's chares
+  int redCount = 0;            // contributions received so far
+
+ public:
+  float* globalRed = nullptr;  // device buffer receiving the ncclReduce result
+
+  NCCLManager() {
+    if (CkMyPe() == 0) {
+      ncclGetUniqueId(&id);
+      thisProxy.recvNCCLId(sizeof(ncclUniqueId), (char*)id.internal);
+    }
+  }
+
+  NCCLManager(CkMigrateMessage* m) : CBase_NCCLManager(m) {}
+
+  void registerChare() { localChares++; }
+
+  void setupNCCL() {
+    CkPrintf("NCCL Unique ID generated by PE %d\n", CkMyPe());
+    CkCallback cb(CkIndex_Main::nccl_done(), mainProxy);
+    contribute(cb);
+  }
+
+  void recvNCCLId(int size, char* id_buf) {
+    memcpy(id.internal, id_buf, sizeof(ncclUniqueId));
+    ncclCommInitRank(&comm, CkNumPes(), id, CkMyPe());
+    CkCallback cb(CkIndex_Main::nccl_done(), mainProxy);
+    contribute(cb);
+  }
+
+  void deviceContribute(int size, float* data) {
+    if (localRed == nullptr) {  // first use: allocate zeroed accumulators
+      hapiCheck(cudaMalloc((void**)&localRed, sizeof(float) * size));
+      hapiCheck(cudaMalloc((void**)&globalRed, sizeof(float) * size));
+      hapiCheck(cudaMemset(localRed, 0, sizeof(float) * size));
+    }
+    localReduce(data, localRed, size);  // localRed += data
+    if (++redCount == localChares) {
+      ncclReduce((const void*)localRed, (void*)globalRed, size,
+                    ncclFloat, ncclSum, 0, comm, cudaStreamDefault);
+      redCount = 0;
+    }
+  }
+};
+
 class Workers : public CBase_Workers {
  private:
+  int vectorSize;
   float* h_A;
-  float* h_B;
-  float* h_C;
-#ifndef USE_WR
   float* d_A;
-  float* d_B;
-  float* d_C;
-#endif
   cudaStream_t stream;
 
  public:
-  Workers() {
-#ifdef USE_NVTX
-    NVTXTracer nvtx_range("Workers::Workers", NVTXColor::WetAsphalt);
-#endif
-
-    int size = sizeof(float) * vectorSize;
-    hapiCheck(cudaMallocHost(&h_A, size));
-    hapiCheck(cudaMallocHost(&h_B, size));
-    hapiCheck(cudaMallocHost(&h_C, size));
+  Workers(int size) : vectorSize(size) {
+    int dataSize = sizeof(float) * vectorSize;
+    hapiCheck(cudaMallocHost(&h_A, dataSize));
     hapiCheck(cudaStreamCreate(&stream));
-#ifndef USE_WR
-    hapiCheck(cudaMalloc(&d_A, size));
-    hapiCheck(cudaMalloc(&d_B, size));
-    hapiCheck(cudaMalloc(&d_C, size));
-#endif
+    hapiCheck(hapiMalloc((void**) &d_A, dataSize));
 
     srand(time(NULL));
     randomInit(h_A, vectorSize);
-    randomInit(h_B, vectorSize);
   }
 
-  ~Workers() {
-#ifdef USE_NVTX
-    NVTXTracer nvtx_range("Workers::~Workers", NVTXColor::WetAsphalt);
-#endif
+  Workers(CkMigrateMessage* m) : CBase_Workers(m) 
+  {
+    hapiCheck(cudaStreamCreate(&stream));    
+  }
 
+  ~Workers() {
     hapiCheck(cudaFreeHost(h_A));
-    hapiCheck(cudaFreeHost(h_B));
-    hapiCheck(cudaFreeHost(h_C));
     hapiCheck(cudaStreamDestroy(stream));
-#ifndef USE_WR
     hapiCheck(cudaFree(d_A));
-    hapiCheck(cudaFree(d_B));
-    hapiCheck(cudaFree(d_C));
-#endif
   }
 
-  void begin() {
-#ifdef USE_NVTX
-    NVTXTracer nvtx_range("Workers::begin", NVTXColor::Carrot);
-#endif
-
-    CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex);
-    CkCallback* cb =
-        new CkCallback(CkIndex_Workers::complete(), myIndex, thisArrayID);
-#ifdef USE_WR
-    cudaVecAdd(vectorSize, h_A, h_B, h_C, stream, (void*)cb);
-#else
-    cudaVecAdd(vectorSize, h_A, h_B, h_C, d_A, d_B, d_C, stream, (void*)cb);
-#endif
+  void pup(PUP::er& p) {
+    p | vectorSize;
+    if (p.isUnpacking()) {  // migration ctor allocates nothing: rebuild buffers
+      hapiCheck(cudaMallocHost(&h_A, vectorSize * sizeof(float)));
+      hapiCheck(hapiMalloc((void**) &d_A, vectorSize * sizeof(float)));
+    }
+    p(d_A, vectorSize, PUP::PUPMode::DEVICE);
+  }
 
-  void complete() {
-#ifdef USE_NVTX
-    NVTXTracer nvtx_range("Workers::complete", NVTXColor::Clouds);
-#endif
-
-#ifdef DEBUG
-    CkPrintf("[%d] A\n", thisIndex);
-    for (int i = 0; i < vectorSize; i++) {
-      CkPrintf("%.2f ", h_A[i]);
-    }
-    CkPrintf("\n");
+  void begin() {
+    ncclManagerProxy.ckLocalBranch()->registerChare();
+    CkCallback cb(CkIndex_Workers::reduction(), thisProxy);
+    contribute(cb);
+  }
 
-    CkPrintf("[%d] B\n", thisIndex);
-    for (int i = 0; i < vectorSize; i++) {
-      CkPrintf("%.2f ", h_B[i]);
-    }
-    CkPrintf("\n");
+  void reduction() {
+    ncclManagerProxy.ckLocalBranch()->deviceContribute(vectorSize, d_A);
 
-    CkPrintf("[%d] C\n", thisIndex);
-    for (int i = 0; i < vectorSize; i++) {
-      CkPrintf("%.2f ", h_C[i]);
+    if (thisIndex == 0) {
+      cudaMemcpy(h_A, ncclManagerProxy.ckLocalBranch()->globalRed,
+                 sizeof(float) * vectorSize, cudaMemcpyDeviceToHost);
     }
-    CkPrintf("\n");
 
-    CkPrintf("[%d] C-gold\n", thisIndex);
-    for (int j = 0; j < vectorSize; j++) {
-      h_C[j] = h_A[j] + h_B[j];
-      CkPrintf("%.2f ", h_C[j]);
+    for (int i = 0; i < vectorSize; ++i) {
+      if (thisIndex == 0) {
+        // Expected value is 10 * total number of chares (Main creates 4 * CkNumPes())
+        float expected = 10.0f * 4 * CkNumPes();
+        if (h_A[i] != expected) {
+          CkPrintf("Error at index %d: Expected %.2f, Got %.2f\n",
+                   i, expected, h_A[i]);
+          break;
+        }
+      }
     }
-    CkPrintf("\n");
-#endif
 
+    //hapiCheck(hapiFree(d_A));
     contribute(CkCallback(CkIndex_Main::done(), mainProxy));
   }
 };
diff --git a/examples/charm++/cuda/vecadd/vecadd.ci b/examples/charm++/cuda/vecadd/vecadd.ci
index 4b77228354..319aa86af6 100644
--- a/examples/charm++/cuda/vecadd/vecadd.ci
+++ b/examples/charm++/cuda/vecadd/vecadd.ci
@@ -1,15 +1,21 @@
 mainmodule vecadd {
-  readonly int vectorSize;
   readonly CProxy_Main mainProxy;
+  readonly CProxy_NCCLManager ncclManagerProxy;
 
   mainchare Main {
     entry Main(CkArgMsg* m);
+    entry [reductiontarget] void nccl_done();
     entry [reductiontarget] void done();
   };
 
+  group NCCLManager {
+    entry NCCLManager();
+    entry void recvNCCLId(int size, char id_buf[size]);
+  };
+
   array [1D] Workers {
-    entry Workers();
+    entry Workers(int size);
     entry void begin();
-    entry void complete();
+    entry [reductiontarget] void reduction();
   };
 };
diff --git a/examples/charm++/cuda/vecadd/vecadd.cu b/examples/charm++/cuda/vecadd/vecadd.cu
index 3dda85282a..844456dbf6 100644
--- a/examples/charm++/cuda/vecadd/vecadd.cu
+++ b/examples/charm++/cuda/vecadd/vecadd.cu
@@ -7,13 +7,13 @@
 #define B_INDEX 1
 #define C_INDEX 2
 
-__global__ void vecAdd(float* C, float* A, float* B, int n) {
+__global__ void vecAdd(float* C, float* A, int n) {
   // Get our global thread ID
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   // Make sure we do not go out of bounds
   if (id < n) {
-    C[id] = A[id] + B[id];
+    C[id] = C[id] + A[id];
   }
 }
 
@@ -27,42 +27,12 @@ void run_VECADD_KERNEL(hapiWorkRequest* wr, cudaStream_t kernel_stream,
 }
 #endif
 
-#ifdef USE_WR
-void cudaVecAdd(int vectorSize, float* h_A, float* h_B, float* h_C,
-                cudaStream_t stream, void* cb) {
-#else
-void cudaVecAdd(int vectorSize, float* h_A, float* h_B, float* h_C, float* d_A,
-                float* d_B, float* d_C, cudaStream_t stream, void* cb) {
-#endif
-  int size = vectorSize * sizeof(float);
-  dim3 dimBlock(BLOCK_SIZE, 1);
-  dim3 dimGrid((vectorSize - 1) / dimBlock.x + 1, 1);
-
-#ifdef USE_WR
-  // DEPRECATED
-  hapiWorkRequest* wr = hapiCreateWorkRequest();
-  wr->setExecParams(dimGrid, dimBlock);
-  wr->setStream(stream);
-  wr->addBuffer(h_A, size, true, false, true);
-  wr->addBuffer(h_B, size, true, false, true);
-  wr->addBuffer(h_C, size, false, true, true);
-  wr->setCallback(cb);
-#ifdef HAPI_TRACE
-  wr->setTraceName("vecadd");
-#endif
-  wr->setRunKernel(run_VECADD_KERNEL);
-  wr->copyUserData(&vectorSize, sizeof(int));
-
-  hapiEnqueue(wr);
-#else
-  hapiCheck(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
-  hapiCheck(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
-
-  vecAdd<<<dimGrid, dimBlock, 0, stream>>>(d_C, d_A, d_B, vectorSize);
-  hapiCheck(cudaPeekAtLastError());
-
-  hapiCheck(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
+void localReduce(float* A, float* result, int n) {
+  vecAdd<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
+      result, A, n);
+}
 
-  hapiAddCallback(stream, cb);
-#endif
+void cudaVecAdd(int vectorSize, float* h_A, float* d_A) {
+  int size = vectorSize * sizeof(float);
+  hapiCheck(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
 }
diff --git a/examples/charm++/hello/1darray/hello.C b/examples/charm++/hello/1darray/hello.C
index d83397da75..ba3d03570e 100644
--- a/examples/charm++/hello/1darray/hello.C
+++ b/examples/charm++/hello/1darray/hello.C
@@ -24,9 +24,10 @@ public:
     CkArrayOptions opts;
     opts.setNumInitial(nElements);
     CkCallback initCB(CkIndex_Main::initDone(), thisProxy);
-    opts.setInitCallback(initCB);
+    opts.setInitCallback(initCB);  // initCB targets Main::initDone
     opts.setStaticInsertion(true);
     arrProxy = CProxy_Hello::ckNew(opts);
+    // (intentionally no CkExit() here)
   };
 
   void initDone(void) {
@@ -48,6 +49,7 @@ public:
   Hello()
   {
     CkPrintf("Hello %d created\n",thisIndex);
+    // Constructing an element must not end the program (no CkExit() here).
   }
 
   Hello(CkMigrateMessage *m) {}
diff --git a/examples/charm++/osu_bw/Makefile b/examples/charm++/osu_bw/Makefile
new file mode 100644
index 0000000000..1b94ba7513
--- /dev/null
+++ b/examples/charm++/osu_bw/Makefile
@@ -0,0 +1,27 @@
+-include ../../common.mk
+CHARMC=../../../bin/charmc -O3 $(OPTS)
+
+OBJS = osu_bw.o
+
+all: osu_bw
+
+osu_bw: $(OBJS)
+	$(CHARMC) -language charm++ -o osu_bw $(OBJS)
+
+proj: $(OBJS)
+	$(CHARMC) -language charm++ -tracemode projections -o osu_bw.prj $(OBJS)
+
+osu_bw.o: osu_bw.C osu_bw.decl.h
+	$(CHARMC) -c osu_bw.C
+
+osu_bw.decl.h: osu_bw.ci
+	$(CHARMC)  osu_bw.ci
+
+clean:
+	rm -f *.decl.h *.def.h conv-host *.o osu_bw osu_bw.prj charmrun *~ *log *projrc *sts
+
+test: all
+	$(call run, +p4 ./osu_bw 32 32 4 )
+
+testp: all
+	$(call run, +p$(P) ./osu_bw $$(( $(P) * 80 ))  $$(( $(P) * 80 )) $$(( $(P) * 10 )) )
diff --git a/examples/charm++/osu_bw/osu_bw.C b/examples/charm++/osu_bw/osu_bw.C
new file mode 100644
index 0000000000..17f51390bf
--- /dev/null
+++ b/examples/charm++/osu_bw/osu_bw.C
@@ -0,0 +1,165 @@
+// osu_bw.C
+#include "osu_bw.decl.h"
+#include <cstdlib>
+#include <cstring>
+
+static const int DEFAULT_MIN = 1;
+static const int DEFAULT_MAX = 1<<22; // 4 MiB
+static const int DEFAULT_ITERS = 1000;
+static const int DEFAULT_SKIP  = 100;
+static const int DEFAULT_WIN   = 64;
+
+class Endpoint;
+class Main;
+
+class DataMsg : public CMessage_DataMsg {
+  public:
+    int size;
+    char* data;
+};
+
+class Main : public CBase_Main {
+  CProxy_Endpoint sender, receiver;
+  int minSize, maxSize, iters, skip, win;
+  int curSize;
+  double tMeasured;
+  int pendingReports;
+  int ready_count;
+
+ public:
+  Main(CkArgMsg* m) {
+    // Parse arguments
+    minSize = DEFAULT_MIN; maxSize = DEFAULT_MAX; iters = DEFAULT_ITERS; skip = DEFAULT_SKIP; win = DEFAULT_WIN;
+    ready_count = 0;
+    for (int i=1; i<m->argc; ++i) {
+      if (!strcmp(m->argv[i], "-m") && i+1<m->argc) minSize = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-M") && i+1<m->argc) maxSize = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-i") && i+1<m->argc) iters   = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-s") && i+1<m->argc) skip    = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-w") && i+1<m->argc) win     = atoi(m->argv[++i]);
+    }
+    delete m;
+    // A non-positive -i/-w or a negative -m/-s would hang or mis-time the run
+    if (minSize < 0 || iters <= 0 || skip < 0 || win <= 0)
+      CkAbort("Error: -m and -s must be >= 0; -i and -w must be > 0\n");
+    // Create endpoints - handle case where we only have 1 PE
+    if (CkNumPes() < 2) {
+      CkPrintf("Error: Need at least 2 PEs to run bandwidth test\n");
+      CkExit();
+      return;
+    }
+    sender   = CProxy_Endpoint::ckNew(thisProxy, iters, win, skip, 0);
+    receiver = CProxy_Endpoint::ckNew(thisProxy, iters, win, skip, 1);
+    sender.setPeer(receiver);
+    receiver.setPeer(sender);
+    // Header like OMB
+    CkPrintf("# OSU-style Bandwidth (Charm++)\n# Size       MB/s (MB=1e6)\n");
+    curSize = minSize;
+    pendingReports = 0;
+  }
+
+  void ready()
+  {
+    ready_count++;
+    if (ready_count == 2) {
+      nextSize();
+    }
+  }
+
+  void nextSize() {
+    if (curSize > maxSize) {
+      finish();
+      return;
+    }
+    pendingReports = 1;
+    sender.start(curSize);
+  }
+
+  void doneOne(double seconds) {
+    // Compute bandwidth in MB/s (decimal)
+    double bytes = double(curSize) * double(iters) * double(win);
+    double mbps  = bytes / seconds / 1.0e6;
+    CkPrintf("%-10d %.2f\n", curSize, mbps);
+    curSize = (curSize == 0) ? 1 : curSize * 2;
+    nextSize();
+  }
+
+  void finish() {
+    CkExit();
+  }
+};
+
+class Endpoint : public CBase_Endpoint {
+  CProxy_Endpoint peer;
+  CProxy_Main mainProxy;
+  int size, iters, window, skip;
+  int iter, inFlightRecv, recvInIter;
+  double t0;
+
+ public:
+  Endpoint(CProxy_Main m, int iters_, int window_, int skip_) : 
+              mainProxy(m), size(0), iters(iters_), window(window_), skip(skip_),
+              iter(0), inFlightRecv(0), recvInIter(0), t0(0.0) {}
+
+  void setPeer(CProxy_Endpoint p) { 
+    peer = p;
+    mainProxy.ready();
+  }
+
+  void start(int size_) {
+    size = size_; iter = 0; recvInIter = 0; inFlightRecv = 0; t0 = 0.0;
+    // Warmups + measured
+    // Kick off first window - but only from sender (PE 0)
+    if (CkMyPe() == 0) {
+      //CkPrintf("Starting bandwidth test: size=%d, iters=%d, window=%d, skip=%d\n", 
+      //         size, iters, window, skip);
+      sendWindow();
+    } else if (CkMyPe() == 1) {
+      CkPrintf("Receiver ready on PE %d\n", CkMyPe());
+    }
+  }
+
+  void sendWindow() {
+    // Start timer at end of warmup
+    if (iter == skip) t0 = CkWallTimer();
+    for (int w = 0; w < window; ++w) {
+      DataMsg* m = new (size) DataMsg;
+      //DataMsg* m = (DataMsg*)CkAllocMsg(DataMsg, sizeof(DataMsg) + size);
+      m->size = size;
+      // touch payload to avoid lazy effects
+      if (size > 0) memset(m->data, w, size);
+      peer.recv(m);
+    }
+    // Wait for ack from receiver to proceed to next window/iter
+  }
+
+  void recv(DataMsg* m) {
+    // Receiver counts messages and acks per window
+    recvInIter++;
+    //CkPrintf("Received message of size %d on PE %d, %d, %d\n", m->size, CkMyPe(), recvInIter, window);
+    if (recvInIter == window) {
+      recvInIter = 0;
+      peer.ack();
+    }
+    delete m;
+  }
+
+  void ack() {
+    // Sender advances iteration
+    iter++;
+    // After warmups + measured iterations, stop and report
+    if (iter == skip + iters) {
+      double t = CkWallTimer() - t0;
+      if (CkMyPe() == 0) {
+        //CkPrintf("Test completed, reporting results\n");
+        mainProxy.doneOne(t);
+      }
+      return;
+    }
+    // Otherwise send next window
+    if (CkMyPe() == 0) sendWindow();
+  }
+};
+
+#include "osu_bw.def.h"
+
diff --git a/examples/charm++/osu_bw/osu_bw.ci b/examples/charm++/osu_bw/osu_bw.ci
new file mode 100644
index 0000000000..7f44084d1e
--- /dev/null
+++ b/examples/charm++/osu_bw/osu_bw.ci
@@ -0,0 +1,23 @@
+// osu_bw.ci
+mainmodule osu_bw {
+  message DataMsg {
+    char data[];
+  };
+
+  mainchare Main {
+    entry Main(CkArgMsg* m);
+    entry void nextSize();
+    entry void doneOne(double seconds);
+    entry void finish();
+    entry void ready();
+  };
+
+  chare Endpoint {
+    entry Endpoint(CProxy_Main m, int iters, int window, int skip);
+    entry void setPeer(CProxy_Endpoint p);
+    entry void start(int size);
+    entry void recv(DataMsg *m);
+    entry void ack();
+  };
+}
+
diff --git a/examples/charm++/osu_latency/Makefile b/examples/charm++/osu_latency/Makefile
new file mode 100644
index 0000000000..619fd1d90e
--- /dev/null
+++ b/examples/charm++/osu_latency/Makefile
@@ -0,0 +1,21 @@
+-include ../../common.mk
+CHARMC=../../../bin/charmc $(OPTS)
+
+OBJS = osu_latency.o
+
+all: osu_latency
+
+osu_latency: $(OBJS)
+	$(CHARMC) -language charm++ -o osu_latency $(OBJS)
+
+osu_latency.decl.h: osu_latency.ci
+	$(CHARMC) osu_latency.ci
+
+clean:
+	rm -f *.decl.h *.def.h conv-host *.o osu_latency charmrun
+
+osu_latency.o: osu_latency.C osu_latency.decl.h
+	$(CHARMC) -c osu_latency.C
+
+test: all
+	$(call run, +p2 ./osu_latency)
diff --git a/examples/charm++/osu_latency/osu_latency.C b/examples/charm++/osu_latency/osu_latency.C
new file mode 100644
index 0000000000..eeaf1dc61b
--- /dev/null
+++ b/examples/charm++/osu_latency/osu_latency.C
@@ -0,0 +1,164 @@
+// osu_latency.C
+#include "osu_latency.decl.h"
+#include <cstdlib>
+#include <cstring>
+
+static const int DEFAULT_MIN = 0;
+static const int DEFAULT_MAX = 1<<22; // 4 MiB
+static const int DEFAULT_ITERS = 10000;
+static const int DEFAULT_SKIP  = 1000;
+
+CProxy_Main mainProxy;
+
+class Endpoint;
+class Main;
+
+class LatencyMsg : public CMessage_LatencyMsg {
+  public:
+    int size;
+    char* data;
+};
+
+class Main : public CBase_Main {
+  CProxy_Endpoint sender, receiver;
+  int minSize, maxSize, iters, skip;
+  int curSize;
+  int ready_count;
+
+ public:
+  Main(CkArgMsg* m) {
+    // Parse arguments: -m <min size> -M <max size> -i <iterations> -s <warmup iterations>
+    minSize = DEFAULT_MIN; maxSize = DEFAULT_MAX; iters = DEFAULT_ITERS; skip = DEFAULT_SKIP;
+    ready_count = 0;
+    for (int i=1; i<m->argc; ++i) {
+      if (!strcmp(m->argv[i], "-m") && i+1<m->argc) minSize = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-M") && i+1<m->argc) maxSize = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-i") && i+1<m->argc) iters   = atoi(m->argv[++i]);
+      else if (!strcmp(m->argv[i], "-s") && i+1<m->argc) skip    = atoi(m->argv[++i]);
+    }
+    delete m;
+
+    // Refuse to start when the run cannot work: the ping-pong needs one PE per
+    // endpoint, a negative size cannot be allocated, and iters <= 0 leaves
+    // doneOne() dividing by zero (or the ping-pong never reaching its end).
+    const char* err = NULL;
+    if (CkNumPes() < 2) err = "Need at least 2 PEs to run latency test";
+    else if (minSize < 0 || iters <= 0 || skip < 0) err = "Need -m >= 0, -i > 0 and -s >= 0";
+    if (err) { CkPrintf("Error: %s\n", err); CkExit(); return; }
+    sender   = CProxy_Endpoint::ckNew(thisProxy, iters, skip, 0);
+    receiver = CProxy_Endpoint::ckNew(thisProxy, iters, skip, 1);
+    sender.setPeer(receiver);
+    receiver.setPeer(sender);
+    mainProxy = thisProxy;
+
+    // Header like OMB
+    CkPrintf("# OSU-style Latency (Charm++)\n# Size          Latency (us)\n");
+    curSize = minSize;
+  }
+
+  void ready() {
+    ready_count++;
+    if (ready_count == 2) {
+      nextSize();
+    }
+  }
+
+  void nextSize() {
+    if (curSize > maxSize) {
+      finish();
+      return;
+    }
+    sender.start(curSize);
+  }
+
+  void doneOne(double seconds) {
+    // Compute latency in microseconds (round-trip / 2)
+    double latency_us = (seconds / (2.0 * double(iters))) * 1.0e6;
+    CkPrintf("%-12d    %.2f\n", curSize, latency_us);
+    curSize = (curSize == 0) ? 1 : curSize * 2;
+    nextSize();
+  }
+
+  void finish() {
+    CkExit();
+  }
+};
+
+class Endpoint : public CBase_Endpoint {
+  CProxy_Endpoint peer;
+  int size, iters, skip;
+  int iter;
+  double t0;
+  bool is_sender;
+
+ public:
+  Endpoint(CProxy_Main m, int iters_, int skip_) : 
+              size(0), iters(iters_), skip(skip_),
+              iter(0), t0(0.0), is_sender(false) {}
+
+  void setPeer(CProxy_Endpoint p) { 
+    peer = p;
+    is_sender = (CkMyPe() == 0);
+    mainProxy.ready();
+  }
+
+  void start(int size_) {
+    size = size_; 
+    iter = 0; 
+    t0 = 0.0;
+    
+    if (is_sender) {
+      // Start the ping-pong
+      sendPing();
+    }
+  }
+
+  void sendPing() {
+    // Start timer at end of warmup
+    if (iter == skip) {
+      t0 = CkWallTimer();
+    }
+    
+    LatencyMsg* m = new (size) LatencyMsg;
+    m->size = size;
+    // touch payload to avoid lazy effects
+    if (size > 0) {
+      memset(m->data, iter % 256, size);
+    }
+    peer.ping(m);
+  }
+
+  void ping(LatencyMsg* m) {
+    // Receiver gets ping and sends pong back
+    if (!is_sender) {
+      LatencyMsg* reply = new (m->size) LatencyMsg;
+      reply->size = m->size;
+      if (m->size > 0) {
+        memcpy(reply->data, m->data, m->size);
+      }
+      peer.pong(reply);
+    }
+    delete m;
+  }
+
+  void pong(LatencyMsg* m) {
+    // The payload is never read here, so free it first; this also covers a pong
+    // delivered to the non-sender, which previously leaked the message.
+    delete m;
+    if (!is_sender) return;
+
+    iter++;  // sender got its pong back: one round trip completed
+
+    // After warmups + measured iterations, stop and report
+    if (iter == skip + iters) {
+      double t = CkWallTimer() - t0;
+      mainProxy.doneOne(t);
+      return;
+    }
+
+    // Otherwise send next ping
+    sendPing();
+  }
+};
+
+#include "osu_latency.def.h"
diff --git a/examples/charm++/osu_latency/osu_latency.ci b/examples/charm++/osu_latency/osu_latency.ci
new file mode 100644
index 0000000000..d8730a70f7
--- /dev/null
+++ b/examples/charm++/osu_latency/osu_latency.ci
@@ -0,0 +1,26 @@
+// osu_latency.ci
+mainmodule osu_latency {
+  readonly CProxy_Main mainProxy;
+
+  message LatencyMsg {
+    char data[];
+  };
+
+  mainchare Main {
+    entry Main(CkArgMsg* m);
+    entry void ready();
+    entry void nextSize();
+    entry void doneOne(double seconds);
+    entry void finish();
+  };
+
+  chare Endpoint {
+    entry Endpoint(CProxy_Main m, int iters, int skip);
+    entry void setPeer(CProxy_Endpoint p);
+    entry void start(int size);
+    entry void sendPing();
+    entry void ping(LatencyMsg* m);
+    entry void pong(LatencyMsg* m);
+  };
+
+};
diff --git a/examples/charm++/shrink_expand/README b/examples/charm++/shrink_expand/README
index 685431a40d..986c8e0183 100644
--- a/examples/charm++/shrink_expand/README
+++ b/examples/charm++/shrink_expand/README
@@ -1,20 +1,25 @@
 To be able to shrink and expand an application
     1 - Needs to run with a load balancer
-    2 - Ccs server option needs to be added buring runtime
+    2 - Ccs server option needs to be added during runtime
 
 Example running command:
-./charmrun +p4 jacobi2d 200 20 +balancer GreedyLB +LBDebug 3 ++nodelist ./mynodelistfile ++server ++server-port 1234
+./charmrun_elastic +p4 jacobi2d 200 20 +balancer GreedyLB +LBDebug 3 ++nodelist ./mynodelistfile ++server ++server-port 1234
+
+Omit the ++nodelist ./mynodelistfile argument when running locally.
+
+Use the MPI machinefile format for mynodelistfile. For example:
+
+<ip-addr-1> slots=<nslots-1>
+<ip-addr-2> slots=<nslots-2>
+<ip-addr-3> slots=<nslots-3>
 
 Use the client to send the shrink or expand command to the running application:
-./client <hostname> <port> <oldprocs> <newprocs>
+./client <hostname> <port> <oldpes> <number_of_pes_to_be_killed> <list_pes_to_be_killed> <number_of_pes_to_be_added>
 
 For example this command will expand the application from 4 to 8 PEs:
-./client valor 1234 4 8
+./client <ip> 1234 4 0 4
 
-NOTE 1: Charm needs to built with --enable-shrinkexpand option.
+To shrink the application from 4 to 2 PEs by killing PEs 2, 3:
+./client <ip> 1234 4 2 2 3 0
 
-NOTE 2: Let's say you want to shrink your application from 2 nodes to 1 node where
-each node has 8 cores, you should have repetitive 8 entries in the nodelist file
-for the number of cores in each node. Otherwise, you'll end up shrinking your
-application in a way that it'll use 4 cores from each node whereas what you really
-want is(usually) to only use 8 cores in one of the nodes after shrink.
+NOTE 1: Charm needs to be built with the --enable-shrinkexpand option.
diff --git a/examples/charm++/shrink_expand/client.C b/examples/charm++/shrink_expand/client.C
index 0292894fdf..d360eadef2 100644
--- a/examples/charm++/shrink_expand/client.C
+++ b/examples/charm++/shrink_expand/client.C
@@ -14,58 +14,59 @@ int main (int argc, char **argv)
 {
     int OLDNPROCS, NEWNPROCS;
 
-    if (argc < 5) {
-        printf("Usage: %s <hostname> <port> <oldprocs> <newprocs> \n", argv[0]);
-        return 1;
-    }
-
     // Create a CcsServer and connect to the given hostname and port
     CcsServer server;
-    char host[BUF], *bitmap;
-    int i, port, cmdLen, mode;
+    char host[BUF], *msg;
+    int i, port, cmdLen, numKilled = 0, numAdded = 0;
+    if (argc < 6) { printf("Usage: %s <hostname> <port> <oldpes> <number_of_pes_to_be_killed> <list_pes_to_be_killed> <number_of_pes_to_be_added>\n", argv[0]); return 1; }
 
     sprintf(host, "%s", argv[1]);
     sscanf(argv[2], "%d", &port);
     sscanf(argv[3], "%d", &OLDNPROCS);
-    sscanf(argv[4], "%d", &NEWNPROCS);
+    sscanf(argv[4], "%d", &numKilled);
+    if (numKilled < 0 || argc < 6 + numKilled) { printf("Error: expected %d PE indices followed by the number of PEs to add\n", numKilled); return 1; } // argv is indexed up to 5 + numKilled below
+    int killedIndex[numKilled > 0 ? numKilled : 1]; // avoid a zero-length array when nothing is killed
+    for (i = 0; i < numKilled; i++) {
+        sscanf(argv[5 + i], "%d", &killedIndex[i]);
+    }
+
+    sscanf(argv[5 + numKilled], "%d", &numAdded);
+
+    NEWNPROCS = OLDNPROCS - numKilled + numAdded;
 
-    if( NEWNPROCS > OLDNPROCS)
-        mode = EXPAND;
-    else if(OLDNPROCS > NEWNPROCS)
-        mode = SHRINK;
-    else{
-        printf("Error: Old and new PE number is the same!\n");
+    //printf("Connecting to server %s %d\n", host, port);
+    if (CcsConnect(&server, host, port, NULL) == -1) {
+        printf("0");
         return 0;
     }
-    printf("Connecting to server %s %d\n", host, port);
-    CcsConnect(&server, host, port, NULL);
-    printf("Connected to server\n");
+    //printf("Connected to server\n");
 
-    cmdLen = OLDNPROCS * sizeof(char) + sizeof(int) + sizeof(char);
-    bitmap = (char *) malloc(cmdLen);
-
-    if (mode == EXPAND) {
-        printf("Sending expand command.\n");
-        for (i = 0; i < OLDNPROCS; i++) {
-            bitmap[i] = 1;
+    cmdLen = 2 * sizeof(int) + OLDNPROCS * sizeof(char);
+    msg = (char *) malloc(cmdLen);
+    memcpy(msg, &NEWNPROCS, sizeof(int));
+    memcpy(&msg[sizeof(int)], &OLDNPROCS, sizeof(int));
+    
+    int offset = 2 * sizeof(int);
+    // Mark every PE alive (1), then clear each PE named in the kill list (0).
+    if (OLDNPROCS > 0) memset(msg + offset, 1, OLDNPROCS);
+    for (i = 0; i < numKilled; i++) {
+        if (killedIndex[i] >= 0 && killedIndex[i] < OLDNPROCS) {
+            msg[killedIndex[i] + offset] = 0;
         }
+        else
+            fprintf(stderr, "Warning: ignoring out-of-range PE index %d\n", killedIndex[i]);
     }
-    else {
-        printf("Sending shrink command.\n");
-        for (i = 0; i < OLDNPROCS; i++) {
-            if (i < NEWNPROCS)
-                bitmap[i] = 1;
-            else
-                bitmap[i] = 0;
-        }
+
+    for (i = 0; i < OLDNPROCS; i++) {
+        printf("PE %d: %d\n", i, msg[i + offset]);
     }
-    memcpy(&bitmap[OLDNPROCS], &NEWNPROCS, sizeof(int));
-    bitmap[OLDNPROCS+sizeof(int)] = '\0';
-    CcsSendRequest(&server, "set_bitmap", 0, cmdLen, bitmap);
 
-    printf("Waiting for reply...\n" );
-    CcsRecvResponse(&server, cmdLen, bitmap , 180);
-    printf("Reply received.\n");
+    //memcpy(&msg[sizeof(bool)], &NEWNPROCS, sizeof(int));
+    CcsSendRequest(&server, "set_bitmap", 0, cmdLen, msg);
+
+    //printf("Waiting for reply...\n" );
+    //CcsRecvResponse(&server, cmdLen, msg , 180);
+    //printf("Reply received.\n");
 
     return 0;
 }
diff --git a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C
index 7a01e1a793..9e4a7cdd29 100644
--- a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C
+++ b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.C
@@ -23,7 +23,9 @@ public:
     CProxy_Jacobi array;
     int num_chares;
     int iterations;
+    int iterations_after_restart;
     int total_iterations;
+    int lbTime;
 	double stTime;
     double startTime;
 
@@ -35,6 +37,7 @@ public:
 
         // set iteration counter to zero
         iterations=0;
+        iterations_after_restart=0;
 
         // store the main proxy
         mainProxy = thisProxy;
@@ -55,8 +58,14 @@ public:
 	        total_iterations = atoi(m->argv[3]);
 	    }
 
+        // Load-balancing period in iterations (optional 4th argument).
+        // It is used as a modulus in Jacobi::begin_iteration, so fall back
+        // to the default when it is missing or not a positive integer.
+        lbTime = (m->argc > 4) ? atoi(m->argv[4]) : 100;
+        if (lbTime <= 0) lbTime = 100;
+
         // Create new array of worker chares
-        array = CProxy_Jacobi::ckNew(num_chare_cols, num_chare_rows);
+        array = CProxy_Jacobi::ckNew(lbTime, num_chare_cols, num_chare_rows);
 
         // save the total number of worker chares we have in this simulation
         num_chares = num_chare_rows*num_chare_cols;
@@ -81,6 +90,8 @@ public:
       // subtle: Chare proxy readonly needs to be updated manually because of
       // the object pointer inside it.
     mainProxy = thisProxy;
+    stTime = CkWallTimer();
+    iterations_after_restart = 0;
 
     CkPrintf("Resuming Jacobi on %d processors with (%d,%d) elements\n", CkNumPes(), num_chare_rows, num_chare_cols);
 
@@ -96,11 +107,13 @@ void report(int completed_iteration) {
         if (iterations == total_iterations || CkWallTimer()-stTime>=3000000) {
 			CkPrintf("Program Done! avg_it:%.6f\n",(CkWallTimer()-stTime)/iterations);
             CkExit();
+            //exit(0);
         } else {
-            if(iterations%1==0) CkPrintf("starting new iteration; iteration %d time: %.6lf time/itr::%.6f\n", iterations, CkWallTimer()-stTime,(CkWallTimer()-stTime)/iterations);
-            CkPrintf("Memory Usage: %ld bytes \n", CmiMemoryUsage());
+            if(iterations%10==0) CkPrintf("starting new iteration; iteration %d time: %.6lf time/itr::%.6f\n", iterations, CkWallTimer()-stTime,(CkWallTimer()-stTime)/iterations_after_restart);
+            //CkPrintf("Memory Usage: %ld bytes \n", CmiMemoryUsage());
             recieve_count=0;
             iterations++;
+            iterations_after_restart++;
             // Call begin_iteration on all worker chares in array
             startTime = CkWallTimer();
             array.begin_iteration();
@@ -113,6 +126,7 @@ void pup(PUP::er &p){
     p|num_chares;
     p|iterations;
     p|total_iterations;
+    p|lbTime;
     p|stTime;
     p|startTime;
     CkPrintf("Main's PUPer. \n");
@@ -130,13 +144,15 @@ public:
     int messages_due;
 	int iteration;
     int useLB;
+    int lbTime;
     array2d temperature;
 
     // Constructor, initialize values
-    Jacobi()
+    Jacobi(int lbTime_)
     : messages_due(4)
     , iteration(0)
     , useLB(1)
+    , lbTime(lbTime_)
     , temperature(block_height + 2, array1d(block_width + 2, 0.0))
     {
         usesAtSync = true;
@@ -144,9 +160,11 @@ public:
     }
 
     void pup(PUP::er &p){
+        //CkPrintf("[%d] Jacobi's PUPer. \n",CkMyPe());
         p|messages_due;
         p|iteration;
         p|useLB;
+        p|lbTime;
         p|temperature;
         /* There may be some more variables used in doWork */
     }
@@ -169,15 +187,15 @@ public:
     // Perform one iteration of work
     // The first step is to send the local state to the neighbors
     void begin_iteration(void) {
-        if (iteration %50 ==0 && useLB ) {
+        if (((iteration > 0 && iteration % lbTime == 0) || iteration == 10) && useLB) {
             useLB = 0;
-            if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d Calling LBD --------------------- iteration=%d\n",CkMyPe(),iteration);
+            //if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d Calling LBD --------------------- iteration=%d\n",CkMyPe(),iteration);
             AtSync();
         } else {
 
         useLB=1;
-        if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d started --------------------- iteration=%d\n",CkMyPe(),iteration);
-				iteration++;
+        //if(thisIndex.x==0 && thisIndex.y==0) CkPrintf("PROC#%d started --------------------- iteration=%d\n",CkMyPe(),iteration);
+		iteration++;
         // Copy left column and right column into temporary arrays
         array1d left_edge(block_height);
         array1d right_edge(block_height);
@@ -245,17 +263,28 @@ void ResumeFromSync() {begin_iteration();}
             // and write them to temperature[][] after all of the new values are computed.
             array2d new_temperature(block_height + 2, array1d(block_width + 2));
 
-            for(int i=1;i<block_height+1;++i){
-                for(int j=1;j<block_width+1;++j){
-                    // update my value based on the surrounding values
-                    new_temperature[i][j] = (temperature[i-1][j]+temperature[i+1][j]+temperature[i][j-1]+temperature[i][j+1]+temperature[i][j]) / 5.0;
-
+            const int tile_size_i = 1024;
+            const int tile_size_j = 1024;
+            for(int ii=1; ii<block_height+1; ii+=tile_size_i){
+                for(int jj=1; jj<block_width+1; jj+=tile_size_j){
+                    for(int i=ii; i<std::min(ii+tile_size_i, block_height+1); ++i){
+                        for(int j=jj; j<std::min(jj+tile_size_j, block_width+1); ++j){
+                            // update my value based on the surrounding values
+                            new_temperature[i][j] = (temperature[i-1][j]+temperature[i+1][j]+temperature[i][j-1]+temperature[i][j+1]+temperature[i][j]) / 5.0;
+                        }
+                    }
                 }
             }
 
-            for(int i=0;i<block_height+2;++i)
-                for(int j=0;j<block_width+2;++j)
-                    temperature[i][j] = new_temperature[i][j];
+            for(int ii=0; ii<block_height+2; ii+=tile_size_i){
+                for(int jj=0; jj<block_width+2; jj+=tile_size_j){
+                    for(int i=ii; i<std::min(ii+tile_size_i, block_height+2); ++i){
+                        for(int j=jj; j<std::min(jj+tile_size_j, block_width+2); ++j){
+                            temperature[i][j] = new_temperature[i][j];
+                        }
+                    }
+                }
+            }
 
             // Enforce the boundary conditions again
             BC();
diff --git a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.ci b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.ci
index fb96eaad02..9e4da50471 100644
--- a/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.ci
+++ b/examples/charm++/shrink_expand/jacobi2d-iter/jacobi2d.ci
@@ -15,7 +15,7 @@ mainmodule jacobi2d {
 
   array [2D] Jacobi {
     // Normal Charm++ entry methods
-    entry Jacobi(void);
+    entry Jacobi(int);
     entry void begin_iteration(void);
     entry void ghostsFromLeft(int width, double s[width]);
     entry void ghostsFromRight(int width,double s[width]);
diff --git a/examples/charm++/shrink_expand/startup/Makefile b/examples/charm++/shrink_expand/startup/Makefile
new file mode 100644
index 0000000000..19b06c4b70
--- /dev/null
+++ b/examples/charm++/shrink_expand/startup/Makefile
@@ -0,0 +1,21 @@
+-include ../../../common.mk
+CHARMC=../../../../bin/charmc -O3 $(OPTS)
+
+OBJS = startup.o
+
+all: startup
+
+startup: $(OBJS)
+	$(CHARMC) -language charm++ -module CommonLBs -g -o startup $(OBJS)
+
+startup.decl.h: startup.ci
+	$(CHARMC)  startup.ci
+
+clean:
+	rm -f *.decl.h *.def.h conv-host *.o startup charmrun *~
+
+startup.o: startup.C startup.decl.h
+	$(CHARMC) -c startup.C
+
+test: all
+	$(call run, ./startup +p4 200 20 +balancer GreedyLB +LBDebug 3 ++server ++server-port 1234)
diff --git a/examples/charm++/shrink_expand/startup/startup.C b/examples/charm++/shrink_expand/startup/startup.C
new file mode 100644
index 0000000000..e8243aeae7
--- /dev/null
+++ b/examples/charm++/shrink_expand/startup/startup.C
@@ -0,0 +1,12 @@
+#include "startup.decl.h"
+
+class Main : public CBase_Main
+{
+public:
+
+    Main(CkArgMsg* m) {
+        CkExit();
+    }
+};
+
+#include "startup.def.h"
diff --git a/examples/charm++/shrink_expand/startup/startup.ci b/examples/charm++/shrink_expand/startup/startup.ci
new file mode 100644
index 0000000000..d77559b4e3
--- /dev/null
+++ b/examples/charm++/shrink_expand/startup/startup.ci
@@ -0,0 +1,7 @@
+mainmodule startup {
+
+  mainchare Main {
+    entry Main(CkArgMsg *m);
+  };
+
+};
diff --git a/src/arch/common/charmrun_elastic b/src/arch/common/charmrun_elastic
new file mode 100755
index 0000000000..f9e17224c3
--- /dev/null
+++ b/src/arch/common/charmrun_elastic
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Runs charmrun (from this script's directory) in a loop for shrink/expand:
+# exit code 100 from charmrun means "restart"; any other code ends the loop.
+is_restart=false
+
+original_args=("$@")
+
+if [[ "$(uname)" == "Darwin" ]]; then
+    TMPDIR="/tmp"
+else
+    TMPDIR="/dev/shm"
+fi
+
+pes_file="$TMPDIR/numRestartProcs.txt"
+
+time {
+while true; do
+    args=()
+    pes_arg=""
+    restart_arg=""
+
+    # 1. Separate the PE-count option (+pN, ++pN, "+p N", "++p N") from the rest.
+    temp_args=("${original_args[@]}")
+    i=0
+    while [ $i -lt ${#temp_args[@]} ]; do
+        arg="${temp_args[$i]}"
+        case "$arg" in
+        +p|++p)
+            i=$((i+1))
+            pes_arg="$arg ${temp_args[$i]}"
+            ;;
+        +p[0-9]*|++p[0-9]*)
+            pes_arg="$arg"
+            ;;
+        *)
+            args+=("$arg")
+            ;;
+        esac
+        i=$((i+1))
+    done
+
+    # 2. On a restart, add +restart and take the PE count from $pes_file when present.
+    if [ "$is_restart" = true ]; then
+        restart_arg="+restart $TMPDIR"
+        if [ -f "$pes_file" ]; then
+            num_pes=$(cat "$pes_file")
+            echo "Charm> Reading pes $num_pes from $pes_file"
+            pes_arg="+p $num_pes"
+        fi
+    fi
+
+    # 3. Launch. $pes_arg and $restart_arg stay unquoted on purpose: each may hold
+    #    two words ("+p 4", "+restart <dir>") or be empty and vanish.
+    "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg
+
+    EXIT_CODE=$?
+
+    if [ "$EXIT_CODE" -eq 100 ]; then
+        is_restart=true
+        echo "Restart signal (code 100) received. Looping again."
+        echo "----------------------------------------"
+    else
+        echo "Final exit signal (code $EXIT_CODE) received. Exiting loop."
+        break
+    fi
+done
+}
+
+echo "Control loop finished."
\ No newline at end of file
diff --git a/src/arch/common/charmrun_hapi b/src/arch/common/charmrun_hapi
new file mode 100755
index 0000000000..2064a05b7c
--- /dev/null
+++ b/src/arch/common/charmrun_hapi
@@ -0,0 +1,292 @@
+#!/bin/bash
+
+is_restart=false
+original_args=("$@")
+pes_file="/dev/shm/numRestartProcs.txt"
+original_nodelist_file="/tmp/hapi_original_nodelist.txt"
+
+# --- Pre-parse to find the nodelist for daemon startup ---
+machinefile=""
+for ((i=0; i<${#original_args[@]}; ++i)); do
+    if [[ "${original_args[i]}" == "++nodelist" ]]; then
+        machinefile="${original_args[i+1]}"
+        break
+    fi
+done
+
+num_nodes=0
+if [[ -n "$machinefile" ]]; then
+    if [[ ! -f "$machinefile" ]]; then
+        echo "Charmrun> Error: nodelist file not found: $machinefile" >&2
+        exit 1
+    fi
+    num_nodes=$(wc -l < "$machinefile")
+else
+    echo "Charmrun> Warning: ++nodelist not found. Assuming 1 node for HAPI daemon."
+    num_nodes=1
+fi
+
+# --- Clean up and start the memory daemon in the background ---
+# Read IP addresses and slots from nodelist file (format: ipaddress slots=X)
+declare -A node_slots
+node_ips=()
+if [[ -n "$machinefile" ]]; then
+    while IFS= read -r line; do
+        # Extract IP address and slots count
+        ip=$(echo "$line" | awk '{print $1}')
+        slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+        if [[ -n "$ip" ]]; then
+            node_ips+=("$ip")
+            # Default to 1 slot if not specified
+            node_slots["$ip"]=${slots:-1}
+        fi
+    done < "$machinefile"
+else
+    # Default to localhost if no nodelist
+    node_ips=("localhost")
+    node_slots["localhost"]=1
+fi
+
+# Snapshot the nodelist this invocation starts with; restarts are compared against it.
+# Always overwrite: a file left behind by an aborted earlier run would otherwise
+# become the baseline and skew new-node detection.
+if [[ -n "$machinefile" ]]; then
+    cp "$machinefile" "$original_nodelist_file"
+    echo "Charmrun> Saved original nodelist to $original_nodelist_file"
+else
+    echo "localhost slots=1" > "$original_nodelist_file"
+    echo "Charmrun> Created default nodelist file at $original_nodelist_file"
+fi
+
+# Parse nodelist file $1 ("<host> slots=<n>" lines) into original_nodes / original_node_slots
+get_nodes_from_file() {
+    local file="$1"
+    local -A nodes_map
+    local -a nodes_list
+    local line ip slots  # loop scratch variables; keep them out of the global scope
+    if [[ -f "$file" && -s "$file" ]]; then
+        while IFS= read -r line; do
+            # Skip empty lines and comments
+            [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]] && continue
+            ip=$(echo "$line" | awk '{print $1}')
+            slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+            if [[ -n "$ip" ]]; then
+                nodes_map["$ip"]=${slots:-1}
+                nodes_list+=("$ip")
+            fi
+        done < "$file"
+    fi
+    # Hand the results to the caller's arrays. Assign directly instead of through
+    # eval so that text read from the nodelist file is never executed as shell code.
+    original_nodes=("${nodes_list[@]}")
+    for ip in "${nodes_list[@]}"; do
+        original_node_slots["$ip"]=${nodes_map["$ip"]}
+    done
+}
+
+# Find nodes in node_ips that are missing from the saved original nodelist (hashmap lookup)
+find_new_nodes() {
+    declare -A original_node_slots
+    declare -a original_nodes
+    local ip orig_ip  # loop variables; keep them out of the global scope
+    # Get original nodes (fills the two arrays declared above)
+    get_nodes_from_file "$original_nodelist_file"
+
+    declare -A original_nodes_map
+    for orig_ip in "${original_nodes[@]}"; do
+        original_nodes_map["$orig_ip"]=1
+    done
+
+    local -a new_nodes
+    local -A new_node_slots
+
+    # Compare current nodes with original nodes using hashmap lookup
+    for ip in "${node_ips[@]}"; do
+        # Check if node exists in original nodes hashmap (O(1) lookup)
+        if [[ -z "${original_nodes_map[$ip]}" ]]; then
+            echo "Charmrun> New node detected: $ip"
+            new_nodes+=("$ip")
+            new_node_slots["$ip"]=${node_slots["$ip"]}
+        fi
+    done
+
+    # Return new nodes via the caller-declared globals; assigned directly so host names are never eval'd
+    detected_new_nodes=("${new_nodes[@]}")
+    for ip in "${new_nodes[@]}"; do
+        detected_new_node_slots["$ip"]=${new_node_slots["$ip"]}
+    done
+}
+
+# Clean up on all nodes via SSH (async)
+cleanup_pids=()
+echo "Charmrun> Initial cleanup on ${#node_ips[@]} node(s): ${node_ips[*]}"
+for ip in "${node_ips[@]}"; do
+    slots=${node_slots["$ip"]}
+    fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_pipe_* /tmp/client_pipe_* /tmp/daemon_ready_*"
+    for ((slot=0; slot<slots; slot++)); do
+        fifo_cmd="$fifo_cmd; mkfifo -m 0666 /tmp/daemon_ready_$slot"
+    done
+    ssh "$ip" "$fifo_cmd" &
+    cleanup_pids+=($!)
+done
+
+# Wait for cleanup to complete on all nodes before starting daemons
+for pid in "${cleanup_pids[@]}"; do
+    wait "$pid"
+done
+
+# Start memory daemons on all nodes via SSH (async) - one daemon per slot
+daemon_pids=()
+echo "Charmrun> Starting memory daemons on all nodes..."
+for ip in "${node_ips[@]}"; do
+    slots=${node_slots["$ip"]}
+    echo "Charmrun> Starting $slots daemon(s) on node $ip"
+    for ((slot=0; slot<slots; slot++)); do
+        ssh "$ip" "nohup $(dirname "$0")/hapi_memory_daemon $slot > /dev/null 2>&1 &" &
+        daemon_pids+=($!)
+    done
+done
+
+# Optional: Wait a brief moment for SSH connections to establish (non-blocking)
+# sleep 1
+
+# --- Main execution loop ---
+while true; do
+    # Reset and parse arguments for each run
+    args=()
+    pes_arg=""
+    restart_arg=""
+
+    temp_args=("${original_args[@]}")
+    i=0
+    while [ $i -lt ${#temp_args[@]} ]; do
+        arg="${temp_args[$i]}"
+        case "$arg" in
+        +p|++p)
+            i=$((i+1))
+            pes_arg="$arg ${temp_args[$i]}"
+            ;;
+        +p[0-9]*)
+            pes_arg="$arg"
+            ;;
+        ++p[0-9]*)
+            pes_arg="$arg"
+            ;;
+        *)
+            args+=("$arg")
+            ;;
+        esac
+        i=$((i+1))
+    done
+
+    # Check the flag. If it's a restart, prepare the extra arguments.
+    if [ "$is_restart" = true ]; then
+        restart_arg="+shrinkexpand +restart /dev/shm"
+        if [ -f "$pes_file" ]; then
+            num_pes=$(cat "$pes_file")
+            pes_arg="+p $num_pes"
+        fi
+        
+        echo "Charmrun> Restart detected - checking for new nodes..."
+        
+        # Re-read current nodelist to check for new nodes
+        declare -A current_node_slots
+        current_node_ips=()
+        if [[ -n "$machinefile" ]]; then
+            while IFS= read -r line; do
+                ip=$(echo "$line" | awk '{print $1}')
+                slots=$(echo "$line" | grep -o 'slots=[0-9]*' | cut -d'=' -f2)
+                if [[ -n "$ip" ]]; then
+                    current_node_ips+=("$ip")
+                    current_node_slots["$ip"]=${slots:-1}
+                fi
+            done < "$machinefile"
+        else
+            current_node_ips=("localhost")
+            current_node_slots["localhost"]=1
+        fi
+        
+        # Update global variables with current state
+        node_ips=("${current_node_ips[@]}")
+        for ip in "${current_node_ips[@]}"; do
+            node_slots["$ip"]=${current_node_slots["$ip"]}
+        done
+        
+        # Find new nodes
+        declare -a detected_new_nodes
+        declare -A detected_new_node_slots
+        find_new_nodes
+        
+        if [[ ${#detected_new_nodes[@]} -gt 0 ]]; then
+            echo "Charmrun> Found ${#detected_new_nodes[@]} new node(s): ${detected_new_nodes[*]}"
+            
+            # Clean up new nodes. NOTE(review): this removes /tmp/server_fifo_* and /tmp/client_fifo_*, while the initial cleanup removes /tmp/server_pipe_* and /tmp/client_pipe_* - confirm which names the daemon uses.
+            echo "Charmrun> Cleaning up new nodes..."
+            cleanup_pids=()
+            for ip in "${detected_new_nodes[@]}"; do
+                echo "Charmrun> Cleaning up node: $ip"
+                slots=${detected_new_node_slots["$ip"]}
+                fifo_cmd="rm -f /dev/shm/numRestartProcs.txt /tmp/server_fifo_* /tmp/client_fifo_* /tmp/daemon_ready_*"
+                for ((slot=0; slot<slots; slot++)); do
+                    fifo_cmd="$fifo_cmd; mkfifo -m 0666 /tmp/daemon_ready_$slot"
+                done
+                ssh "$ip" "$fifo_cmd" &
+                cleanup_pids+=($!)
+            done
+            
+            # Wait for cleanup to complete on new nodes
+            for pid in "${cleanup_pids[@]}"; do
+                wait "$pid"
+            done
+            
+            # Start memory daemons on new nodes
+            echo "Charmrun> Starting memory daemons on new nodes..."
+            daemon_pids=()
+            for ip in "${detected_new_nodes[@]}"; do
+                slots=${detected_new_node_slots["$ip"]}
+                echo "Charmrun> Starting $slots daemon(s) on new node $ip"
+                for ((slot=0; slot<slots; slot++)); do
+                    ssh "$ip" "nohup $(dirname "$0")/hapi_memory_daemon $slot > /dev/null 2>&1 &" &
+                    daemon_pids+=($!)
+                done
+            done
+            
+            # Update the original nodelist to include new nodes for future restarts
+            if [[ -n "$machinefile" ]]; then
+                cp "$machinefile" "$original_nodelist_file"
+                echo "Charmrun> Updated original nodelist with new nodes"
+            fi
+            
+            echo "Charmrun> New nodes setup completed"
+        else
+            echo "Charmrun> No new nodes detected"
+        fi
+    fi
+
+    # Pass all script arguments to the executable
+    "$(dirname "$0")/charmrun" $pes_arg "${args[@]}" $restart_arg
+
+    EXIT_CODE=$?
+
+    if [ "$EXIT_CODE" -eq 100 ]; then
+        is_restart=true
+        echo "Restart signal (code 100) received. Looping again."
+        echo "----------------------------------------"
+    else
+        echo "Final exit signal (code $EXIT_CODE) received. Exiting loop."
+        # Clean up the background daemon processes on all nodes
+        for ip in "${node_ips[@]}"; do
+            ssh "$ip" "pkill -f hapi_memory_daemon" &
+        done
+        # Also kill any remaining SSH connection PIDs
+        for pid in "${daemon_pids[@]}"; do
+            kill "$pid" 2>/dev/null
+        done
+        # Clean up temporary files
+        rm -f "$original_nodelist_file"
+        echo "Charmrun> Cleaned up temporary nodelist file"
+        break
+    fi
+done
+
+echo "Control loop finished."
\ No newline at end of file
diff --git a/src/arch/common/conv-mach-common.h b/src/arch/common/conv-mach-common.h
index 657366e07c..220d7dd7f4 100644
--- a/src/arch/common/conv-mach-common.h
+++ b/src/arch/common/conv-mach-common.h
@@ -122,5 +122,5 @@ enum cmiZCMsgType {
 
 /* GPU-aware communication is not supported by the machine layer by default */
 #ifndef CMK_GPU_COMM
-#define CMK_GPU_COMM 0
+#define CMK_GPU_COMM 1 /* NOTE(review): the comment above still says GPU-aware communication is not supported by default; confirm that defaulting to 1 is intended */
 #endif
diff --git a/src/arch/common/conv-mach-cuda.sh b/src/arch/common/conv-mach-cuda.sh
index aeab75527d..a95dcb6005 100644
--- a/src/arch/common/conv-mach-cuda.sh
+++ b/src/arch/common/conv-mach-cuda.sh
@@ -1,4 +1,4 @@
 BUILD_CUDA=1
-CMK_INCDIR="-I$CUDA_DIR/include $CMK_INCDIR "
-CMK_LIBDIR="-L$CUDA_DIR/lib64 $CMK_LIBDIR "
-CMK_LIBS="-lcudahybridapi -lcudart -lrt $CMK_LIBS "
+CMK_INCDIR="-I$CUDA_DIR/include -I$CUDA_DIR/extras/CUPTI/include $CMK_INCDIR "
+CMK_LIBDIR="-L$CUDA_DIR/lib64 -L$CUDA_DIR/extras/CUPTI/lib64 $CMK_LIBDIR "
+CMK_LIBS="-lhybridapi -lcudart -lcupti -lrt $CMK_LIBS "
diff --git a/src/arch/common/conv-mach-hip.h b/src/arch/common/conv-mach-hip.h
new file mode 100644
index 0000000000..f4fa6fc852
--- /dev/null
+++ b/src/arch/common/conv-mach-hip.h
@@ -0,0 +1,7 @@
+#undef  CMK_HIP
+#define CMK_HIP                                           1
+/* Idle-processor settings for this option: the busy-wait macro is set to 1 and the usleep macro to 0. */
+#undef CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT
+#define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
+#undef CMK_WHEN_PROCESSOR_IDLE_USLEEP
+#define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
diff --git a/src/arch/common/conv-mach-hip.sh b/src/arch/common/conv-mach-hip.sh
new file mode 100644
index 0000000000..de9998de1d
--- /dev/null
+++ b/src/arch/common/conv-mach-hip.sh
@@ -0,0 +1,21 @@
+BUILD_HIP=1
+# Locate the ROCm root: use $ROCM_PATH when it has an include directory,
+# otherwise probe the known install prefixes, defaulting to /opt/rocm.
+CMK_ROCM_PATH="/opt/rocm"
+for cmk_rocm_candidate in "$ROCM_PATH" /opt/rocm /opt/rocm-default /opt/rocm-6.2.4; do
+	if [ -n "$cmk_rocm_candidate" ] && [ -d "$cmk_rocm_candidate/include" ]; then
+		CMK_ROCM_PATH="$cmk_rocm_candidate"
+		break
+	fi
+done
+unset cmk_rocm_candidate
+
+# Library directory: <root>/lib unless it is missing and <root>/lib64 exists.
+CMK_ROCM_LIBDIR="$CMK_ROCM_PATH/lib"
+if [ ! -d "$CMK_ROCM_LIBDIR" ]; then
+	if [ -d "$CMK_ROCM_PATH/lib64" ]; then CMK_ROCM_LIBDIR="$CMK_ROCM_PATH/lib64"; fi
+fi
+
+CMK_INCDIR="-I$CMK_ROCM_PATH/include $CMK_INCDIR "
+CMK_LIBDIR="-L$CMK_ROCM_LIBDIR $CMK_LIBDIR "
+CMK_LIBS="-lhybridapi -lamdhip64 $CMK_LIBS "
diff --git a/src/arch/cuda/hybridAPI/Makefile b/src/arch/cuda/hybridAPI/Makefile
index edcca35943..9228099bc4 100644
--- a/src/arch/cuda/hybridAPI/Makefile
+++ b/src/arch/cuda/hybridAPI/Makefile
@@ -7,16 +7,16 @@ FLAGS     := $(OPTSATBUILDTIME)
 INC       := -I$(CUDA_PATH)/include -I..
 
 all: libs
-	cp libcudahybridapi.a $(CHARMDIR)/lib
+	cp libhybridapi.a $(CHARMDIR)/lib
 
-libs: libcudahybridapi.a
+libs: libhybridapi.a
 
-install: libcudahybridapi.a
-	cp libcudahybridapi.a $(CHARMDIR)/lib
+install: libhybridapi.a
+	cp libhybridapi.a $(CHARMDIR)/lib
 
-libcudahybridapi.a: hybridapi.o buddy_allocator.o
+libhybridapi.a: hybridapi.o buddy_allocator.o ck.o
 	-rm -f $@
-	ar q $@ hybridapi.o buddy_allocator.o
+	ar q $@ hybridapi.o buddy_allocator.o ck.o
 
 hybridapi.o: hapi_impl.cpp hapi_impl.h gpumanager.h devicemanager.h buddy_allocator.h hapi.h hapi_nvtx.h
 	$(CHARMC) $(FLAGS) $(INC) -o $@ -c $<
diff --git a/src/arch/cuda/hybridAPI/buddy_allocator.cpp b/src/arch/cuda/hybridAPI/buddy_allocator.cpp
index 8a940d4a99..add2485e93 100644
--- a/src/arch/cuda/hybridAPI/buddy_allocator.cpp
+++ b/src/arch/cuda/hybridAPI/buddy_allocator.cpp
@@ -4,7 +4,7 @@
 #include <cstdlib>
 #include <cmath>
 #include <algorithm>
-#include <cuda_runtime.h>
+#include <hapi_portable.h>
 
 namespace buddy {
   void allocator::print_status() {
@@ -43,6 +43,18 @@ namespace buddy {
     return free;
   }
 
+  size_t allocator::get_lb_free_size() {
+    // Start with the untouched tail of the LB region, i.e. everything from
+    // lb_ptr up to lb_base_ptr + lb_size.
+    size_t available = lb_size - (size_t)(lb_ptr - lb_base_ptr);
+
+    // Add every block sitting on the free list between the head and tail sentinels.
+    for (lb_free_list* node = head->next; node != tail; node = node->next) {
+      available += node->size;
+    }
+    return available;
+  }
+
   int allocator::get_bucket(size_t size) {
     return (int)std::ceil(std::log2((double)size)) - 2;
   }
@@ -51,41 +63,105 @@ namespace buddy {
     return (size_t)(ptr - base_ptr) / size;
   }
 
-  allocator::allocator(size_t size) : min_size(4), base_ptr(NULL) {
-    if (size == 0) {
+  allocator::allocator(size_t _comm_lb_size, size_t _comm_size) : min_size(4), base_ptr(NULL) {
+    if (_comm_size == 0) {
       fprintf(stderr, "Allocator size has to be larger than 0 bytes\n");
       abort();
     }
 
     // Request GPU memory (closest power of 2)
-    int total_size_log2 = std::ceil(std::log2((double)size));
-    total_size = (size_t)std::pow(2, total_size_log2);
-    cudaError_t status = cudaMalloc(&base_ptr, total_size);
-    if (status != cudaSuccess) {
+    this->comm_size = _comm_size;
+    this->total_size = _comm_lb_size; // assigned here so the DEBUG_PRINT below reads a valid value
+    if (hapiMalloc(&base_ptr, total_size) != hapiSuccess) {
       fprintf(stderr, "Failed to allocate GPU memory\n");
       abort();
     }
     DEBUG_PRINT("Initialized base_ptr %p with %zu bytes\n", (void*)base_ptr, total_size);
 
     // Initialize buckets and set up last bucket (for size min_size)
+    int total_size_log2 = std::ceil(std::log2((double)_comm_size));
     bucket_count = total_size_log2 - 1;
     buckets = new std::list<FreeBlock>[bucket_count];
-    buckets[bucket_count-1].emplace_back(base_ptr, total_size);
+    buckets[bucket_count-1].emplace_back(base_ptr, _comm_size);
+
+    // Set up the load-balancing (LB) region that follows the comm region.
+    // This runs even when the LB region is empty: free() compares against
+    // lb_base_ptr on every call and get_lb_free_size() walks head/tail.
+    if (_comm_lb_size < _comm_size) {
+      fprintf(stderr, "Total allocator size is smaller than the comm size\n");
+      abort();
+    }
+    lb_free_pool.resize(128);
+    // Pool slots 0 and 1 are the sentinel head and tail of the LB free list
+    head = &(lb_free_pool[0]);
+    lb_free_pool_taken[0] = true;
+    tail = &(lb_free_pool[1]);
+    lb_free_pool_taken[1] = true;
+    head->next = tail;
+    head->prev = nullptr;
+    tail->next = nullptr;
+    tail->prev = head;
+    this->lb_size = _comm_lb_size - _comm_size;
+    this->lb_base_ptr = base_ptr + _comm_size;
+    this->lb_ptr = lb_base_ptr;
+    this->total_size = _comm_lb_size;
   }
 
   allocator::~allocator() {
     // Free GPU memory
-    cudaError_t status = cudaFree(base_ptr);
-    if (status != cudaSuccess) {
+    hapiError_t status = hapiFree(base_ptr);
+    if (status != hapiSuccess) {
       fprintf(stderr, "Failed to free GPU memory\n");
       abort();
     }
     delete[] buckets;
   }
 
-  void* allocator::malloc(size_t request) {
+  void* allocator::malloc(size_t request, bool is_comm) {
+    if(!is_comm) {
+      // Load-balancing (LB) buffers come from the region that starts at lb_base_ptr.
+      // A zero-byte request is rejected: free() treats a recorded size of 0 as invalid.
+      if(request == 0 || request > lb_size) return nullptr;
+      lb_free_list* tmp = head->next;
+
+      // First fit: see if an existing free block can service the request
+      while (tmp != tail) {
+        if(tmp->size == request) {
+          // Exact fit: unlink the node and hand its slot back to the node pool
+          tmp->prev->next = tmp->next;
+          tmp->next->prev = tmp->prev;
+          void* ptr = tmp->ptr;
+          lb_ptr_size[ptr] = request;
+          tmp->next = nullptr;
+          tmp->prev = nullptr;
+          tmp->size = 0;
+          lb_free_pool_taken[tmp->indx] = false;
+          tmp->ptr = nullptr;
+          return ptr;
+        } else if(tmp->size > request) {
+          // Larger block: hand out its front and keep the remainder on the list
+          void* ptr = tmp->ptr;
+          lb_ptr_size[ptr] = request;
+          tmp->size = tmp->size - request;
+          // Cast first: arithmetic on void* is a compiler extension, not standard C++
+          tmp->ptr = (uint8_t*)tmp->ptr + request;
+          return ptr;
+        }
+        tmp = tmp->next;
+      }
+
+      // Otherwise bump-allocate from the unused tip of the region. Sizes are
+      // compared instead of pointers so no out-of-range pointer is ever formed.
+      if(request > lb_size - (size_t)(lb_ptr - lb_base_ptr)) return nullptr;
+      void* ptr = lb_ptr;
+      lb_ptr_size[ptr] = request;
+      lb_ptr = lb_ptr + request;
+      return ptr;
+    }
+
+    DEBUG_PRINT("REQUEST: %ld, TOTAL_SIZE: %ld", request, total_size);
     // Cannot satisfy request larger than total size
-    if (request > total_size) return nullptr;
+    if (request > comm_size) return nullptr;
 
     // Has to be larger than minimum allocation size (4 bytes)
     // Size is rounded up to the nearest power of 2
@@ -105,19 +181,14 @@ namespace buddy {
     }
 
     // Found bucket with free block, take it and start splitting if needed
-    FreeBlock& block = buckets[bucket].front();
+    FreeBlock block = buckets[bucket].front();
     uint8_t* ptr = block.ptr;
     size_t size = block.size;
     buckets[bucket].pop_front();
 
     while (bucket-- > original_bucket) {
-      buckets[bucket].emplace_back(ptr, size / 2);
-      buckets[bucket].emplace_back(ptr + size / 2, size / 2);
-
-      block = buckets[bucket].front();
-      ptr = block.ptr;
-      size = block.size;
-      buckets[bucket].pop_front();
+      size /= 2;
+      buckets[bucket].emplace_back(ptr + size, size);
     }
 
     // Store allocation info
@@ -126,17 +197,110 @@ namespace buddy {
         std::forward_as_tuple(ptr),
         std::forward_as_tuple(size, request));
 
-    DEBUG_PRINT("Allocated ptr %p (base_ptr + %zu) with %zu bytes, requested was %zu bytes\n",
-        (void*)ptr, (size_t)(ptr - base_ptr), size, request);
-
-#if BUDDY_DEBUG
-    print_status();
-#endif
 
     return ptr;
   }
 
   void allocator::free(void* ptr) {
+    if((uint8_t*)ptr >= lb_base_ptr) {
+      size_t alloc_size = lb_ptr_size[ptr];
+      if(alloc_size == 0) {
+        printf("Load balancing allocator got a request to free buffer of size 0\n");
+        fflush(stdout);
+        std::abort();
+      }
+
+      // see if the ptr is just before lb_ptr
+      if((uint8_t*)ptr + alloc_size == lb_ptr) {
+        lb_ptr -= alloc_size;
+        lb_ptr_size[ptr] = 0;
+        return;
+      }
+
+      // see if mergeable with any existing free block
+      lb_free_list* tmp = head->next;
+      bool merged = false;
+      while(tmp != tail) {
+        if (((uint8_t*)tmp->ptr + tmp->size) == (uint8_t*)ptr) {
+          tmp->size = tmp->size + alloc_size;
+          lb_free_list* tmp_next = tmp->next;
+
+          if (tmp_next != tail && (((uint8_t*)tmp->ptr + tmp->size) == (uint8_t*)tmp_next->ptr)) {
+            tmp->size = tmp->size + tmp_next->size;
+
+            tmp_next->prev->next = tmp_next->next;
+            tmp_next->next->prev = tmp_next->prev;
+            
+            tmp_next->next = nullptr;
+            tmp_next->prev = nullptr;
+            tmp_next->size = 0;
+            lb_free_pool_taken[tmp_next->indx] = false;
+            tmp_next->ptr = nullptr;
+          }
+          
+          merged = true;
+          break;
+        } else if(((uint8_t*)ptr + alloc_size) == (uint8_t*)tmp->ptr) {
+          tmp->size = tmp->size + alloc_size;
+          tmp->ptr = ptr;
+
+          merged = true;
+          break;
+        } else if (tmp->ptr > ptr) {
+          break;
+        }
+
+        tmp = tmp->next;
+      }
+
+      // see if merging reached the lb_ptr
+      if(merged) {
+        if((uint8_t*)tmp->ptr + tmp->size == lb_ptr) {
+            lb_ptr -= tmp->size;
+
+            tmp->prev->next = tmp->next;
+            tmp->next->prev = tmp->prev;
+            
+            tmp->next = nullptr;
+            tmp->prev = nullptr;
+            tmp->size = 0;
+            lb_free_pool_taken[tmp->indx] = false;
+            tmp->ptr = nullptr;
+
+          }
+
+          lb_ptr_size[ptr] = 0;
+          return;
+      }
+
+      // add a free node just before tmp; slots 0/1 are the sentinels, and the bound is tested before the map lookup
+      size_t free_space_indx = 2;
+      while(free_space_indx < lb_free_pool.size() && lb_free_pool_taken[free_space_indx])
+        free_space_indx++;
+      lb_free_list* free_node;
+      if(free_space_indx >= lb_free_pool.size()) {
+        // TODO : Implement this logic or just increase the default size of
+        // lb_free_pool
+        printf("Load balancing allocator does not have any more free nodes\n");
+        fflush(stdout);
+        std::abort();
+      } else {
+        free_node = &(lb_free_pool[free_space_indx]);
+        lb_free_pool_taken[free_space_indx] = true;
+      }
+
+      free_node->indx = free_space_indx;
+      free_node->ptr = ptr;
+      free_node->size = lb_ptr_size[ptr];
+      free_node->prev = tmp->prev;
+      free_node->next = tmp;
+
+      tmp->prev->next = free_node;
+      tmp->prev = free_node;
+
+      lb_ptr_size[ptr] = 0;
+      return;
+    }
     // Find pointer in allocation map
     auto alloc_it = alloc_map.find((uint8_t*)ptr);
     if (alloc_it == alloc_map.end()) {
@@ -167,29 +331,21 @@ namespace buddy {
       uint8_t* buddy_ptr = block_index_even ? (merge_ptr + merge_size) : (merge_ptr - merge_size);
 
       // If buddy is also free, merge
+      bool merged = false;
       for (std::list<FreeBlock>::iterator it = buckets[i].begin(); it != buckets[i].end(); it++) {
-        const auto& block = *it;
-        if (block.ptr == buddy_ptr) {
+        if (it->ptr == buddy_ptr) {
           buckets[i+1].emplace_back(block_index_even ? merge_ptr : buddy_ptr, 2 * merge_size);
-          buckets[i].erase(it); // Iterator is invalid after this erase
+          buckets[i].erase(it);
           buckets[i].pop_back();
+          merged = true;
           break;
         }
-        else {
-          // Did not find free buddy block, stop merging
-          goto merge_done;
-        }
       }
 
+      if (!merged) break;
+
       if (!block_index_even) merge_ptr = buddy_ptr;
       merge_size *= 2;
     }
-
-merge_done:
-    DEBUG_PRINT("Freed ptr %p with %zu bytes, requested was %zu bytes\n", ptr, size, requested);
-
-#if BUDDY_DEBUG
-    print_status();
-#endif
   }
 }
diff --git a/src/arch/cuda/hybridAPI/buddy_allocator.h b/src/arch/cuda/hybridAPI/buddy_allocator.h
index 5c7fa17c36..f78469ecef 100644
--- a/src/arch/cuda/hybridAPI/buddy_allocator.h
+++ b/src/arch/cuda/hybridAPI/buddy_allocator.h
@@ -5,6 +5,9 @@
 #include <cstdint>
 #include <list>
 #include <unordered_map>
+#include <vector>
+#include <string>
+#include <pthread.h>
 
 // A cached memory allocator with GPU memory as the backing store.
 // A fixed size allocation is initially made to the backing store,
@@ -47,13 +50,30 @@ namespace buddy {
       AllocBlock(size_t size_, size_t requested_) : size(size_), requested(requested_) {}
     };
 
+    struct lb_free_list { // node of the doubly linked free list used for load-balancing (LB) buffers
+      lb_free_list* next;
+      lb_free_list* prev;
+      void* ptr; // start address of the free block
+      size_t size; // size of the free block in bytes
+      size_t indx; // index of this node inside lb_free_pool
+    };
+    lb_free_list* head; // sentinel, points at lb_free_pool[0]
+    lb_free_list* tail; // sentinel, points at lb_free_pool[1]
+    std::vector<lb_free_list> lb_free_pool; // backing storage for list nodes (resized to 128 by the constructor)
+    std::unordered_map<size_t, bool> lb_free_pool_taken; // pool index -> node currently in use
+    std::unordered_map<void*, size_t> lb_ptr_size; // LB allocation -> requested size; reset to 0 by free()
+    uint8_t* lb_ptr; // first never-allocated byte of the LB region; moved forward by malloc, back by free
+
     // Allocation size limits
+    size_t comm_size;
+    size_t lb_size;
     size_t total_size;
     const size_t min_size;
 
     // Base pointer of the initial allocation
     uint8_t* base_ptr;
-
+    uint8_t* lb_base_ptr;
+  
     // Buckets each with a free list
     std::list<FreeBlock>* buckets;
     int bucket_count;
@@ -64,13 +84,14 @@ namespace buddy {
     // Utility functions
     void print_status();
     size_t get_free_size();
+    size_t get_lb_free_size();
     int get_bucket(size_t size);
     int get_block_index(uint8_t* ptr, size_t size);
 
     // Allocation functions
-    allocator(size_t size);
+    allocator(size_t comm_lb_size, size_t comm_size); // total bytes (comm + LB regions), bytes of the comm region
     ~allocator();
-    void* malloc(size_t request);
+    void* malloc(size_t request, bool is_comm = true); // is_comm == false allocates from the LB region
     void free(void* ptr);
   };
 }
diff --git a/src/arch/cuda/hybridAPI/devicemanager.h b/src/arch/cuda/hybridAPI/devicemanager.h
index 092a99d2e8..2351048aa7 100644
--- a/src/arch/cuda/hybridAPI/devicemanager.h
+++ b/src/arch/cuda/hybridAPI/devicemanager.h
@@ -1,7 +1,7 @@
 #ifndef __DEVICEMANAGER_H_
 #define __DEVICEMANAGER_H_
 
-#include <cuda_runtime.h>
+#include <hapi_portable.h>
 #include "converse.h"
 #include "buddy_allocator.h"
 
@@ -37,13 +37,13 @@ struct DeviceManager {
     return comm_buffer;
   }
 
-  void create_comm_buffer(size_t size) {
+  void create_comm_buffer(size_t total_size, size_t comm_size) {
     if (comm_buffer == nullptr)
-      comm_buffer = new buddy::allocator(size);
+      comm_buffer = new buddy::allocator(total_size, comm_size);
   }
 
-  void* alloc_comm_buffer(size_t size) {
-    return comm_buffer->malloc(size);
+  void* alloc_comm_buffer(size_t size, bool is_comm = true) {
+    return comm_buffer->malloc(size, is_comm);
   }
 
   void free_comm_buffer(size_t offset) {
@@ -54,6 +54,10 @@ struct DeviceManager {
     return comm_buffer->get_free_size();
   }
 
+  size_t get_lb_buffer_free_size() {
+    return comm_buffer->get_lb_free_size();
+  }
+
   void destroy_comm_buffer() {
     if (comm_buffer) {
       delete comm_buffer;
diff --git a/src/arch/cuda/hybridAPI/gpumanager.h b/src/arch/cuda/hybridAPI/gpumanager.h
index eab0064d94..6a96d516b3 100644
--- a/src/arch/cuda/hybridAPI/gpumanager.h
+++ b/src/arch/cuda/hybridAPI/gpumanager.h
@@ -1,16 +1,19 @@
 #ifndef __GPUMANAGER_H_
 #define __GPUMANAGER_H_
 
-#include <cuda_runtime.h>
 #include <vector>
 #include <string>
 #include <unordered_map>
 
+#include "hapi_portable.h"
 #include "converse.h"
 #include "hapi.h"
 #include "hapi_impl.h"
 #include "devicemanager.h"
 
+#include <unordered_map>
+#include <queue>
+
 // Initial size of the user-addressed portion of host/device buffer arrays;
 // the system-addressed portion of host/device buffer arrays (used when there
 // is no need to share buffers between work requests) will be equivalant in size.
@@ -24,19 +27,26 @@
 // CUDA IPC Event related struct, stored in host-wide shared memory.
 // One object is used for each interaction/message between sender and receiver.
 // The number of these objects per device will be equal to the CUDA IPC event pool size.
-struct cuda_ipc_event_shared {
-  cudaIpcEventHandle_t src_event_handle;
-  cudaIpcEventHandle_t dst_event_handle;
+struct hapi_ipc_event_shared {
+  hapiIpcEventHandle_t src_event_handle;
+  hapiIpcEventHandle_t dst_event_handle;
   bool src_flag; // Unused for now
   bool dst_flag;
   pthread_mutex_t lock;
 };
 
+#if CMK_LBDB_ON
+struct CuptiBufferItem {
+  uint8_t* buffer;
+  size_t validSize;
+};
+#endif
+
 // Per-device struct containing data for CUDA IPC.
 // Use SMP lock in DeviceManager if needed.
-struct cuda_ipc_device_info {
-  std::vector<cudaEvent_t> src_event_pool;
-  std::vector<cudaEvent_t> dst_event_pool;
+struct hapi_ipc_device_info {
+  std::vector<hapiEvent_t> src_event_pool;
+  std::vector<hapiEvent_t> dst_event_pool;
   // Flag per event pair (0: free, 1: used)
   std::vector<int> event_pool_flags;
   // Offset in device comm buffer (per event)
@@ -87,7 +97,7 @@ struct GPUManager {
   // specifies an invalid buffer ID.
   int next_buffer_;
 
-  cudaStream_t *streams_;
+  hapiStream_t *streams_;
   int n_streams_;
   int last_stream_id_;
 
@@ -121,9 +131,6 @@ struct GPUManager {
   CmiNodeLock device_mapping_lock;
 #endif
 
-#ifdef HAPI_CUDA_CALLBACK
-#endif
-
   int device_count; // GPU devices usable by this process (could be less than the number of visible devices)
   int device_count_on_physical_node;
   int pes_per_device;
@@ -134,8 +141,12 @@ struct GPUManager {
   // Device communication buffer
   size_t comm_buffer_size;
 
+  // Device load-balancing buffer
+  size_t lb_buffer_size;
+
   // POSIX shared memory for sharing CUDA IPC handles between processes on the same host
   bool use_shm;
+  bool test_field;
   void* shm_ptr;
   std::string shm_name;
   int shm_file;
@@ -144,12 +155,23 @@ struct GPUManager {
   void* shm_my_ptr;
 
   // CUDA IPC event pool
-  int cuda_ipc_event_pool_size_pe;
-  int cuda_ipc_event_pool_size_total;
+  int hapi_ipc_event_pool_size_pe;
+  int hapi_ipc_event_pool_size_total;
 
   // CUDA IPC handles opened for processes on the same node
   // Vector size is equal to the number of devices on the physical node
-  std::vector<cuda_ipc_device_info> cuda_ipc_device_infos;
+  std::vector<hapi_ipc_device_info> hapi_ipc_device_infos;
+
+  // CUPTI load balancing. Guarded by #if, like the CuptiBufferItem definition
+  // above, so a CMK_LBDB_ON defined as 0 does not reference a missing type.
+#if CMK_LBDB_ON
+  std::unordered_map<uint32_t, uint64_t> cupti_correlation_db_;//correlationID -> ObjectID
+  std::unordered_map<uint64_t, uint64_t> cupti_obj_gpu_times_;//objectID -> accumulated GPU time in ns
+
+  std::queue<CuptiBufferItem> cupti_buffer_queue_;
+
+  bool cupti_initialized_;
+#endif
 
   void init() {
     next_buffer_ = NUM_BUFFERS;
@@ -189,8 +211,8 @@ struct GPUManager {
     shm_my_ptr = NULL;
 
     // Number of CUDA IPC events per PE
-    cuda_ipc_event_pool_size_pe = -1;
-    cuda_ipc_event_pool_size_total = -1;
+    hapi_ipc_event_pool_size_pe = -1;
+    hapi_ipc_event_pool_size_total = -1;
 
     // Allocate host/device buffers array (both user and system-addressed)
     host_buffers_ = new void*[NUM_BUFFERS*2];
@@ -245,7 +267,7 @@ struct GPUManager {
     // Destroy streams
     if (streams_) {
       for (int i = 0; i < n_streams_; i++) {
-        hapiCheck(cudaStreamDestroy(streams_[i]));
+        hapiCheck(hapiStreamDestroy(streams_[i]));
       }
     }
 
@@ -277,9 +299,9 @@ struct GPUManager {
   // Returns the number of created streams.
   int createStreams() {
     int device;
-    cudaDeviceProp device_prop;
-    hapiCheck(cudaGetDevice(&device));
-    hapiCheck(cudaGetDeviceProperties(&device_prop, device));
+    hapiDeviceProp device_prop;
+    hapiCheck(hapiGetDevice(&device));
+    hapiCheck(hapiGetDeviceProperties(&device_prop, device));
 
     int new_n_streams = 0;
 
@@ -311,7 +333,7 @@ struct GPUManager {
     // Allocate total physical streams between GPU managers sharing a device...
     // i.e. PEs / num devices
     int device_count;
-    hapiCheck(cudaGetDeviceCount(&device_count));
+    hapiCheck(hapiGetDeviceCount(&device_count));
     int pes_per_device = CmiNumPesOnPhysicalNode(0) / device_count;
     pes_per_device = pes_per_device > 0 ? pes_per_device : 1;
     new_n_streams =  (new_n_streams + pes_per_device - 1) / pes_per_device;
@@ -327,9 +349,9 @@ struct GPUManager {
       return n_streams_;
     }
 
-    cudaStream_t* old_streams = streams_;
+    hapiStream_t* old_streams = streams_;
 
-    streams_ = new cudaStream_t[new_n_streams];
+    streams_ = new hapiStream_t[new_n_streams];
 
     int i = 0;
     // Copy old streams
@@ -340,7 +362,7 @@ struct GPUManager {
 
     // Create new streams
     for (; i < new_n_streams; i++) {
-      hapiCheck(cudaStreamCreate(&streams_[i]));
+      hapiCheck(hapiStreamCreate(&streams_[i]));
     }
 
     // Update
@@ -350,7 +372,7 @@ struct GPUManager {
     return n_streams_;
   }
 
-  cudaStream_t getNextStream() {
+  hapiStream_t getNextStream() {
     if (streams_ == NULL)
       return NULL;
 
@@ -358,7 +380,7 @@ struct GPUManager {
     return streams_[last_stream_id_];
   }
 
-  cudaStream_t getStream(int i) {
+  hapiStream_t getStream(int i) {
     if (streams_ == NULL)
       return NULL;
 
@@ -418,7 +440,7 @@ struct GPUManager {
 
       if (device_buffers_[index] == NULL) {
         // allocate device memory
-        hapiCheck(cudaMalloc((void **)&device_buffers_[index], size));
+        hapiCheck(hapiMalloc((void **)&device_buffers_[index], size));
 
 #ifdef HAPI_DEBUG
         CmiPrintf("[HAPI] allocated buffer %d at %p, time: %.2f, size: %zu\n",
@@ -438,8 +460,8 @@ struct GPUManager {
       host_buffers_[index] = bi.host_buffer;
 
       if (bi.transfer_to_device) {
-        hapiCheck(cudaMemcpyAsync(device_buffers_[index], host_buffers_[index], size,
-                                  cudaMemcpyHostToDevice, wr->stream));
+        hapiCheck(hapiMemcpyAsync(device_buffers_[index], host_buffers_[index], size,
+                                  hapiMemcpyHostToDevice, wr->stream));
 
 #ifdef HAPI_DEBUG
         CmiPrintf("[HAPI] transferring buffer %d from host to device, time: %.2f, "
@@ -457,8 +479,8 @@ struct GPUManager {
       size_t size = bi.size;
 
       if (bi.transfer_to_host) {
-        hapiCheck(cudaMemcpyAsync(host_buffers_[index], device_buffers_[index], size,
-                                  cudaMemcpyDeviceToHost, wr->stream));
+        hapiCheck(hapiMemcpyAsync(host_buffers_[index], device_buffers_[index], size,
+                                  hapiMemcpyDeviceToHost, wr->stream));
 
 #ifdef HAPI_DEBUG
         CmiPrintf("[HAPI] transferring buffer %d from device to host, time %.2f, "
@@ -475,7 +497,7 @@ struct GPUManager {
       int index = bi.id;
 
       if (bi.need_free) {
-        hapiCheck(cudaFree(device_buffers_[index]));
+        hapiCheck(hapiFree(device_buffers_[index]));
         device_buffers_[index] = NULL;
 
 #ifdef HAPI_DEBUG
diff --git a/src/arch/cuda/hybridAPI/hapi.h b/src/arch/cuda/hybridAPI/hapi.h
index 39a1f9c4a2..a2689ea664 100644
--- a/src/arch/cuda/hybridAPI/hapi.h
+++ b/src/arch/cuda/hybridAPI/hapi.h
@@ -1,6 +1,6 @@
 #ifndef __HAPI_H_
 #define __HAPI_H_
-#include <cuda_runtime.h>
+#include "hapi_portable.h"
 
 /* See hapi_functions.h for the majority of function declarations provided
  * by the Hybrid API. */
@@ -74,10 +74,10 @@ typedef struct hapiWorkRequest {
 #endif
 
   // Pointer to host-side function that actually invokes the kernel.
-  // The user implements this function, using the given CUDA stream and
+  // The user implements this function, using the given hapi stream and
   // device buffers (which are indexed by hapiBufferInfo->id).
   // Could be set to NULL if no kernel needs to be executed.
-  void (*runKernel)(struct hapiWorkRequest* wr, cudaStream_t kernel_stream,
+  void (*runKernel)(struct hapiWorkRequest* wr, hapiStream_t kernel_stream,
                     void** device_buffers);
 
   // flag used for control by the system
@@ -89,8 +89,8 @@ typedef struct hapiWorkRequest {
   // flags determining whether memory should be freed on destruction
   bool free_user_data;
 
-  // CUDA stream index provided by the user or assigned by GPUManager
-  cudaStream_t stream;
+  // hapi stream index provided by the user or assigned by GPUManager
+  hapiStream_t stream;
 
 #ifdef HAPI_INSTRUMENT_WRS
   double phase_start_time;
@@ -151,15 +151,15 @@ typedef struct hapiWorkRequest {
   }
 #endif
 
-  void setRunKernel(void (*_runKernel)(struct hapiWorkRequest*, cudaStream_t, void**)) {
+  void setRunKernel(void (*_runKernel)(struct hapiWorkRequest*, hapiStream_t, void**)) {
     runKernel = _runKernel;
   }
 
-  void setStream(cudaStream_t _stream) {
+  void setStream(hapiStream_t _stream) {
     stream = _stream;
   }
 
-  cudaStream_t getStream() {
+  hapiStream_t getStream() {
     return stream;
   }
 
@@ -189,7 +189,7 @@ typedef struct hapiWorkRequest hapiWorkRequest;
 
 #endif /* defined __cplusplus */
 
-// Provides support for detecting errors with CUDA API calls.
+// Provides support for detecting errors with hapi API calls.
 #ifndef HAPI_CHECK_OFF
 #define hapiCheck(code) hapiErrorDie(code, #code, __FILE__, __LINE__)
 #else
@@ -228,22 +228,54 @@ extern "C" {
 #ifdef __cplusplus
 
 // Provide a C++-only stub for this function's default parameter.
-void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg);
-static inline void hapiAddCallback(cudaStream_t stream, const CkCallback& cb) {
+void hapiAddCallback(hapiStream_t stream, const CkCallback& cb, void* cb_msg);
+static inline void hapiAddCallback(hapiStream_t stream, const CkCallback& cb) {
   hapiAddCallback(stream, cb, nullptr);
 }
-static inline void hapiAddCallback(cudaStream_t stream, void* cb) {
+static inline void hapiAddCallback(hapiStream_t stream, void* cb) {
   hapiAddCallback(stream, cb, nullptr);
 }
 
 // Overloaded C++ wrappers for selecting whether to pool or not using a bool.
-static inline cudaError_t hapiMallocHost(void** ptr, size_t size, bool pool) {
+static inline hapiError_t hapiMallocHost_Pool(void** ptr, size_t size, bool pool) {
   return pool ? hapiPoolMalloc(ptr, size) : hapiMallocHost(ptr, size);
 }
-static inline cudaError_t hapiFreeHost(void* ptr, bool pool) {
+static inline hapiError_t hapiFreeHost_Pool(void* ptr, bool pool) {
   return pool ? hapiPoolFree(ptr) : hapiFreeHost(ptr);
 }
 
+void hapiRecordTime(hapiStream_t stream, hapiEvent_t start);
+#if CMK_LBDB_ON
+void hapiCuptiInit();
+void hapiCuptiFinalize();
+uint64_t hapiCuptiPushObjCorrelation();
+void hapiCuptiPopObjCorrelation();
+void hapiProcessCuptiBuffers();
+void hapiClearCuptiData();
+#endif
+// NOTE: expands to several statements and (with load balancing on) declares 'start' in the caller's scope.
+#if CMK_LBDB_ON
+#define HAPI_LAUNCH_KERNEL_WRAPPER(call, stream)\
+    hapiEvent_t start;\
+    hapiEventCreate(&start);\
+    hapiEventRecord(start, stream);\
+    call;\
+    hapiRecordTime(stream, start);
+#else
+#define HAPI_LAUNCH_KERNEL_WRAPPER(call, stream)\
+    call;
+#endif
+// Brackets 'call' with the CUPTI object-correlation push/pop when load balancing is on; otherwise just runs 'call'.
+#if CMK_LBDB_ON
+#define CUPTI_LAUNCH_WRAPPER(call)\
+  hapiCuptiPushObjCorrelation();\
+  call;\
+  hapiCuptiPopObjCorrelation();
+#else
+#define CUPTI_LAUNCH_WRAPPER(call)\
+  call;
+#endif
+
 #endif /* defined __cplusplus */
 
 #endif /* !defined AMPI_INTERNAL_SKIP_FUNCTIONS */
diff --git a/src/arch/cuda/hybridAPI/hapi_functions.h b/src/arch/cuda/hybridAPI/hapi_functions.h
index ee2bcd2120..8d3fd917f2 100644
--- a/src/arch/cuda/hybridAPI/hapi_functions.h
+++ b/src/arch/cuda/hybridAPI/hapi_functions.h
@@ -30,26 +30,33 @@ AMPI_CUSTOM_FUNC(int, hapiCreateStreams, void)
 
 // Get a CUDA stream that was created by the runtime. Current scheme is to
 // hand out streams in a round-robin fashion.
-AMPI_CUSTOM_FUNC(cudaStream_t, hapiGetStream, void)
+AMPI_CUSTOM_FUNC(hapiStream_t, hapiGetStream, void)
 
 // Add a Charm++ callback function to be invoked after the previous operation
 // in the stream completes. This call should be placed after data transfers or
 // a kernel invocation.
-AMPI_CUSTOM_FUNC(void, hapiAddCallback, cudaStream_t, void*, void*)
+AMPI_CUSTOM_FUNC(void, hapiAddCallback, hapiStream_t, void*, void*)
 
 // Thin wrappers for memory related CUDA API calls.
-AMPI_CUSTOM_FUNC(cudaError_t, hapiMalloc, void**, size_t)
-AMPI_CUSTOM_FUNC(cudaError_t, hapiFree, void*)
-AMPI_CUSTOM_FUNC(cudaError_t, hapiMallocHost, void**, size_t)
-AMPI_CUSTOM_FUNC(cudaError_t, hapiFreeHost, void*)
-AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpyAsync, void*, const void*, size_t, enum cudaMemcpyKind, cudaStream_t)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiMalloc, void**, size_t)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiFree, void*)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiMallocHost, void**, size_t)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiFreeHost, void*)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpyAsync, void*, const void*, size_t, enum cudaMemcpyKind, cudaStream_t)
+// AMPI_CUSTOM_FUNC(cudaError_t, hapiMemcpy2DAsync, void*, size_t, const void*, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t)
+
+// Kernel launch wrapper
+AMPI_CUSTOM_FUNC(hapiError_t, hapiLaunchKernel, const void*, dim3, dim3, void**, size_t, hapiStream_t)
 
 // Explicit memory allocations using pinned memory pool.
-AMPI_CUSTOM_FUNC(cudaError_t, hapiPoolMalloc, void**, size_t)
-AMPI_CUSTOM_FUNC(cudaError_t, hapiPoolFree, void*)
+AMPI_CUSTOM_FUNC(hapiError_t, hapiPoolMalloc, void**, size_t)
+AMPI_CUSTOM_FUNC(hapiError_t, hapiPoolFree, void*)
 
 // Provides support for detecting errors with CUDA API calls.
-AMPI_CUSTOM_FUNC(void, hapiErrorDie, cudaError_t, const char*, const char*, int)
+AMPI_CUSTOM_FUNC(void, hapiErrorDie, hapiError_t, const char*, const char*, int)
+
+// Returns the GPU device index this PE is mapped to (set during hapiMapping).
+AMPI_CUSTOM_FUNC(uint64_t, hapiMyDevice, void)
 
 #ifdef HAPI_INSTRUMENT_WRS
 AMPI_CUSTOM_FUNC(void, hapiInitInstrument, int n_chares, char n_types)
diff --git a/src/arch/cuda/hybridAPI/hapi_impl.cpp b/src/arch/cuda/hybridAPI/hapi_impl.cpp
index b5b3bb5fb6..42e3ad4f6e 100644
--- a/src/arch/cuda/hybridAPI/hapi_impl.cpp
+++ b/src/arch/cuda/hybridAPI/hapi_impl.cpp
@@ -9,12 +9,16 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <sched.h>
 
-#define CUDA_API_PER_THREAD_DEFAULT_STREAM
-#include <cuda_runtime.h>
+#define CUDA_API_PER_THREAD_DEFAULT_STREAM /* must be defined before any CUDA runtime header is included */
 
+#include "hapi_portable.h"
 #include "converse.h"
-#include "conv-mach-opt.h" /* for CMK_CUDA */
+#include "conv-mach-opt.h" /* for CMK_CUDA, CMK_HIP */
+#include "ckrescale.h"
+#include "charm++.h"
+
 #include "hapi.h"
 #include "hapi_impl.h"
 #include "gpumanager.h"
@@ -22,6 +26,38 @@
 #include "hapi_nvtx.h"
 #endif
 
+#if CMK_LBDB_ON
+#if CMK_CUDA
+#include <cupti.h>
+#endif
+#include "LBManager.h"
+
+#if CMK_CUDA
+#define CUPTI_SAFE_CALL(call)                                              \
+  do {                                                                     \
+    CUptiResult _status = call;                                            \
+    if (_status != CUPTI_SUCCESS) {                                        \
+      const char *errstr;                                                  \
+      cuptiGetResultString(_status, &errstr);                              \
+      CmiPrintf("HAPI CUPTI error: %s at %s:%d\n", errstr, __FILE__, __LINE__); \
+    }                                                          \
+  } while (0)
+#endif
+#endif
+
+#define SERVER_FIFO_TEMPLATE "/tmp/server_pipe_%ld"
+#define CLIENT_FIFO_TEMPLATE "/tmp/client_pipe_%ld"
+#define BUFFER_SIZE 256
+#define STREAM_BUF_SIZE 1024
+
+#if defined HAPI_TRACE || defined HAPI_INSTRUMENT_WRS
+// extern "C" double CmiWallTimer();
+#endif
+
+extern int Cmi_isOldProcess;
+
+extern int CmiSetCPUAffinityLogical(int core);
+
 static void createPool(int *nbuffers, int n_slots, std::vector<BufferPool> &pools);
 static void releasePool(std::vector<BufferPool> &pools);
 
@@ -36,19 +72,27 @@ struct hapiCallbackMessage {
 
 #ifndef HAPI_CUDA_CALLBACK
 typedef struct hapiEvent {
-  cudaEvent_t event;
+  hapiEvent_t event;
   CkCallback cb;
   void* cb_msg;
   hapiWorkRequest* wr; // if this is not NULL, buffers and request itself are deallocated
+  CkMigratable* obj; // pointer to the object whose load we want to set
+  hapiEvent_t start_ev; // event to record the start time
 
-  hapiEvent(cudaEvent_t event_, const CkCallback& cb_, void* cb_msg_, hapiWorkRequest* wr_ = NULL)
-            : event(event_), cb(cb_), cb_msg(cb_msg_), wr(wr_) {}
+  hapiEvent(hapiEvent_t event_, const CkCallback& cb_, void* cb_msg_, hapiWorkRequest* wr_ = NULL, CkMigratable* obj_ = NULL, hapiEvent_t start_ev_ = NULL)
+            : event(event_), cb(cb_), cb_msg(cb_msg_), wr(wr_), obj(obj_), start_ev(start_ev_) {}
 } hapiEvent;
 
 CpvDeclare(std::queue<hapiEvent>, hapi_event_queue);
+CpvDeclare(std::queue<hapiEvent_t>, hapi_event_pool);
 #endif // HAPI_CUDA_CALLBACK
 CpvDeclare(int, n_hapi_events);
 
+int firstRankForDevice = 0; // First rank for each device, used for mapping
+
+// Managing memory state in server
+int hapiAllocId = 0; // Global allocation ID for HAPI
+
 // Used to invoke user's Charm++ callback function
 void (*hapiInvokeCallback)(void*, void*) = NULL;
 
@@ -63,8 +107,11 @@ void (*hapiQdProcess)(int) = NULL;
 CsvDeclare(GPUManager, gpu_manager);
 
 CpvDeclare(int, my_device); // GPU device that this thread is mapped to
+CpvDeclare(int, my_device_id); // index to the deviceManager that stores info about the device
 CpvDeclare(bool, device_rep); // Is this PE a device representative thread? (1 per device)
 
+void hapiSendMemoryRequest(char* msg, int size);
+
 // Returns the local rank of the logical node (process) that the given PE belongs to
 static inline int CmiNodeRankLocal(int pe) {
   // Logical node index % Number of logical nodes per physical node
@@ -77,14 +124,14 @@ static inline int CmiMyNodeRankLocal() {
 }
 
 // HAPI internal function declarations
-static void hapiInitCsv();
+static void hapiInitCsv(char** argv);
 static void hapiInitCpv();
 static void hapiExitCsv();
 
 static void hapiMapping(char** argv);
 static void hapiRegisterCallbacks();
 
-// CUDA IPC related functions
+// hapi IPC related functions
 static void shmInit();
 static void shmSetup();
 static void shmCreate();
@@ -95,6 +142,53 @@ static void shmCleanup();
 static void ipcHandleCreate();
 static void ipcHandleOpen();
 
+#if CMK_LBDB_ON // value test, matching the guard around <cupti.h> and CUPTI_SAFE_CALL above
+#if CMK_CUDA
+// CUPTI callback: hands CUPTI an empty buffer to fill with activity records
+static void CUPTIAPI cuptiBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
+  const size_t buf_bytes = 5 * 1024 * 1024;  // 5MB per buffer
+  *buffer = (uint8_t *)malloc(buf_bytes);
+  *size = (*buffer != NULL) ? buf_bytes : 0; // never advertise capacity that failed to allocate
+  *maxNumRecords = 0; // 0 lets CUPTI place as many records as fit
+}
+
+// CUPTI callback: queues a filled buffer for hapiProcessCuptiBuffers(). TODO: handle SMP mode
+static void CUPTIAPI cuptiBufferCompleted(CUcontext ctx, uint32_t streamId,
+                                          uint8_t *buffer, size_t size, size_t validSize) {
+  GPUManager& gm = CsvAccess(gpu_manager);
+  gm.cupti_buffer_queue_.push({buffer, validSize});
+}
+#endif
+
+// Initialize CUPTI activity tracing — called once per process
+void hapiCuptiInit() {
+#if CMK_CUDA
+  GPUManager& gm = CsvAccess(gpu_manager);
+  if (gm.cupti_initialized_) return; // already set up: skip the log line and device sync too
+  CmiPrintf("HAPI: Initializing CUPTI...\n");
+  hapiDeviceSynchronize();
+  CUPTI_SAFE_CALL(cuptiActivityRegisterCallbacks(cuptiBufferRequested, cuptiBufferCompleted));
+  CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
+  CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
+  CUPTI_SAFE_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));
+
+  gm.cupti_initialized_ = true;
+#endif
+}
+
+// Tear down CUPTI if hapiCuptiInit() enabled it; otherwise a no-op
+void hapiCuptiFinalize() {
+  GPUManager& gm = CsvAccess(gpu_manager);
+  if (!gm.cupti_initialized_) return;
+  CmiPrintf("HAPI: Finalizing CUPTI...\n");
+  hapiDeviceSynchronize(); // let outstanding GPU work finish before tearing CUPTI down
+  gm.cupti_initialized_ = false;
+#if CMK_CUDA
+  CUPTI_SAFE_CALL(cuptiFinalize());
+#endif
+}
+#endif
+
 #ifndef HAPI_CUDA_CALLBACK
 #if CSD_NO_SCHEDLOOP
 #  error please disable CSD_NO_SCHEDLOOP to use HAPI
@@ -105,7 +199,7 @@ static void ipcHandleOpen();
 void hapiInit(char** argv) {
   if (!CmiInCommThread()) {
     if (CmiMyRank() == 0) {
-      hapiInitCsv(); // Initialize per-process variables (GPUManager)
+      hapiInitCsv(argv); // Initialize per-process variables (GPUManager)
     }
     hapiInitCpv(); // Initialize per-PE variables
 
@@ -113,6 +207,13 @@ void hapiInit(char** argv) {
 
     hapiMapping(argv); // Perform PE-device mapping
 
+#if CMK_SHRINK_EXPAND
+    hapiStartMemoryDaemon(argv);
+#else
+    int& cpv_my_device = CpvAccess(my_device);
+    hapiCheck(hapiSetDevice(cpv_my_device));
+#endif
+
 #ifndef HAPI_CUDA_CALLBACK
     // Register polling function to be invoked at every scheduler loop
     CcdCallOnConditionKeep(CcdSCHEDLOOP, (CcdCondFn)hapiPollEvents, NULL);
@@ -123,7 +224,7 @@ void hapiInit(char** argv) {
 
   if (CmiInCommThread()) {
     // FIXME: Comm. thread sets its device to be the same as worker thread 0
-    cudaSetDevice(CsvAccess(gpu_manager).comm_thread_device);
+    hapiSetDevice(CsvAccess(gpu_manager).comm_thread_device);
   }
 
   shmInit();
@@ -131,10 +232,143 @@ void hapiInit(char** argv) {
   hapiRegisterCallbacks(); // Register callback functions
 }
 
+
+void hapiStartMemoryDaemon(char** argv)
+{
+#if CMK_SHRINK_EXPAND
+  // (Re)create this process' client FIFO; replies to memory requests are read from it
+  char client_fifo_path[BUFFER_SIZE];
+  snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, (long)getpid());
+  std::remove(client_fifo_path);
+  if (mkfifo(client_fifo_path, 0666) == -1) {
+    perror("Parent: mkfifo client FIFO");
+    CmiAbort("Failed to create client FIFO");
+  }
+
+  int& cpv_my_device = CpvAccess(my_device);
+  CkPrintf("Device = %i\n", cpv_my_device);
+  hapiCheck(hapiSetDevice(cpv_my_device));
+
+  // PEs whose physical rank is not firstRankForDevice skip the daemon handshake
+  if (CmiPhysicalRank(CmiMyPe()) != firstRankForDevice)
+  {
+    CmiBarrier();
+    return;
+  }
+
+  // Without +shrinkexpand, block until a byte arrives on the daemon's ready FIFO
+  if (!CmiGetArgFlagDesc(argv,"+shrinkexpand","Restarting of already running process")) {
+    char ready_fifo_path[BUFFER_SIZE];
+    snprintf(ready_fifo_path, sizeof(ready_fifo_path), "/tmp/daemon_ready_%d", cpv_my_device);
+
+    CmiPrintf("Parent: Waiting for daemon to be ready...\n");
+    int ready_fd = open(ready_fifo_path, O_RDONLY);
+    if (ready_fd == -1) {
+      perror("Parent: open ready FIFO");
+      CmiAbort("Failed to open ready FIFO");
+    }
+
+    char ready_signal;
+    if (read(ready_fd, &ready_signal, 1) != 1) {
+      CmiAbort("Failed to read daemon ready signal");
+    }
+    close(ready_fd);
+    unlink(ready_fifo_path);  // Clean up
+
+    CmiPrintf("Parent: Daemon is ready!\n");
+  }
+
+  CmiBarrier();
+#endif
+}
+
+// Sends a CKPT request carrying the IPC handle of devPtr and returns the allocation ID read back from the client FIFO
+int hapiCheckpoint(void* devPtr, int size) {
+  const long pid = (long)getpid(); // the FIFO/message templates use %ld; pid_t is not long
+  char client_fifo_path[BUFFER_SIZE];
+  snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, pid);
+
+  hapiIpcMemHandle_t ipc_handle;
+  hapiCheck(hapiIpcGetMemHandle(&ipc_handle, devPtr));
+
+  // Request = text header immediately followed by the raw IPC handle bytes
+  char msg_buf[BUFFER_SIZE];
+  int offset = snprintf(msg_buf, sizeof(msg_buf), "CKPT:%ld:%d:%d:", pid, CkMyPe(), size);
+  memcpy(msg_buf + offset, &ipc_handle, sizeof(hapiIpcMemHandle_t));
+  hapiSendMemoryRequest(msg_buf, offset + (int)sizeof(hapiIpcMemHandle_t));
+
+  // Block until the reply arrives; never hand back an uninitialized ID
+  int alloc_id = -1;
+  int client_fd = open(client_fifo_path, O_RDONLY);
+  if (client_fd == -1 || read(client_fd, &alloc_id, sizeof(int)) != (ssize_t)sizeof(int))
+    CmiAbort("hapiCheckpoint: failed to read allocation ID from memory daemon");
+  close(client_fd);
+  return alloc_id;
+}
+
+// Fetches the IPC handle for alloc_id, copies size bytes from it into devPtr, then sends a FREE request for it
+void hapiRestore(void* devPtr, int size, int alloc_id) {
+  const long pid = (long)getpid(); // the FIFO/message templates use %ld; pid_t is not long
+  char client_fifo_path[BUFFER_SIZE];
+  snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, pid);
+
+  // Ask for the IPC handle of the saved allocation
+  char msg_buf[BUFFER_SIZE];
+  snprintf(msg_buf, sizeof(msg_buf), "GET:%ld:%d", pid, alloc_id);
+  hapiSendMemoryRequest(msg_buf, strlen(msg_buf) + 1);
+
+  hapiIpcMemHandle_t ipc_handle;
+  int client_fd = open(client_fifo_path, O_RDONLY);
+  if (client_fd == -1 || read(client_fd, &ipc_handle, sizeof(ipc_handle)) != (ssize_t)sizeof(ipc_handle))
+    CmiAbort("hapiRestore: failed to read IPC handle from memory daemon");
+  close(client_fd);
+
+  void* srcPtr;
+  hapiCheck(hapiIpcOpenMemHandle(&srcPtr, ipc_handle, hapiIpcMemLazyEnablePeerAccess));
+  hapiCheck(hapiMemcpy(devPtr, srcPtr, size, hapiMemcpyDeviceToDevice));
+  hapiCheck(hapiIpcCloseMemHandle(srcPtr));
+
+  // Release the saved copy and wait for the one-byte reply (its value is not inspected)
+  snprintf(msg_buf, sizeof(msg_buf), "FREE:%ld:%d", pid, alloc_id);
+  hapiSendMemoryRequest(msg_buf, strlen(msg_buf) + 1);
+  client_fd = open(client_fifo_path, O_RDONLY);
+  char status;
+  read(client_fd, &status, sizeof(char));
+  close(client_fd);
+}
+
 void hapiExit() {
   // Ensure all PEs have finished GPU work
+  CmiPrintf("Exit called on PE %d\n", CmiMyPe());
   CmiNodeBarrier();
 
+#if CMK_SHRINK_EXPAND
+  const long my_pid = (long)getpid(); // the FIFO/message templates use %ld; pid_t is not long
+  char client_fifo_path[BUFFER_SIZE];
+  snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, my_pid);
+
+  // The PE whose physical rank is firstRankForDevice sends KILL and waits for a one-byte reply
+  if (!get_shrinkexpand_exit() && CmiPhysicalRank(CmiMyPe()) == firstRankForDevice) {
+    char msg_buf[BUFFER_SIZE];
+    snprintf(msg_buf, sizeof(msg_buf), "KILL:%ld:0", my_pid);
+    hapiSendMemoryRequest(msg_buf, strlen(msg_buf) + 1);
+    int client_fd = open(client_fifo_path, O_RDONLY);
+    char status;
+    read(client_fd, &status, sizeof(char));
+    close(client_fd);
+  }
+
+  if (!get_shrinkexpand_exit())
+  {
+    // Attempt to delete the file
+    if (std::remove(client_fifo_path) == 0) {
+        CmiPrintf("File '%s' deleted successfully.\n", client_fifo_path);
+    } else {
+        CmiPrintf("Error deleting file '%s': %s\n", client_fifo_path, strerror(errno));
+    }
+  }
+#endif
+
   if (CmiMyRank() == 0) {
     shmCleanup();
 
@@ -143,23 +377,130 @@ void hapiExit() {
 }
 
 // Initialize per-process variables
-static void hapiInitCsv() {
+static void hapiInitCsv(char** argv) {
   // Create and initialize GPU Manager object
   CsvInitialize(GPUManager, gpu_manager);
   CsvAccess(gpu_manager).init();
+  #if CMK_LBDB_ON
+    CmiPrintf("HAPI: seeing _lb_args.statsOn() = %d\n", _lb_args.statsOn());
+    if (LBHasBalancersRegistered() && _lb_args.statsOn())
+      hapiCuptiInit();
+  #endif
 }
 
+
+#if CMK_LBDB_ON // value test, consistent with the CMK_LBDB_ON guard around <cupti.h> above
+
+// Drains the queue of completed CUPTI activity buffers and accumulates the GPU
+// kernel time (in nanoseconds) recorded for each external ID into
+// gm.cupti_obj_gpu_times_. Totals are only ever added to here;
+// hapiClearCuptiData() is what resets them.
+// Compiles to an empty function unless CMK_CUDA is set.
+//
+// A kernel is attributed to an external ID by joining two record kinds on their
+// CUPTI correlation ID:
+//   - EXTERNAL_CORRELATION records carry (correlationId, externalId)
+//   - KERNEL / CONCURRENT_KERNEL records carry (correlationId, start, end)
+// The two records may arrive in either order, so gm.cupti_correlation_db_ parks
+// whichever is seen first: the externalId if the correlation record came first,
+// or the kernel duration if the kernel record came first. The entry is erased
+// as soon as its partner record shows up.
+//
+// NOTE(review): one map stores both kinds of value, so two records of the same
+// kind sharing a correlation ID would be misread — confirm CUPTI cannot emit
+// that for the activity kinds that are enabled.
+// TODO: safely handle SMP mode (the queue and maps are accessed without a lock)
+void hapiProcessCuptiBuffers() {
+#if CMK_CUDA
+  GPUManager& gm = CsvAccess(gpu_manager);
+
+  while (!gm.cupti_buffer_queue_.empty()) {
+    // Pop one completed buffer from the queue
+    CuptiBufferItem item = gm.cupti_buffer_queue_.front();
+    gm.cupti_buffer_queue_.pop();
+
+    // Walk every activity record stored in this buffer
+    CUpti_Activity *record = NULL;
+    while (cuptiActivityGetNextRecord(item.buffer, item.validSize, &record) == CUPTI_SUCCESS) {
+      if (record->kind == CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION) {
+        CUpti_ActivityExternalCorrelation *corr = (CUpti_ActivityExternalCorrelation *)record;
+
+        auto it = gm.cupti_correlation_db_.find(corr->correlationId);
+        if (it != gm.cupti_correlation_db_.end())
+        {
+          // Out-of-order case: the kernel record was seen first, so the parked
+          // value is that kernel's duration
+          uint64_t curr_kernel_time = it->second;
+          gm.cupti_obj_gpu_times_[corr->externalId] += curr_kernel_time;
+          gm.cupti_correlation_db_.erase(it); // Remove correlation ID after processing
+        }
+        else
+        {
+          // Park the external ID until the matching kernel record arrives
+          gm.cupti_correlation_db_[corr->correlationId] = corr->externalId;
+        }
+      }
+      else if (record->kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL ||
+               record->kind == CUPTI_ACTIVITY_KIND_KERNEL) {
+        CUpti_ActivityKernel4 *kernel = (CUpti_ActivityKernel4 *)record;
+        uint64_t duration_ns = kernel->end - kernel->start;
+
+        auto it = gm.cupti_correlation_db_.find(kernel->correlationId);
+        if (it != gm.cupti_correlation_db_.end())
+        {
+          // In-order case: the correlation record was seen first, so the parked
+          // value is the external ID to charge
+          uint64_t obj_id = it->second;
+          gm.cupti_obj_gpu_times_[obj_id] += duration_ns;
+          gm.cupti_correlation_db_.erase(it); // Remove correlation ID after processing
+        }
+        else
+        {
+          // Park the duration until the matching correlation record arrives
+          gm.cupti_correlation_db_[kernel->correlationId] = duration_ns;
+        }
+      }
+      // Records of any other kind fall through untouched
+    }
+
+    // Buffers are malloc()'d in cuptiBufferRequested(); this one is fully parsed,
+    // so release it
+    free(item.buffer);
+  }
+#endif
+}
+    
+    
+// Empties both CUPTI bookkeeping maps held by the GPUManager. TODO: safely handle SMP mode
+void hapiClearCuptiData() {
+  GPUManager& gm = CsvAccess(gpu_manager);
+
+  gm.cupti_obj_gpu_times_.clear(); // GPU time totals accumulated by hapiProcessCuptiBuffers()
+  gm.cupti_correlation_db_.clear(); // entries still waiting for their partner record
+}
+
+#endif
+
+
 // Initialize per-PE variables
 static void hapiInitCpv() {
   // HAPI event-related
 #ifndef HAPI_CUDA_CALLBACK
   CpvInitialize(std::queue<hapiEvent>, hapi_event_queue);
+  CpvInitialize(std::queue<hapiEvent_t>, hapi_event_pool);
+  // for(int i = 0; i < 8; i++) {
+  //   hapiEvent_t ev;
+  //   hapiEventCreateWithFlags(&ev, hapiEventDisableTiming);
+  //   CpvAccess(hapi_event_pool).push(ev);
+  // }
 #endif
   CpvInitialize(int, n_hapi_events);
   CpvAccess(n_hapi_events) = 0;
 
   // Device mapping
   CpvInitialize(int, my_device);
+  CpvInitialize(int, my_device_id);
+  CpvAccess(my_device_id) = 0;
   CpvAccess(my_device) = 0;
   CpvInitialize(bool, device_rep);
   CpvAccess(device_rep) = false;
@@ -176,15 +517,20 @@ static void hapiExitCsv() {
   if (csv_gpu_manager.mempool_initialized_) {
     releasePool(csv_gpu_manager.mempool_free_bufs_);
   }
+#ifndef HAPI_CUDA_CALLBACK
+  auto& hapi_event_pool_ = CpvAccess(hapi_event_pool);
+  while(!hapi_event_pool_.empty()) {
+    hapiEventDestroy(hapi_event_pool_.front());
+    hapi_event_pool_.pop();
+  }
+#endif
 }
 
 // Set up PE to GPU mapping, invoked from all PEs
 // TODO: Support custom mappings
 static void hapiMapping(char** argv) {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
-  Mapping map_type = Mapping::Block; // Default is block mapping
-  bool all_gpus = false; // If true, all GPUs are visible to all processes.
-                         // Otherwise, only a subset are visible (e.g. with jsrun)
+  Mapping map_type = Mapping::RoundRobin; // Default is round robin
   char* gpumap = NULL;
 
   // Process +gpumap
@@ -206,15 +552,6 @@ static void hapiMapping(char** argv) {
     }
   }
 
-  // Process +allgpus
-  if (CmiGetArgFlagDesc(argv, "+allgpus",
-        "all GPUs are visible to all processes")) {
-    all_gpus = true;
-    if (CmiMyPe() == 0) {
-      CmiPrintf("HAPI> All GPUs are visible to all processes\n");
-    }
-  }
-
   // No mapping specified, user assumes responsibility
   if (map_type == Mapping::None) {
     if (CmiMyPe() == 0) {
@@ -226,19 +563,19 @@ static void hapiMapping(char** argv) {
   CmiAssert(map_type != Mapping::None);
 
   if (CmiMyRank() == 0) {
+    printf("number of physical nodes is %d\n", CmiNumPhysicalNodes());
+    printf("number of nodes is %d\n", CmiNumNodes());
+    printf("my rank is %d\n", CmiMyRank());
     // Count number of GPU devices used by each process
     int visible_device_count;
-    hapiCheck(cudaGetDeviceCount(&visible_device_count));
+    hapiCheck(hapiGetDeviceCount(&visible_device_count));
     if (visible_device_count <= 0) {
       CmiAbort("Unable to perform PE-GPU mapping, no GPUs found!");
     }
 
     int& device_count = csv_gpu_manager.device_count;
-    if (all_gpus) {
-      device_count = visible_device_count / (CmiNumNodes() / CmiNumPhysicalNodes());
-    } else {
-      device_count = visible_device_count;
-    }
+    device_count = visible_device_count / (CmiNumNodes() / CmiNumPhysicalNodes());//?????
+    ckout<<"device count "<<device_count<<endl;
 
     // Handle the case where the number of GPUs per process are larger than
     // the number of PEs per process. This is needed because we currently don't
@@ -252,18 +589,36 @@ static void hapiMapping(char** argv) {
       device_count = CmiNodeSize(CmiMyNode());
     }
 
-    // Create a DeviceManager per GPU device
-    std::vector<DeviceManager>& device_managers = csv_gpu_manager.device_managers;
-    for (int i = 0; i < device_count; i++) {
-      device_managers.emplace_back(i, device_count * CmiMyNodeRankLocal() + i);
+    // We also need to handle the case where the number of GPUs are less than the 
+    // number of processes launched on a physical node. Thus multiple processes can
+    // share a GPU. In this case device_count would be 0, but instead, we will assign
+    // at least one gpu to each process
+    if(device_count == 0) {
+      device_count = 1;
     }
-
     // Count number of PEs per device
     csv_gpu_manager.pes_per_device = CmiNodeSize(CmiMyNode()) / device_count;
 
     // Count number of devices on a physical node
-    csv_gpu_manager.device_count_on_physical_node =
-      device_count * (CmiNumNodes() / CmiNumPhysicalNodes());
+    csv_gpu_manager.device_count_on_physical_node = visible_device_count;
+
+    // Create a DeviceManager per GPU device; presumably (local index, index among visible devices) — TODO confirm
+    std::vector<DeviceManager>& device_managers = csv_gpu_manager.device_managers;
+    const int procs_per_host = CmiNumNodes() / CmiNumPhysicalNodes();
+    if(map_type == Mapping::RoundRobin) {
+      // Consecutive processes take consecutive devices, wrapping around when oversubscribed
+      for (int i = 0; i < device_count; i++) {
+        device_managers.emplace_back(i, (device_count * CmiMyNodeRankLocal() + i) % visible_device_count);
+      }
+    } else if(map_type == Mapping::Block) {
+      // Add i after the division: dividing (rank*visible + i) mapped several i onto the same GPU
+      const int first_device = (CmiMyNodeRankLocal() * visible_device_count) / procs_per_host;
+      for (int i = 0; i < device_count; i++) {
+        device_managers.emplace_back(i, first_device + i);
+      }
+    } else {
+      CmiAbort("Unsupported mapping type!");
+    }
   }
 
   if (CmiMyPe() == 0) {
@@ -275,32 +630,35 @@ static void hapiMapping(char** argv) {
   CmiNodeBarrier();
 
   // Perform mapping and set device representative PE
-  int my_rank = all_gpus ? CmiPhysicalRank(CmiMyPe()) : CmiMyRank();
+  int my_rank = CmiMyRank();
   int& cpv_my_device = CpvAccess(my_device);
+  int& cpv_my_device_id = CpvAccess(my_device_id);
   bool& cpv_device_rep = CpvAccess(device_rep);
 
   switch (map_type) {
-    case Mapping::Block:
-      cpv_my_device = my_rank / csv_gpu_manager.pes_per_device;
-      if(cpv_my_device >= csv_gpu_manager.device_count)
-          cpv_my_device = csv_gpu_manager.device_count - 1;
-      if (my_rank % csv_gpu_manager.pes_per_device == 0) cpv_device_rep = true;
+    case Mapping::Block:{ // contiguous runs of PEs share a device; the first PE of each run is its representative
+      cpv_my_device_id   = (my_rank*csv_gpu_manager.device_count) / CmiNodeSize(CmiMyNode());
+      cpv_my_device      = csv_gpu_manager.device_managers[cpv_my_device_id].global_index;
+      if (my_rank == 0 || ((my_rank-1)*csv_gpu_manager.device_count) / CmiNodeSize(CmiMyNode()) != cpv_my_device_id) cpv_device_rep = true;
+      firstRankForDevice = cpv_my_device; // NOTE(review): stores a device index in a process-wide "rank" variable — confirm this is intended for block mapping
+    }
       break;
-    case Mapping::RoundRobin:
-      cpv_my_device = my_rank % csv_gpu_manager.device_count;
+    case Mapping::RoundRobin: {
+      cpv_my_device_id   = my_rank % csv_gpu_manager.device_count;
+      cpv_my_device      = csv_gpu_manager.device_managers[cpv_my_device_id].global_index;
       if (my_rank < csv_gpu_manager.device_count) cpv_device_rep = true;
+      firstRankForDevice = cpv_my_device;
+    }
       break;
-    default:
+    default:  
       CmiAbort("Unsupported mapping type!");
   }
-
-  // Set device and store PE-device mapping
-  hapiCheck(cudaSetDevice(cpv_my_device));
+  
+  hapiCheck(hapiSetDevice(cpv_my_device));
 #if CMK_SMP
   CmiLock(csv_gpu_manager.device_mapping_lock);
 #endif
-  csv_gpu_manager.device_map.emplace(CmiMyPe(),
-      &(csv_gpu_manager.device_managers[cpv_my_device]));
+  csv_gpu_manager.device_map.emplace(CmiMyPe(), &(csv_gpu_manager.device_managers[cpv_my_device_id]));
 #if CMK_SMP
   CmiUnlock(csv_gpu_manager.device_mapping_lock);
 #endif
@@ -320,14 +678,17 @@ static void hapiMapping(char** argv) {
   }
 
   if (CmiMyRank() == 0) {
-    if (use_shm) csv_gpu_manager.use_shm = true;
+    if (use_shm) {
+      csv_gpu_manager.use_shm = true;
+    }
+    // csv_gpu_manager.test_field = true;
   }
 
   CmiNodeBarrier();
 
   if (csv_gpu_manager.use_shm) {
     // Process device communication buffer parameters (in MB)
-    int input_comm_buffer_size;
+    int input_comm_buffer_size = 0;
     if (CmiGetArgIntDesc(argv, "+gpucommbuffer", &input_comm_buffer_size,
           "GPU communication buffer size (in MB)")) {
       if (CmiMyRank() == 0) {
@@ -338,10 +699,23 @@ static void hapiMapping(char** argv) {
       }
     }
 
+    // Process GPU load balancing buffer size (in MB)
+    int input_lb_buffer_size = 0;
+    if (CmiGetArgIntDesc(argv, "+gpulbbuffer", &input_lb_buffer_size,
+          "GPU load balancing buffer size (in MB)")) {
+      if (CmiMyRank() == 0) {
+        csv_gpu_manager.lb_buffer_size =  (size_t)input_lb_buffer_size * 1024 * 1024;
+      }
+    }
+
     if (CmiMyPe() == 0) {
       CmiPrintf("HAPI> GPU communication buffer size: %zu MB "
           "(rounded up to the nearest power of two)\n",
           csv_gpu_manager.comm_buffer_size / (1024 * 1024));
+
+      CmiPrintf("HAPI> GPU load balancing buffer size: %zu MB "
+          "\n",
+          csv_gpu_manager.lb_buffer_size / (1024 * 1024));
     }
 
     CmiNodeBarrier(); // Ensure device communication buffer size is set
@@ -353,27 +727,27 @@ static void hapiMapping(char** argv) {
 #if CMK_SMP
       CmiLock(dm->lock);
 #endif
-      dm->create_comm_buffer(csv_gpu_manager.comm_buffer_size);
+      dm->create_comm_buffer(csv_gpu_manager.comm_buffer_size + csv_gpu_manager.lb_buffer_size, csv_gpu_manager.comm_buffer_size);
 #if CMK_SMP
       CmiUnlock(dm->lock);
 #endif
     }
 
-    // Process custom size for CUDA IPC event pool
-    int input_cuda_ipc_event_pool_size;
-    if (!CmiGetArgIntDesc(argv, "+gpuipceventpool", &input_cuda_ipc_event_pool_size,
+    // Process custom size for hapi IPC event pool
+    int input_hapi_ipc_event_pool_size;
+    if (!CmiGetArgIntDesc(argv, "+gpuipceventpool", &input_hapi_ipc_event_pool_size,
           "GPU IPC event pool size per PE")) {
-      input_cuda_ipc_event_pool_size = 16;
+      input_hapi_ipc_event_pool_size = 16;
     }
 
     if (CmiMyRank() == 0) {
-      csv_gpu_manager.cuda_ipc_event_pool_size_pe = input_cuda_ipc_event_pool_size;
-      csv_gpu_manager.cuda_ipc_event_pool_size_total = input_cuda_ipc_event_pool_size * csv_gpu_manager.pes_per_device;
+      csv_gpu_manager.hapi_ipc_event_pool_size_pe = input_hapi_ipc_event_pool_size;
+      csv_gpu_manager.hapi_ipc_event_pool_size_total = input_hapi_ipc_event_pool_size * csv_gpu_manager.pes_per_device;
     }
 
     if (CmiMyPe() == 0) {
-      CmiPrintf("HAPI> CUDA IPC event pool size - %d per PE, %d per device\n",
-          csv_gpu_manager.cuda_ipc_event_pool_size_pe, csv_gpu_manager.cuda_ipc_event_pool_size_total);
+      CmiPrintf("HAPI> hapi IPC event pool size - %d per PE, %d per device\n",
+          csv_gpu_manager.hapi_ipc_event_pool_size_pe, csv_gpu_manager.hapi_ipc_event_pool_size_total);
     }
   }
 
@@ -396,9 +770,9 @@ static void hapiMapping(char** argv) {
         if (i != cpv_my_device) {
           int can_access_peer;
 
-          hapiCheck(cudaDeviceCanAccessPeer(&can_access_peer, cpv_my_device, i));
+          hapiCheck(hapiDeviceCanAccessPeer(&can_access_peer, cpv_my_device, i));
           if (can_access_peer) {
-            cudaDeviceEnablePeerAccess(i, 0);
+            hapiDeviceEnablePeerAccess(i, 0);
           }
         }
       }
@@ -411,13 +785,25 @@ static void hapiMapping(char** argv) {
 }
 
 #ifndef HAPI_CUDA_CALLBACK
-void recordEvent(cudaStream_t stream, const CkCallback& cb, void* cb_msg, hapiWorkRequest* wr = NULL) {
-  // create CUDA event and insert into stream
-  cudaEvent_t ev;
-  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
-  cudaEventRecord(ev, stream);
+void recordEvent(hapiStream_t stream, const CkCallback& cb, void* cb_msg, hapiWorkRequest* wr = NULL, CkMigratable* obj = NULL, hapiEvent_t start_ev = NULL) {
+  // if(obj!=NULL)
+  //   CmiAbort("non null without HAPI hapi CALLBACK");
+  // create hapi event / get hapi event from the pool and insert into stream
+  hapiEvent_t ev;
+  auto& hapi_event_pool_local = CpvAccess(hapi_event_pool);
+  if(hapi_event_pool_local.size() == 0) {
+  #if CMK_LBDB_ON
+    hapiEventCreateWithFlags(&ev, hapiEventDefault);
+  #else
+    hapiEventCreateWithFlags(&ev, hapiEventDisableTiming);
+  #endif
+  } else {
+    ev = hapi_event_pool_local.front();
+    hapi_event_pool_local.pop();
+  }
+  hapiEventRecord(ev, stream);
 
-  hapiEvent hev(ev, cb, cb_msg, wr);
+  hapiEvent hev(ev, cb, cb_msg, wr, obj, start_ev);
 
   // push event information in queue
   CpvAccess(hapi_event_queue).push(hev);
@@ -532,15 +918,15 @@ static void hapiRegisterCallbacks() {
 }
 
 #ifdef HAPI_CUDA_CALLBACK
-// Callback function invoked by the CUDA runtime certain parts of GPU work are
+// Callback function invoked by the GPU runtime when certain parts of GPU work are
 // complete. It sends a converse message to the original PE to free the relevant
 // device memory and invoke the user's callback. The reason for this method is
-// that a thread created by the CUDA runtime does not have access to any of the
+// that a thread created by the hapi runtime does not have access to any of the
 // CpvDeclare'd variables as it is not one of the threads created by the Charm++
 // runtime.
-static void CUDACallback(void *data) {
+static void hapiCallback(void *data) {
 #ifdef HAPI_NVTX_PROFILE
-  NVTXTracer nvtx_range("CUDACallback", NVTXColor::Silver);
+  NVTXTracer nvtx_range("hapiCallback", NVTXColor::Silver);
 #endif
 
   // send message to the original PE
@@ -558,7 +944,7 @@ enum CallbackStage {
 static void addCallback(hapiWorkRequest *wr, CallbackStage stage) {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
-  // create converse message to be delivered to this PE after CUDA callback
+  // create converse message to be delivered to this PE after hapi callback
   char *conv_msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + sizeof(int) +
                                   sizeof(hapiWorkRequest *)); // FIXME memory leak?
   *((int *)(conv_msg + CmiMsgHeaderSizeBytes)) = CmiMyRank();
@@ -581,8 +967,8 @@ static void addCallback(hapiWorkRequest *wr, CallbackStage stage) {
   }
   CmiSetHandler(conv_msg, handlerIdx);
 
-  // add callback into CUDA stream
-  hapiCheck(cudaLaunchHostFunc(wr->stream, CUDACallback, (void*)conv_msg));
+  // add callback into hapi stream
+  hapiCheck(hapiLaunchHostFunc(wr->stream, hapiCallback, (void*)conv_msg));
 }
 #endif // HAPI_CUDA_CALLBACK
 
@@ -674,8 +1060,8 @@ hapiWorkRequest::hapiWorkRequest() :
   chare_index = -1;
 #endif
 
-  // Use CUDA per-thread default stream
-  stream = cudaStreamPerThread;
+  // Use hapi per-thread default stream
+  stream = hapiStreamPerThread;
 
   // Charm++ callbacks are not set by default
   host_to_device_cb = CkCallback(CkCallback::ignore);
@@ -694,30 +1080,30 @@ static void shmInit() {
   if (!CsvAccess(gpu_manager).use_shm) return;
 
   if (CmiMyRank() == 0) {
-    shmSetup();
+    if (!CmiInCommThread()) shmSetup();
     if (CmiMyNodeRankLocal() == 0) {
-      shmCreate(); // Create a per-host shared memory region
+      if (!CmiInCommThread()) shmCreate(); // Create a per-host shared memory region
       CmiBarrier(); // FIXME: Only needs to be a host-wide barrier
     } else {
       CmiBarrier();
-      shmOpen(); // Open the shared memory region created by local logical node 0
+      if (!CmiInCommThread()) shmOpen(); // Open the shared memory region created by local logical node 0
     }
-    shmMap(); // Map the shared memory file into memory
+    if (!CmiInCommThread()) shmMap(); // Map the shared memory file into memory
   } else {
     CmiBarrier();
   }
 
-  CmiNodeBarrier(); // Ensure shared memory has been mapped into the logical node
+  if (!CmiInCommThread()) CmiNodeBarrier(); // Ensure shared memory has been mapped into the logical node
 
-  ipcHandleCreate(); // Create CUDA IPC handles
+  if (!CmiInCommThread()) ipcHandleCreate(); // Create hapi IPC handles
 
-  // Ensure CUDA IPC handles are available for all processes
+  // Ensure hapi IPC handles are available for all processes
   // Note: Causes a hang when this barrier is placed after CPU topology initialization
   // FIXME: This only needs to be a host-wide synchronization
   CmiBarrier();
 
   if (CmiMyRank() == 0) {
-    ipcHandleOpen(); // Open CUDA IPC handles for accessing other processes' device memory
+    if (!CmiInCommThread()) ipcHandleOpen(); // Open hapi IPC handles for accessing other processes' device memory
   }
 }
 
@@ -725,16 +1111,16 @@ static void shmSetup() {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
   // Set up shared memory file name
-  csv_gpu_manager.shm_name.assign("charm-cuda-host");
+  csv_gpu_manager.shm_name.assign("charm-hapi-host");
   int host_id = CmiPhysicalNodeID(CmiMyPe());
   csv_gpu_manager.shm_name.append(std::to_string(host_id));
   const char* shm_name = csv_gpu_manager.shm_name.c_str();
 
   // Calculate shared memory region size
-  csv_gpu_manager.shm_chunk_size = sizeof(cudaIpcMemHandle_t) +
-      sizeof(cuda_ipc_event_shared) * csv_gpu_manager.cuda_ipc_event_pool_size_total;
+  csv_gpu_manager.shm_chunk_size = sizeof(hapiIpcMemHandle_t) +
+      sizeof(hapi_ipc_event_shared) * csv_gpu_manager.hapi_ipc_event_pool_size_total;
   csv_gpu_manager.shm_size = csv_gpu_manager.shm_chunk_size *
-    csv_gpu_manager.device_count_on_physical_node;
+    csv_gpu_manager.device_count * ((CmiNumNodes() / CmiNumPhysicalNodes()));
 }
 
 // Create POSIX shared memory region accessible to all processes on the same host
@@ -817,12 +1203,12 @@ static void shmMap() {
 
   // Store pointer to my process' portion of the shared memory region
   csv_gpu_manager.shm_my_ptr = (void*)((char*)csv_gpu_manager.shm_ptr +
-      csv_gpu_manager.shm_chunk_size * csv_gpu_manager.device_count *
-      CmiMyNodeRankLocal());
+      csv_gpu_manager.shm_chunk_size * (csv_gpu_manager.device_count *
+      CmiMyNodeRankLocal()));
 
   // Allocate memory for local storage
-  for (int i = 0; i < csv_gpu_manager.device_count_on_physical_node; i++) {
-    csv_gpu_manager.cuda_ipc_device_infos.emplace_back();
+  for (int i = 0; i < csv_gpu_manager.device_count * ((CmiNumNodes() / CmiNumPhysicalNodes())); i++) {
+    csv_gpu_manager.hapi_ipc_device_infos.emplace_back();
   }
 }
 
@@ -851,7 +1237,7 @@ static void shmCleanup() {
   }
 }
 
-// Create CUDA IPC handles and populate shared memory region
+// Create hapi IPC handles and populate shared memory region
 // Invoked by all PEs
 static void ipcHandleCreate() {
   // Only device reps should continue to perform the following operations
@@ -859,36 +1245,44 @@ static void ipcHandleCreate() {
   if (!CpvAccess(device_rep)) return;
 
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
-  int& cpv_my_device = CpvAccess(my_device);
+  int& cpv_my_device_id = CpvAccess(my_device_id);
 
-  // Create CUDA IPC memory handle in shared memory
-  DeviceManager& my_dm = csv_gpu_manager.device_managers[cpv_my_device];
+  // Create hapi IPC memory handle in shared memory
+  auto it = csv_gpu_manager.device_map.find(CmiMyPe());
+  if (it == csv_gpu_manager.device_map.end()) {
+    CmiAbort("PE not found in device_map during ipcHandleCreate");
+  }
+  DeviceManager& my_dm = *(it->second);
   auto comm_buffer = my_dm.get_comm_buffer();
   CmiAssert(comm_buffer);
-  cudaIpcMemHandle_t* shm_mem_handle = (cudaIpcMemHandle_t*)((char*)csv_gpu_manager.shm_my_ptr +
-      csv_gpu_manager.shm_chunk_size * cpv_my_device);
+
+  // Use local device index (0 to device_count-1) for shm_mem_handle offset
+  // int local_device_idx = my_dm.local_index;
+  hapiIpcMemHandle_t* shm_mem_handle = (hapiIpcMemHandle_t*)((char*)csv_gpu_manager.shm_my_ptr +
+      csv_gpu_manager.shm_chunk_size * cpv_my_device_id);
+
   void* device_ptr = comm_buffer->base_ptr;
-  hapiCheck(cudaIpcGetMemHandle(shm_mem_handle, device_ptr));
+  hapiCheck(hapiIpcGetMemHandle(shm_mem_handle, device_ptr));
 
-  // Create CUDA IPC events and store them locally (in cuda_ipc_device_info),
+  // Create hapi IPC events and store them locally (in hapi_ipc_device_info),
   // and create corresponding IPC handles in shared memory
-  cuda_ipc_device_info& my_device_info = csv_gpu_manager.cuda_ipc_device_infos[my_dm.global_index];
-  cuda_ipc_event_shared* shm_event_shared = (cuda_ipc_event_shared*)((char*)shm_mem_handle + sizeof(cudaIpcMemHandle_t));
+  hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id];
+  hapi_ipc_event_shared* shm_event_shared = (hapi_ipc_event_shared*)((char*)shm_mem_handle + sizeof(hapiIpcMemHandle_t));
 
-  for (int i = 0; i < csv_gpu_manager.cuda_ipc_event_pool_size_total; i++) {
-    cuda_ipc_event_shared* cur_shm_event_shared = shm_event_shared + i;
+  for (int i = 0; i < csv_gpu_manager.hapi_ipc_event_pool_size_total; i++) {
+    hapi_ipc_event_shared* cur_shm_event_shared = shm_event_shared + i;
 
     my_device_info.event_pool_flags.push_back(0);
     my_device_info.event_pool_buff_offsets.push_back(0);
     my_device_info.src_event_pool.emplace_back();
     my_device_info.dst_event_pool.emplace_back();
-    hapiCheck(cudaEventCreateWithFlags(&my_device_info.src_event_pool[i],
-          cudaEventDisableTiming | cudaEventInterprocess));
-    hapiCheck(cudaEventCreateWithFlags(&my_device_info.dst_event_pool[i],
-          cudaEventDisableTiming | cudaEventInterprocess));
-    hapiCheck(cudaIpcGetEventHandle(&cur_shm_event_shared->src_event_handle,
+    hapiCheck(hapiEventCreateWithFlags(&my_device_info.src_event_pool[i],
+          hapiEventDisableTiming | hapiEventInterprocess));
+    hapiCheck(hapiEventCreateWithFlags(&my_device_info.dst_event_pool[i],
+          hapiEventDisableTiming | hapiEventInterprocess));
+    hapiCheck(hapiIpcGetEventHandle(&cur_shm_event_shared->src_event_handle,
           my_device_info.src_event_pool[i]));
-    hapiCheck(cudaIpcGetEventHandle(&cur_shm_event_shared->dst_event_handle,
+    hapiCheck(hapiIpcGetEventHandle(&cur_shm_event_shared->dst_event_handle,
           my_device_info.dst_event_pool[i]));
   }
 
@@ -896,7 +1290,7 @@ static void ipcHandleCreate() {
   my_device_info.buffer = device_ptr;
 }
 
-// Open CUDA IPC handles created by other processes
+// Open hapi IPC handles created by other processes
 // Invoked by PE rank 0 of each process
 static void ipcHandleOpen() {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
@@ -908,30 +1302,30 @@ static void ipcHandleOpen() {
     // Loop through GPU devices per process
     for (int j = 0; j < csv_gpu_manager.device_count; j++) {
       int device_index = csv_gpu_manager.device_count * i + j;
-      cuda_ipc_device_info& cur_device_info = csv_gpu_manager.cuda_ipc_device_infos[device_index];
+      hapi_ipc_device_info& cur_device_info = csv_gpu_manager.hapi_ipc_device_infos[device_index];
 
       // Open memory handle
-      cudaIpcMemHandle_t* shm_mem_handle =
-        (cudaIpcMemHandle_t*)((char*)csv_gpu_manager.shm_ptr
+      hapiIpcMemHandle_t* shm_mem_handle =
+        (hapiIpcMemHandle_t*)((char*)csv_gpu_manager.shm_ptr
             + csv_gpu_manager.shm_chunk_size * device_index);
-      hapiCheck(cudaIpcOpenMemHandle(&cur_device_info.buffer, *shm_mem_handle,
-            cudaIpcMemLazyEnablePeerAccess));
+      hapiCheck(hapiIpcOpenMemHandle(&cur_device_info.buffer, *shm_mem_handle,
+            hapiIpcMemLazyEnablePeerAccess));
 
       // Open event handles
-      cuda_ipc_event_shared* shm_event_shared =
-        (cuda_ipc_event_shared*)((char*)shm_mem_handle + sizeof(cudaIpcMemHandle_t));
+      hapi_ipc_event_shared* shm_event_shared =
+        (hapi_ipc_event_shared*)((char*)shm_mem_handle + sizeof(hapiIpcMemHandle_t));
 
       cur_device_info.event_pool_flags.clear();
       cur_device_info.event_pool_buff_offsets.clear();
 
-      for (int k = 0; k < csv_gpu_manager.cuda_ipc_event_pool_size_total; k++) {
-        cuda_ipc_event_shared* cur_shm_event_shared = shm_event_shared + k;
+      for (int k = 0; k < csv_gpu_manager.hapi_ipc_event_pool_size_total; k++) {
+        hapi_ipc_event_shared* cur_shm_event_shared = shm_event_shared + k;
 
         cur_device_info.src_event_pool.emplace_back();
         cur_device_info.dst_event_pool.emplace_back();
-        hapiCheck(cudaIpcOpenEventHandle(&cur_device_info.src_event_pool[k],
+        hapiCheck(hapiIpcOpenEventHandle(&cur_device_info.src_event_pool[k],
               cur_shm_event_shared->src_event_handle));
-        hapiCheck(cudaIpcOpenEventHandle(&cur_device_info.dst_event_pool[k],
+        hapiCheck(hapiIpcOpenEventHandle(&cur_device_info.dst_event_pool[k],
               cur_shm_event_shared->dst_event_handle));
       }
     }
@@ -946,7 +1340,7 @@ static inline void gpuEventStart(hapiWorkRequest* wr, int* index,
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
   gpuEventTimer* shared_gpu_events_ = csv_gpu_manager.gpu_events_;
   int shared_time_idx_ = csv_gpu_manager.time_idx_++;
-  shared_gpu_events_[shared_time_idx_].cmi_start_time = CmiWallTimer();
+  // shared_gpu_events_[shared_time_idx_].cmi_start_time = CmiWallTimer();
   shared_gpu_events_[shared_time_idx_].event_type = event;
   shared_gpu_events_[shared_time_idx_].trace_name = wr->trace_name;
   *index = shared_time_idx_;
@@ -963,7 +1357,7 @@ static inline void gpuEventStart(hapiWorkRequest* wr, int* index,
 static inline void gpuEventEnd(int index) {
 #ifdef HAPI_TRACE
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
-  csv_gpu_manager.gpu_events_[index].cmi_end_time = CmiWallTimer();
+  // csv_gpu_manager.gpu_events_[index].cmi_end_time = CmiWallTimer();
   traceUserBracketEvent(csv_gpu_manager.gpu_events_[index].stage,
                         csv_gpu_manager.gpu_events_[index].cmi_start_time,
                         csv_gpu_manager.gpu_events_[index].cmi_end_time);
@@ -978,7 +1372,7 @@ static inline void gpuEventEnd(int index) {
 
 static inline void hapiWorkRequestStartTime(hapiWorkRequest* wr) {
 #ifdef HAPI_INSTRUMENT_WRS
-  wr->phase_start_time = CmiWallTimer();
+  // wr->phase_start_time = CmiWallTimer();
 #endif
 }
 
@@ -992,7 +1386,7 @@ static inline void profileWorkRequestEvent(hapiWorkRequest* wr,
 #endif
 
   if (csv_gpu_manager.init_instr_) {
-    double tt = CmiWallTimer() - (wr->phase_start_time);
+    // double tt = CmiWallTimer() - (wr->phase_start_time);
     int index = wr->chare_index;
     char type = wr->comp_type;
     char phase = wr->comp_phase;
@@ -1042,9 +1436,9 @@ static void createPool(int *n_buffers, int n_slots, std::vector<BufferPool> &poo
   }
 
   int device;
-  cudaDeviceProp device_prop;
-  hapiCheck(cudaGetDevice(&device));
-  hapiCheck(cudaGetDeviceProperties(&device_prop, device));
+  hapiDeviceProp device_prop;
+  hapiCheck(hapiGetDevice(&device));
+  hapiCheck(hapiGetDeviceProperties(&device_prop, device));
 
   // divide by # of PEs on physical node and multiply by # of PEs in logical node
   size_t available_memory = device_prop.totalGlobalMem /
@@ -1078,7 +1472,7 @@ static void createPool(int *n_buffers, int n_slots, std::vector<BufferPool> &poo
 
     // pin host memory in a contiguous block for a slot
     void* pinned_chunk;
-    hapiCheck(cudaMallocHost(&pinned_chunk, buf_size * num_buffers));
+    hapiCheck(hapiMallocHost(&pinned_chunk, buf_size * num_buffers));
 
     // initialize header structs
     for (int j = num_buffers - 1; j >= 0; j--) {
@@ -1099,11 +1493,11 @@ static void createPool(int *n_buffers, int n_slots, std::vector<BufferPool> &poo
 
 static void releasePool(std::vector<BufferPool> &pools){
   int device;
-  hapiCheck(cudaGetDevice(&device));
+  hapiCheck(hapiGetDevice(&device));
   for (int i = 0; i < pools.size(); i++) {
     void* chunk = pools[i].chunk;
     if (chunk != NULL) {
-      hapiCheck(cudaFreeHost(chunk));
+      hapiCheck(hapiFreeHost(chunk));
     }
   }
   pools.clear();
@@ -1120,7 +1514,7 @@ static int findPool(size_t size){
     csv_gpu_manager.mempool_boundaries_.push_back(size);
 
     BufferPool newpool;
-    hapiCheck(cudaMallocHost((void**)&newpool.head, size + sizeof(BufferPoolHeader)));
+    hapiCheck(hapiMallocHost((void**)&newpool.head, size + sizeof(BufferPoolHeader)));
     if (newpool.head == NULL) {
       CmiPrintf("[HAPI (%d)] findPool: failed to allocate newpool %d head, size %zu\n",
              CmiMyPe(), boundary_array_len, size);
@@ -1163,7 +1557,7 @@ static void* getBufferFromPool(int pool, size_t size){
   }
   else if (csv_gpu_manager.mempool_free_bufs_[pool].head == NULL) {
     BufferPoolHeader* hd;
-    hapiCheck(cudaMallocHost((void**)&hd, sizeof(BufferPoolHeader) +
+    hapiCheck(hapiMallocHost((void**)&hd, sizeof(BufferPoolHeader) +
                              csv_gpu_manager.mempool_free_bufs_[pool].size));
 #ifdef HAPI_MEMPOOL_DEBUG
     CmiPrintf("[HAPI (%d)] getBufferFromPool, pool: %d, size: %zu expand by 1\n",
@@ -1196,7 +1590,7 @@ static void returnBufferToPool(int pool, BufferPoolHeader* hd) {
 #endif
 }
 
-cudaError_t hapiPoolMalloc(void** ptr, size_t size) {
+hapiError_t hapiPoolMalloc(void** ptr, size_t size) {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
 #if CMK_SMP
@@ -1242,7 +1636,7 @@ cudaError_t hapiPoolMalloc(void** ptr, size_t size) {
     CmiUnlock(csv_gpu_manager.mempool_lock_);
 #endif
 
-    return cudaErrorMemoryAllocation;
+    return hapiErrorMemoryAllocation;
   }
   *ptr = getBufferFromPool(pool, size);
 
@@ -1255,15 +1649,15 @@ cudaError_t hapiPoolMalloc(void** ptr, size_t size) {
   CmiUnlock(csv_gpu_manager.mempool_lock_);
 #endif
 
-  return cudaSuccess;
+  return hapiSuccess;
 }
 
-cudaError_t hapiPoolFree(void* ptr) {
+hapiError_t hapiPoolFree(void* ptr) {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
   // Check if mempool was initialized
   if (!csv_gpu_manager.mempool_initialized_)
-    return cudaErrorInitializationError;
+    return hapiErrorInitializationError;
 
   BufferPoolHeader* hd = ((BufferPoolHeader*)ptr) - 1;
   int pool = hd->slot;
@@ -1288,7 +1682,7 @@ cudaError_t hapiPoolFree(void* ptr) {
          csv_gpu_manager.mempool_free_bufs_[pool].num);
 #endif
 
-  return cudaSuccess;
+  return hapiSuccess;
 }
 
 #ifdef HAPI_INSTRUMENT_WRS
@@ -1364,9 +1758,20 @@ void hapiPollEvents(void* param) {
   std::queue<hapiEvent>& queue = CpvAccess(hapi_event_queue);
   while (!queue.empty()) {
     hapiEvent hev = queue.front();
-    if (cudaEventQuery(hev.event) == cudaSuccess) {
+    if (hapiEventQuery(hev.event) == hapiSuccess) {
       queue.pop(); // TODO: investigate possible race condition with charm4py futures - temporarily resolved by popping here
 
+#if CMK_LBDB_ON
+      if (hev.obj) {
+        // CmiPrintf("should not be printed w/o hapi hapi callback \n");
+        float gpu_time;
+        hapiEventElapsedTime(&gpu_time, hev.start_ev, hev.event);
+        // hapiEventElapsedTime returns ms, convert to seconds to match wallTime units
+        double gpu_time_s = gpu_time / 1000.0;
+        hev.obj->setObjGPUTime(gpu_time_s + hev.obj->getObjGPUTime());
+        hapiEventDestroy(hev.start_ev);
+      } else 
+#endif        
       // invoke Charm++ callback if one was given
       hev.cb.send(hev.cb_msg);
 
@@ -1374,7 +1779,7 @@ void hapiPollEvents(void* param) {
       if (hev.wr) {
         hapiWorkRequestCleanup(hev.wr);
       }
-      cudaEventDestroy(hev.event);
+      CpvAccess(hapi_event_pool).push(hev.event);
       CpvAccess(n_hapi_events)--;
 
       // inform QD that an event was processed
@@ -1405,14 +1810,14 @@ int hapiCreateStreams() {
   return ret;
 }
 
-cudaStream_t hapiGetStream() {
+hapiStream_t hapiGetStream() {
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
 #if CMK_SMP
   CmiLock(csv_gpu_manager.stream_lock_);
 #endif
 
-  cudaStream_t ret = csv_gpu_manager.getNextStream();
+  hapiStream_t ret = csv_gpu_manager.getNextStream();
 
 #if CMK_SMP
   CmiUnlock(csv_gpu_manager.stream_lock_);
@@ -1420,11 +1825,67 @@ cudaStream_t hapiGetStream() {
 
   return ret;
 }
+#if CMK_LBDB_ON
+// Lightweight HAPI, to be invoked after data transfer or kernel execution.
+void hapiRecordTime(hapiStream_t stream, hapiEvent_t start) {
+  // Only migratable objects are tracked; dynamic_cast of a null pointer yields null
+  CkMigratable* mig = dynamic_cast<CkMigratable*>(CkActiveObj());
+  if (!mig) {
+    // No event is queued on this path, so nothing else would release `start`
+    hapiEventDestroy(start);
+    return;
+  }
+#ifdef HAPI_CUDA_CALLBACK
+  #error hapi record time with HAPI_CUDA_CALLBACK not supported
+#endif
+  // record hapi event, handing over the object and the start event
+  recordEvent(stream, CkCallback(), NULL, NULL, mig, start);
+  // keep quiescence detection from firing while the recorded event is outstanding
+  CmiAssert(hapiQdCreate);
+  hapiQdCreate(1);
+}
+#endif
+
+uint64_t hapiCuptiPushObjCorrelation() {
+  // printf("seeing CsvAccess(gpu_manager).cupti_initialized_ as %d\n", CsvAccess(gpu_manager).cupti_initialized_);
+  if (!CsvAccess(gpu_manager).cupti_initialized_) return 0;
+
+  // Get the active Charm++ object
+  Chare* chare = CkActiveObj();
+  if (!chare)
+    CmiAbort("hapiCuptiPushObjCorrelation call without active object is not possible");
+
+  CkMigratable* mig = dynamic_cast<CkMigratable*>(chare);
+  // printf("mig %p\n", mig);
+  if (!mig) return 0;
+
+  // Use the raw element ID as the external correlation ID
+  // CmiUInt8 is a 64-bit unique object identifier
+  uint64_t obj_id = (uint64_t)mig->ckGetID();
+#if CMK_CUDA
+  CUPTI_SAFE_CALL(cuptiActivityPushExternalCorrelationId(
+      CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, obj_id));
+#endif
+  // printf("pushed corr id\n");
+
+  return obj_id;
+}
+
+void hapiCuptiPopObjCorrelation() {
+  if (!CsvAccess(gpu_manager).cupti_initialized_) return;
+
+  // printf("popped corr id\n");
+  uint64_t tag;
+#if CMK_CUDA
+  CUPTI_SAFE_CALL(cuptiActivityPopExternalCorrelationId(
+      CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &tag));
+#endif
+}
 
 // Lightweight HAPI, to be invoked after data transfer or kernel execution.
-void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) {
+void hapiAddCallback(hapiStream_t stream, const CkCallback& cb, void* cb_msg) {
 #ifndef HAPI_CUDA_CALLBACK
-  // record CUDA event
+  // record hapi event
   recordEvent(stream, cb, cb_msg);
 #else
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
@@ -1435,15 +1896,15 @@ void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) {
 #endif
 */
 
-  // create converse message to be delivered to this PE after CUDA callback
+  // create converse message to be delivered to this PE after hapi callback
   hapiCallbackMessage* conv_msg = (hapiCallbackMessage*)CmiAlloc(sizeof(hapiCallbackMessage)); // FIXME memory leak?
   conv_msg->rank = CmiMyRank();
   conv_msg->cb = cb;
   conv_msg->cb_msg = cb_msg;
   CmiSetHandler(conv_msg, csv_gpu_manager.light_cb_idx_);
 
-  // push into CUDA stream
-  hapiCheck(cudaLaunchHostFunc(stream, CUDACallback, (void*)conv_msg));
+  // push into hapi stream
+  hapiCheck(hapiLaunchHostFunc(stream, hapiCallback, (void*)conv_msg));
 
   /*
 #if CMK_SMP
@@ -1458,33 +1919,77 @@ void hapiAddCallback(cudaStream_t stream, const CkCallback& cb, void* cb_msg) {
   hapiQdCreate(1);
 }
 
-void hapiAddCallback(cudaStream_t stream, void* cb, void* cb_msg) {
+void hapiAddCallback(hapiStream_t stream, void* cb, void* cb_msg) {
   hapiAddCallback(stream, *(CkCallback*)cb, cb_msg);
 }
 
-cudaError_t hapiMalloc(void** devPtr, size_t size) {
-  return cudaMalloc(devPtr, size);
-}
+void hapiSendMemoryRequest(char* msg, int size)
+{
+    int cpv_my_device = CpvAccess(my_device);
+    // One server FIFO per device index; the request is written to it as raw bytes
+    char server_fifo[BUFFER_SIZE];
+    snprintf(server_fifo, sizeof(server_fifo), SERVER_FIFO_TEMPLATE, cpv_my_device);
+    CmiPrintf("Sending request to %s\n", server_fifo);
+    // O_NONBLOCK: open() fails right away instead of waiting when no reader has the FIFO open
+    int server_fd = open(server_fifo, O_WRONLY | O_NONBLOCK);
+    if (server_fd == -1) {
+        perror("open server FIFO for writing");
+        return;
+    }
 
-cudaError_t hapiFree(void* devPtr) {
-  return cudaFree(devPtr);
+    ssize_t written = write(server_fd, msg, size);
+    if (written == -1) {
+        perror("write to server FIFO");
+    } else if (written != size) {
+        CmiPrintf("Short write to server FIFO: %zd of %d bytes\n", written, size);
+    }
+
+    close(server_fd);
 }
 
-cudaError_t hapiMallocHost(void** ptr, size_t size) {
-  return cudaMallocHost(ptr, size);
-}
 
-cudaError_t hapiFreeHost(void* ptr) {
-  return cudaFreeHost(ptr);
+// hapiError_t hapiMemcpyAsync(void* dst, const void* src, size_t count, hapiMemcpyKind kind, hapiStream_t stream = 0) {
+//   hapiError_t err;
+// #if CMK_LBDB_ON
+//   hapiEvent_t start;
+
+//   hapiEventCreate(&start);
+//   hapiEventRecord(start, stream);
+// #endif
+
+//   err = hapiMemcpyAsync(dst, src, count, kind, stream);
+// #if CMK_LBDB_ON
+//   hapiRecordTime(stream, start);  
+// #endif
+//   return err;
+// }
+
+// hapiError_t hapiMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hapiMemcpyKind kind, hapiStream_t stream = 0) {
+//   hapiError_t err;
+// #if CMK_LBDB_ON
+//   hapiEvent_t start;
+
+//   hapiEventCreate(&start);
+//   hapiEventRecord(start, stream);
+// #endif
+//   err = hapiMemcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+// #if CMK_LBDB_ON
+//   hapiRecordTime(stream, start);
+// #endif
+//   return err;
+// }
+
+
+void hapiErrorDie(hapiError_t retCode, const char* code, const char* file, int line) {
+  if (retCode != hapiSuccess) {
+    fprintf(stderr, "Fatal hapi Error [%d] %s at %s:%d\n", retCode, hapiGetErrorString(retCode), file, line);
+    CmiAbort("Exit due to hapi error");
+  }
 }
 
-cudaError_t hapiMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0) {
-  return cudaMemcpyAsync(dst, src, count, kind, stream);
+uint64_t hapiMyDevice() {  // packs (physical node ID << 32) | local device index
+  int physical_node_id = CmiPhysicalNodeID(CmiMyPe());
+  int my_device = CpvAccess(my_device);
+  return (static_cast<uint64_t>(physical_node_id) << 32) | static_cast<uint32_t>(my_device);  // 32-bit mask: a negative index must not sign-extend into the node bits
 }
 
-void hapiErrorDie(cudaError_t retCode, const char* code, const char* file, int line) {
-  if (retCode != cudaSuccess) {
-    fprintf(stderr, "Fatal CUDA Error [%d] %s at %s:%d\n", retCode, cudaGetErrorString(retCode), file, line);
-    CmiAbort("Exit due to CUDA error");
-  }
-}
diff --git a/src/arch/cuda/hybridAPI/hapi_impl.h b/src/arch/cuda/hybridAPI/hapi_impl.h
index 63d8074f17..42d057dec7 100644
--- a/src/arch/cuda/hybridAPI/hapi_impl.h
+++ b/src/arch/cuda/hybridAPI/hapi_impl.h
@@ -13,8 +13,12 @@ extern "C" {
 // Scale the amount of memory each node pins.
 #define HAPI_MEMPOOL_SCALE 1.0
 
+
 // HAPI init & exit functions
 void hapiInit(char** argv);
+void hapiStartMemoryDaemon(char** argv);
+int hapiCheckpoint(void* devPtr, int size);
+void hapiRestore(void* devPtr, int size, int alloc_id);
 void hapiExit();
 
 // Polls for GPU work completion. Does not do anything if HAPI_CUDA_CALLBACK is defined.
diff --git a/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp b/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp
new file mode 100644
index 0000000000..9e751e7162
--- /dev/null
+++ b/src/arch/cuda/hybridAPI/hapi_memory_daemon.cpp
@@ -0,0 +1,279 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cmath>
+#include <algorithm>
+#include <queue>
+#include <atomic>
+#include <vector>
+#include <unordered_map>
+#include <cstring>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h>
+
+#include "hapi_portable.h"
+
+#define HAPI_CHECK(call) do { \
+  hapiError_t err = call; \
+  if (err != hapiSuccess) { \
+    fprintf(stderr, "HAPI> hapi call failed at %s:%d: %s\n", __FILE__, __LINE__, hapiGetErrorString(err)); \
+  } \
+} while(0)
+
+#define SERVER_FIFO_TEMPLATE "/tmp/server_pipe_%d"   // formatted with the int device index
+#define CLIENT_FIFO_TEMPLATE "/tmp/client_pipe_%ld"  // formatted with the long client pid
+#define BUFFER_SIZE 256
+#define STREAM_BUF_SIZE 1024
+
+// Managing memory state in server
+std::unordered_map<int, std::pair<void*, size_t>> hapiMemoryMap;
+int allocId = 0;
+
+void hapiProcessMemoryRequest(int server_fd, int my_device, char* buf)
+{
+  // Requests look like "<COMMAND>:<client pid>:..."; replies go to the client's FIFO.
+  char command[BUFFER_SIZE] = "";  // pre-cleared so an empty command matches nothing
+  sscanf(buf, "%255[^:]:", command);  // field width 255 == BUFFER_SIZE - 1
+  char* pid_str = strchr(buf, ':');
+  if (!pid_str) return;
+  long client_pid = atol(pid_str + 1);
+  printf("HAPI> Processing memory request: %s from client %ld\n", command, client_pid);
+  char client_fifo_path[BUFFER_SIZE];
+  snprintf(client_fifo_path, sizeof(client_fifo_path), CLIENT_FIFO_TEMPLATE, client_pid);
+  int client_fd = open(client_fifo_path, O_WRONLY);
+  if (client_fd == -1) perror("DAEMON: open client FIFO for writing");
+
+  if (strcmp(command, "CKPT") == 0)
+  {
+    // Layout: "CKPT:<pid>:<pe>:<size>:" followed by the raw IPC handle bytes
+    int client_pe = 0, size = -1;
+    int parsed = sscanf(buf, "CKPT:%ld:%d:%d:", &client_pid, &client_pe, &size);
+    char* handle_start = buf;  // advanced to just past the 4th colon
+    for (int i = 0; i < 4 && handle_start; ++i) {
+      handle_start = strchr(handle_start, ':');
+      if (handle_start) handle_start++; // Move past the found colon
+    }
+    if (parsed != 3 || size < 0 || !handle_start) {
+      printf("DAEMON: Error parsing CKPT message, expected CKPT:<pid>:<pe>:<size>:<handle>.\n");
+      close(client_fd);
+      return;
+    }
+    hapiIpcMemHandle_t ipc_handle;
+    memcpy(&ipc_handle, handle_start, sizeof(hapiIpcMemHandle_t));
+
+    // Map the client buffer, copy it to daemon-owned memory, unmap; bail out on the first failure
+    void *client_ptr = NULL, *copy_ptr = NULL;
+    hapiError_t status = hapiIpcOpenMemHandle(&client_ptr, ipc_handle, hapiIpcMemLazyEnablePeerAccess);
+    if (status == hapiSuccess) {
+      status = hapiMalloc(&copy_ptr, size);
+      if (status == hapiSuccess) status = hapiMemcpy(copy_ptr, client_ptr, size, hapiMemcpyDeviceToDevice);
+      else copy_ptr = NULL;  // allocation failed: nothing to release below
+      HAPI_CHECK(hapiIpcCloseMemHandle(client_ptr));
+    }
+    if (status != hapiSuccess) {
+      fprintf(stderr, "HAPI> checkpoint of %d bytes failed: %s\n", size, hapiGetErrorString(status));
+      if (copy_ptr) HAPI_CHECK(hapiFree(copy_ptr));
+      close(client_fd);
+      return;
+    }
+    hapiMemoryMap[allocId] = std::make_pair(copy_ptr, (size_t) size);
+    write(client_fd, &allocId, sizeof(int));
+    allocId++;
+  }
+  else if (strcmp(command, "GET") == 0)
+  {
+    int alloc_id = -1;
+    sscanf(buf, "GET:%ld:%d", &client_pid, &alloc_id);
+    hapiIpcMemHandle_t ipc_handle;
+    memset(&ipc_handle, 0, sizeof(ipc_handle));  // unknown id => all-zero reply, not stack bytes
+    auto it = hapiMemoryMap.find(alloc_id);  // find(), not operator[]: no entry is inserted for an unknown id
+    if (it != hapiMemoryMap.end()) HAPI_CHECK(hapiIpcGetMemHandle(&ipc_handle, it->second.first));
+    write(client_fd, &ipc_handle, sizeof(hapiIpcMemHandle_t));
+  }
+  else if (strcmp(command, "FREE") == 0)
+  {
+    int alloc_id = -1;
+    sscanf(buf, "FREE:%ld:%d", &client_pid, &alloc_id);
+    auto it = hapiMemoryMap.find(alloc_id);
+    if (it != hapiMemoryMap.end()) {
+      HAPI_CHECK(hapiFree(it->second.first));
+      hapiMemoryMap.erase(it);
+    }
+    write(client_fd, "\0", 1);
+  }
+  else if (strcmp(command, "KILL") == 0)
+  {
+    printf("Server: KILL command received from client %ld\n", client_pid);
+    write(client_fd, "\0", 1);
+    close(server_fd);
+    char server_fifo[BUFFER_SIZE];
+    sprintf(server_fifo, SERVER_FIFO_TEMPLATE, my_device);
+    if (remove(server_fifo) == 0) {
+        printf("File '%s' deleted successfully.\n", server_fifo);
+    } else {
+        printf("Error deleting file '%s': %s\n", server_fifo, strerror(errno));
+    }
+    exit(0);
+  }
+
+  close(client_fd);
+}
+
+void hapiStartMemoryDaemon(int my_device) {
+
+  int current_cpu = sched_getcpu();
+  printf("Daemon: Current CPU is %d\n", current_cpu);
+
+  // Child process (daemon)
+  printf("DAEMON: Starting daemon process PID=%d\n", getpid());
+  
+  // Set up the daemon's hapi context
+  hapiSetDevice(my_device);
+
+  char server_fifo[BUFFER_SIZE];
+  sprintf(server_fifo, SERVER_FIFO_TEMPLATE, my_device);
+  mkfifo(server_fifo, 0666);
+  
+  // Open server FIFO for reading; with O_NONBLOCK, open() returns without waiting for a writer
+  char server_fifo_path[BUFFER_SIZE];
+  sprintf(server_fifo_path, SERVER_FIFO_TEMPLATE, my_device);
+  printf("DAEMON: Opening server FIFO %s\n", server_fifo_path);
+  int server_fd = open(server_fifo_path, O_RDONLY | O_NONBLOCK);
+  if (server_fd == -1) {
+    perror("DAEMON: open server FIFO");
+    exit(1);
+  }
+  
+  // Make it blocking for actual reads
+  int flags = fcntl(server_fd, F_GETFL);
+  fcntl(server_fd, F_SETFL, flags & ~O_NONBLOCK);
+
+  char ready_fifo_path[BUFFER_SIZE];
+  sprintf(ready_fifo_path, "/tmp/daemon_ready_%d", my_device);
+  
+  // Signal parent that daemon is ready
+  int ready_fd = open(ready_fifo_path, O_WRONLY);
+  if (ready_fd == -1) {
+    perror("DAEMON: open ready FIFO for writing");
+    exit(1);
+  }
+  write(ready_fd, "1", 1);
+  close(ready_fd);
+  
+  printf("DAEMON: Ready signal sent to parent\n");
+  
+  // Main daemon loop
+  char stream_buf[STREAM_BUF_SIZE];
+  size_t data_in_stream = 0;
+  int bytes_read;
+
+  while (1)
+  {
+    // read() will block here until data is available
+    bytes_read = read(server_fd, stream_buf + data_in_stream, 
+                              STREAM_BUF_SIZE - data_in_stream);
+
+    if (bytes_read > 0)
+    {
+      printf("DAEMON: Read %d bytes from server FIFO\n", bytes_read);
+      data_in_stream += bytes_read;
+      
+      if (data_in_stream >= STREAM_BUF_SIZE) {
+        printf("DAEMON: Stream buffer overflow");
+        exit(1);
+      }
+
+      // Process all complete messages in the buffer
+      while (data_in_stream > 0)
+      {
+        size_t msg_len = 0;
+        // We need at least 4 bytes to identify a command
+        if (data_in_stream < 4) break;
+
+        if (strncmp(stream_buf, "CKPT", 4) == 0) {
+          // CKPT message format: "CKPT:<pid>:<pe>:<size>:<ipc_handle>"
+          // Find the end of the text part (after the 4th colon)
+          const char *p = stream_buf;
+          int colons = 0;
+          size_t header_len = 0;
+          for (size_t i = 0; i < data_in_stream; ++i) {
+            if (p[i] == ':') {
+              colons++;
+              if (colons == 4) { // This must be 4
+                header_len = i + 1;
+                break;
+              }
+            }
+          }
+
+          if (header_len == 0) {
+            // Header is incomplete, need more data
+            break;
+          }
+
+          msg_len = header_len + sizeof(hapiIpcMemHandle_t);
+          if (data_in_stream < msg_len) {
+            // Full message not yet received
+            break;
+          }
+        } else {
+          // Other messages are simple null-terminated strings
+          char* msg_end = (char*)memchr(stream_buf, '\0', data_in_stream);
+          if (msg_end == NULL) {
+            // Incomplete message
+            break;
+          }
+          msg_len = (msg_end - stream_buf) + 1;
+        }
+
+        if (msg_len == 0) break; // Should not happen
+
+        char current_request[BUFFER_SIZE];
+        if (msg_len > BUFFER_SIZE) {
+          printf("DAEMON: Error, received message too long (%zu bytes). Aborting.\n", msg_len);
+          exit(1);
+        }
+        memcpy(current_request, stream_buf, msg_len);
+        
+        // Process the request. Note: This may exit on a KILL command.
+        hapiProcessMemoryRequest(server_fd, my_device, current_request);
+
+        // Remove processed message from buffer
+        data_in_stream -= msg_len;
+        memmove(stream_buf, stream_buf + msg_len, data_in_stream);
+      }
+    }
+    else if (bytes_read == 0)
+    {
+      // All writers have closed the FIFO, so read() reported end-of-file.
+      // read() keeps returning 0 without blocking until a new writer connects,
+      // so sleep briefly to avoid a tight spin-loop while waiting.
+      usleep(1000);
+    }
+    else // bytes_read < 0
+    {
+      // An error occurred.
+      if (errno == EINTR) {
+        continue; // Interrupted by a signal, just try again.
+      }
+      perror("DAEMON: read from server FIFO");
+      break; // Exit on fatal error.
+    }
+  }
+  
+  close(server_fd);
+  exit(0);
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    fprintf(stderr, "Usage: %s <local_rank>\n", argv[0]);
+    return 1;
+  }
+  const char* local_rank_str = argv[1];
+  int local_rank = atoi(local_rank_str);
+  hapiStartMemoryDaemon(local_rank);
+}
\ No newline at end of file
diff --git a/src/arch/cuda/hybridAPI/hapi_portable.h b/src/arch/cuda/hybridAPI/hapi_portable.h
new file mode 100644
index 0000000000..c772a7ceaf
--- /dev/null
+++ b/src/arch/cuda/hybridAPI/hapi_portable.h
@@ -0,0 +1,200 @@
+#pragma once
+// Maps the backend-neutral hapi* names onto the CUDA or HIP runtime API.
+#undef CMK_CUDA
+#undef CMK_HIP
+// NOTE(review): presumably conv-mach-opt.h re-defines whichever backend was configured; if it has an include guard and was already included, both stay undefined here - confirm.
+#include "conv-mach-opt.h"
+
+#ifdef CMK_CUDA
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+#define hapiStream_t cudaStream_t
+
+#define hapiEvent_t cudaEvent_t
+
+#define hapiSetDevice(dev) cudaSetDevice(dev)
+
+#define hapiDevAttrClockRate cudaDevAttrClockRate
+#define hapiDeviceGetAttribute(a,b,c) cudaDeviceGetAttribute(a,b,c)
+
+#define hapiPeekAtLastError cudaPeekAtLastError
+#define hapiGetLastError cudaGetLastError
+#define hapiEventDefault cudaEventDefault
+#define hapiEventDisableTiming cudaEventDisableTiming
+
+#define hapiGetDeviceCount(devCount) cudaGetDeviceCount(devCount)
+
+#define hapiDeviceCanAccessPeer(canAccess, dev1, dev2) \
+    cudaDeviceCanAccessPeer(canAccess, dev1, dev2)
+
+#define hapiDeviceEnablePeerAccess(dev, flags) \
+    cudaDeviceEnablePeerAccess(dev, flags)
+
+#define hapiEventCreateWithFlags(flags, event) cudaEventCreateWithFlags(flags, event)
+
+#define hapiEventRecord(event, stream) cudaEventRecord(event, stream)
+#define hapiEventQuery(event) cudaEventQuery(event)
+#define hapiEventDestroy(event) cudaEventDestroy(event)
+#define hapiStreamWaitEvent(stream, event, flags) \
+    cudaStreamWaitEvent(stream, event, flags)
+
+#define hapiStreamSynchronize(stream) cudaStreamSynchronize(stream)
+#define hapiDeviceSynchronize cudaDeviceSynchronize
+#define hapiEventElapsedTime(a, b, c) cudaEventElapsedTime(a, b, c)
+#define hapiMemGetInfo(a, b) cudaMemGetInfo(a, b)
+#define hapiStreamCreate(stream) cudaStreamCreate(stream)
+#define hapiStreamDestroy cudaStreamDestroy
+#define hapiStreamDefault cudaStreamDefault
+#define hapiStreamNonBlocking cudaStreamNonBlocking
+#define hapiStreamCreateWithPriority cudaStreamCreateWithPriority
+
+#define hapiLaunchHostFunc(stream, func, args) \
+    cudaLaunchHostFunc(stream, func, args)
+
+#define hapiStreamPerThread cudaStreamPerThread
+
+#define hapiIpcMemHandle_t cudaIpcMemHandle_t
+
+#define hapiIpcEventHandle_t cudaIpcEventHandle_t
+
+#define hapiIpcGetMemHandle(handle, ptr) cudaIpcGetMemHandle(handle, ptr)
+#define hapiIpcCloseMemHandle(handle) cudaIpcCloseMemHandle(handle)
+
+#define hapiIpcGetEventHandle(handle, event) cudaIpcGetEventHandle(handle, event)
+
+#define hapiIpcOpenMemHandle(ptr, handle, flags) \
+    cudaIpcOpenMemHandle(ptr, handle, flags)
+
+#define hapiIpcOpenEventHandle(event, handle) \
+    cudaIpcOpenEventHandle(event, handle)
+
+#define hapiDeviceProp cudaDeviceProp
+
+#define hapiGetDeviceProperties(prop, dev) cudaGetDeviceProperties(prop, dev)
+#define hapiGetDevice(dev) cudaGetDevice(dev)
+
+#define hapiMalloc(ptr, size) cudaMalloc(ptr, size)
+#define hapiFree(ptr) cudaFree(ptr)
+#define hapiMallocHost(ptr, size) cudaMallocHost(ptr, size)
+#define hapiFreeHost(ptr) cudaFreeHost(ptr)
+
+#define hapiErrorMemoryAllocation cudaErrorMemoryAllocation
+#define hapiErrorInitializationError cudaErrorInitializationError
+#define hapiSuccess cudaSuccess
+#define hapiError_t cudaError_t
+
+#define hapiMemcpyKind cudaMemcpyKind
+#define hapiMemcpyHostToHost cudaMemcpyHostToHost
+#define hapiMemcpyHostToDevice cudaMemcpyHostToDevice
+#define hapiMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define hapiMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define hapiMemcpy(dst, src, count, kind) cudaMemcpy(dst, src, count, kind)
+#define hapiMemcpy2D(dst, dpitch, src, spitch, width, height, kind) \
+    cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind)
+
+#define hapiGetErrorString(err) cudaGetErrorString(err)
+
+#define hapiEventInterprocess cudaEventInterprocess
+#define hapiIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
+
+#define hapiMemcpyAsync cudaMemcpyAsync
+#define hapiMemcpy2DAsync cudaMemcpy2DAsync
+
+#endif // CMK_CUDA
+
+#ifdef CMK_HIP
+
+#include <hip/hip_runtime.h>
+
+#define hapiStream_t hipStream_t
+
+#define hapiEvent_t hipEvent_t
+
+#define hapiSetDevice(dev) hipSetDevice(dev)
+#define hapiGetDeviceCount(devCount) hipGetDeviceCount(devCount)
+#define hapiDevAttrClockRate hipDeviceAttributeClockRate
+#define hapiDeviceGetAttribute(a,b,c) hipDeviceGetAttribute(a,b,c)
+
+#define hapiPeekAtLastError hipPeekAtLastError
+#define hapiGetLastError hipGetLastError
+
+#define hapiDeviceCanAccessPeer(canAccess, dev1, dev2) \
+    hipDeviceCanAccessPeer(canAccess, dev1, dev2)
+#define hapiDeviceEnablePeerAccess(dev, flags) \
+    hipDeviceEnablePeerAccess(dev, flags)
+
+#define hapiEventCreateWithFlags(flags, event) hipEventCreateWithFlags(flags, event)
+#define hapiEventRecord(event, stream) hipEventRecord(event, stream)
+#define hapiEventQuery(event) hipEventQuery(event)
+#define hapiEventDestroy(event) hipEventDestroy(event)
+#define hapiStreamWaitEvent(stream, event, flags) \
+    hipStreamWaitEvent(stream, event, flags)
+
+#define hapiStreamSynchronize(stream) hipStreamSynchronize(stream)
+#define hapiDeviceSynchronize hipDeviceSynchronize
+#define hapiEventElapsedTime(a, b, c) hipEventElapsedTime(a, b, c)
+#define hapiMemGetInfo(a, b) hipMemGetInfo(a, b)
+#define hapiLaunchHostFunc(stream, func, args) \
+    hipLaunchHostFunc(stream, func, args)
+
+#define hapiStreamPerThread hipStreamPerThread
+
+#define hapiIpcMemHandle_t hipIpcMemHandle_t
+
+#define hapiIpcEventHandle_t hipIpcEventHandle_t
+
+#define hapiIpcGetMemHandle(handle, ptr) hipIpcGetMemHandle(handle, ptr)
+#define hapiIpcCloseMemHandle(handle) hipIpcCloseMemHandle(handle)
+
+#define hapiIpcGetEventHandle(handle, event) hipIpcGetEventHandle(handle, event)
+
+#define hapiIpcOpenMemHandle(ptr, handle, flags) \
+    hipIpcOpenMemHandle(ptr, handle, flags)
+
+#define hapiIpcOpenEventHandle(event, handle) \
+    hipIpcOpenEventHandle(event, handle)
+
+#define hapiDeviceProp hipDeviceProp_t
+
+#define hapiGetDeviceProperties(prop, dev) hipGetDeviceProperties(prop, dev)
+#define hapiGetDevice(dev) hipGetDevice(dev)
+#define hapiStreamCreate(stream) hipStreamCreate(stream)
+#define hapiStreamDestroy hipStreamDestroy
+#define hapiStreamDefault hipStreamDefault
+#define hapiStreamNonBlocking hipStreamNonBlocking
+#define hapiStreamCreateWithPriority hipStreamCreateWithPriority
+
+#define hapiMalloc(ptr, size) hipMalloc(ptr, size)
+#define hapiFree(ptr) hipFree(ptr)
+#define hapiMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#define hapiFreeHost(ptr) hipHostFree(ptr)
+
+#define hapiErrorMemoryAllocation hipErrorMemoryAllocation
+#define hapiErrorInitializationError hipErrorInitializationError
+#define hapiSuccess hipSuccess
+#define hapiError_t hipError_t
+// hapiStreamDestroy, hapiStreamDefault and hapiStreamCreateWithPriority are
+// already mapped next to hapiStreamCreate earlier in this CMK_HIP section,
+// so they are not defined a second time here.
+
+#define hapiMemcpyKind hipMemcpyKind
+#define hapiMemcpyHostToHost hipMemcpyHostToHost
+#define hapiMemcpyHostToDevice hipMemcpyHostToDevice
+#define hapiMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define hapiMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define hapiMemcpy(dst, src, count, kind) hipMemcpy(dst, src, count, kind)
+#define hapiMemcpy2D(dst, dpitch, src, spitch, width, height, kind) \
+    hipMemcpy2D(dst, dpitch, src, spitch, width, height, kind)
+#define hapiGetErrorString(err) hipGetErrorString(err)
+
+#define hapiEventDisableTiming hipEventDisableTiming
+#define hapiEventDefault hipEventDefault
+#define hapiEventInterprocess hipEventInterprocess
+#define hapiIpcMemLazyEnablePeerAccess hipIpcMemLazyEnablePeerAccess
+
+#define hapiMemcpyAsync hipMemcpyAsync
+#define hapiMemcpy2DAsync hipMemcpy2DAsync
+
+#endif // CMK_HIP
diff --git a/src/arch/mpi/charmrun b/src/arch/mpi/charmrun
index 901df49fc4..e8f188ad9e 100755
--- a/src/arch/mpi/charmrun
+++ b/src/arch/mpi/charmrun
@@ -74,6 +74,11 @@ do
 		args=("$1" "$2" "${args[@]}")
 		shift
 		;;
+	++nodelist)
+		machinefile="$2"
+		args=("-machinefile" "$2" "${args[@]}")
+		shift
+		;;
 	++quiet)
 		QUIET=1
 		;;
diff --git a/src/arch/mpi/charmrun_elastic b/src/arch/mpi/charmrun_elastic
new file mode 100755
index 0000000000..f9e17224c3
--- /dev/null
+++ b/src/arch/mpi/charmrun_elastic
@@ -0,0 +1,69 @@
+#!/bin/bash
+# charmrun_elastic: runs charmrun in a loop, relaunching it with "+restart"
+# whenever the job exits with code 100, and returns the job's final exit code.
+
+is_restart=false
+original_args=("$@")
+
+# Directory holding the restart state. A script-local name is used so that a
+# TMPDIR exported in the caller's environment is not overwritten.
+if [[ "$(uname)" == "Darwin" ]]; then
+    state_dir="/tmp"
+else
+    state_dir="/dev/shm"
+fi
+pes_file="$state_dir/numRestartProcs.txt"  # PE count to use for the next launch
+
+time {
+while true; do
+    args=()
+    pes_arg=()
+    restart_arg=()
+
+    # Split the original command line into the PE-count option and the rest.
+    i=0
+    while [ $i -lt ${#original_args[@]} ]; do
+        arg="${original_args[$i]}"
+        case "$arg" in
+        +p|++p)
+            i=$((i+1))
+            pes_arg=("$arg" "${original_args[@]:$i:1}")  # value is the next word
+            ;;
+        +p[0-9]*|++p[0-9]*)
+            pes_arg=("$arg")
+            ;;
+        *)
+            args+=("$arg")
+            ;;
+        esac
+        i=$((i+1))
+    done
+
+    # On a restart, add "+restart <dir>" and take the PE count from pes_file.
+    if [ "$is_restart" = true ]; then
+        restart_arg=("+restart" "$state_dir")
+        if [ -f "$pes_file" ]; then
+            num_pes=$(cat "$pes_file")
+            echo "Charm> Reading pes $num_pes from $pes_file"
+            pes_arg=("+p" "$num_pes")
+        fi
+    fi
+
+    # Launch charmrun: PE count first, then the remaining original arguments.
+    "$(dirname "$0")/charmrun" "${pes_arg[@]}" "${args[@]}" "${restart_arg[@]}"
+    EXIT_CODE=$?
+
+    if [ "$EXIT_CODE" -eq 100 ]; then
+        is_restart=true
+        echo "Restart signal (code 100) received. Looping again."
+        echo "----------------------------------------"
+    else
+        echo "Final exit signal (code $EXIT_CODE) received. Exiting loop."
+        break
+    fi
+done
+}
+
+echo "Control loop finished."
+# Propagate the job's exit status instead of always reporting success.
+exit "$EXIT_CODE"
\ No newline at end of file
diff --git a/src/arch/mpi/conv-common.h b/src/arch/mpi/conv-common.h
index 06aaafd9e4..e3748b6237 100644
--- a/src/arch/mpi/conv-common.h
+++ b/src/arch/mpi/conv-common.h
@@ -50,3 +50,6 @@
 #define CMK_USE_COMMON_LOCK                                1
 
 #define CMK_ONESIDED_IMPL                                  1
+
+/* cuda aware mpi machine layer supports GPU-aware communication */
+#define CMK_GPU_COMM                   1
diff --git a/src/arch/mpi/machine.C b/src/arch/mpi/machine.C
index 368cb53d7c..7eb70ef7c9 100644
--- a/src/arch/mpi/machine.C
+++ b/src/arch/mpi/machine.C
@@ -1,16 +1,20 @@
-
 /** @file
  * MPI based machine layer
  * @ingroup Machine
  */
 /*@{*/
 
+#include <string>
 #include <stdio.h>
 #include <errno.h>
 #include "converse.h"
 #include "cmirdmautils.h"
 #include <mpi.h>
 #include <algorithm>
+#if CMK_CUDA
+#include <cuda_runtime.h> /* guarded: the header only exists in CUDA builds */
+#endif
+
 
 #ifdef AMPI
 #  warning "We got the AMPI version of mpi.h, instead of the system version--"
@@ -41,6 +43,16 @@ static char* strsignal(int sig) {
 
 #include "machine.h"
 #include "pcqueue.h"
+#include "conv-ccs.h"
+#include "ccs-server.h"
+#include "ckrescale.h"
+
+#if CMK_SHRINK_EXPAND
+CcsDelayedReply shrinkExpandreplyToken;
+extern int numProcessAfterRestart;
+extern char *_shrinkexpand_basedir;
+int mynewpe=0;
+#endif
 
 /* Msg types to have different actions taken for different message types
  * REGULAR                     - Regular Charm++ message
@@ -53,7 +65,12 @@ static char* strsignal(int sig) {
  * */
 
 #define CMI_MSGTYPE(msg)            ((CmiMsgHeaderBasic *)msg)->mpiMsgType
-enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER_SEND, ONESIDED_BUFFER_RECV, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND};
+enum mpiMsgTypes { REGULAR, ONESIDED_BUFFER_SEND, ONESIDED_BUFFER_RECV, ONESIDED_BUFFER_DIRECT_RECV, ONESIDED_BUFFER_DIRECT_SEND, POST_DIRECT_RECV, POST_DIRECT_SEND,
+#if CMK_CUDA
+    DEVICE_SEND_OP,
+    DEVICE_RECV_OP
+#endif
+};
 
 /* =======Beginning of Definitions of Performance-Specific Macros =======*/
 /* Whether to use multiple send queue in SMP mode */
@@ -349,6 +366,13 @@ typedef struct msg_list {
     struct msg_list *next;
     int size, destpe, mode, type;
     MPI_Request req;
+#if CMK_CUDA
+    void* ptr;
+    size_t device_size;
+    DeviceRdmaOp* op;
+    uint64_t tag;
+    int dest_mpi_rank;
+#endif
 #if CMK_ONESIDED_IMPL
     void *ref;
     // This field can store the pointer to any structure that might have to be accessed.
@@ -409,6 +433,38 @@ static int SendMsgBuf(void);
 static  void EnqueueMsg(void *m, int size, int node, int mode, int type, void *ref);
 #endif
 
+#if CMK_CUDA
+
+CpvDeclare(int, tag_counter);
+
+// Dynamic RMA window that device buffers are attached to and read from.
+MPI_Win globalDevWin = MPI_WIN_NULL;
+// Creates the dynamic window on MPI_COMM_WORLD; aborts if MPI reports failure.
+void LrtsInitRMA() {
+    if (MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &globalDevWin) != MPI_SUCCESS)
+        CmiAbort("RMA Window Creation Failed!");
+}
+
+// Frees the window if it was created; a no-op while it is still MPI_WIN_NULL.
+void LrtsCleanupRMA() {
+    if (globalDevWin != MPI_WIN_NULL)
+        MPI_Win_free(&globalDevWin); // This will fail if memory is still attached!
+}
+
+#if CMK_SMP
+// Converse handler for the DeviceRdmaOpMsg_ pushed by processRdmaRequests().
+void* deviceRecvCallback(void* arg) {
+  DeviceRdmaOpMsg_* recv_msg = (DeviceRdmaOpMsg_*)arg;
+  CmiInvokeRecvHandler(recv_msg->op);
+  CmiFree(recv_msg);  // the handler owns the CmiAlloc'd message; it leaked before
+  return NULL;
+}
+int deviceRecvCallbackHandler;
+#endif
+#else
+void LrtsCleanupRMA() {}  // lets the unguarded callers build without CUDA
+#endif
+
 /* ### End of Machine-running Related Functions ### */
 
 /* ### Beginning of Idle-state Related Functions ### */
@@ -431,6 +487,17 @@ void CmiNotifyIdleForMPI(void);
 #include "machine-ctrlmsg.C"
 #endif
 
+// Echoes the contents of the file named by arg_nodelist to stdout.
+void print_nodelist(char* arg_nodelist){
+    FILE *f=fopen(arg_nodelist,"r");
+    if (f == NULL) { perror(arg_nodelist); return; }  // missing or unreadable file
+    int c;  // int, not char: fgetc() reports EOF as a value outside the char range
+    while ((c = fgetc(f)) != EOF) {
+      putchar(c);
+    }
+    fclose(f);
+}
+
 SMSG_LIST *allocateSmsgList(char *msg, int destNode, int size, int mode, int type, void *ref) {
   SMSG_LIST *msg_tmp = (SMSG_LIST *) malloc(sizeof(SMSG_LIST));
   msg_tmp->msg = msg;
@@ -452,14 +519,6 @@ static void EnqueueMsg(void *m, int size, int node, int mode, int type, void *re
     /*SMSG_LIST *msg_tmp = (SMSG_LIST *) CmiAlloc(sizeof(SMSG_LIST));*/
     SMSG_LIST *msg_tmp = allocateSmsgList((char *)m, node, size, mode, type, ref);
     MACHSTATE1(3,"EnqueueMsg to node %d {{ ", node);
-    msg_tmp->msg = (char *)m;
-    msg_tmp->size = size;
-    msg_tmp->destpe = node;
-    msg_tmp->next = 0;
-    msg_tmp->mode = mode;
-#if CMK_ONESIDED_IMPL
-    msg_tmp->ref = NULL;
-#endif
 
 #if MULTI_SENDQUEUE
     PCQueuePush(procState[CmiMyRank()].postMsgBuf,(char *)msg_tmp);
@@ -625,6 +684,7 @@ static void ReleasePostedMessages(void) {
 
     MACHSTATE1(2,"ReleasePostedMessages begin on %d {", CmiMyPe());
     while (msg_tmp!=0) {
+        int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         done =0;
 #if CMK_SMP_TRACE_COMMTHREAD || CMK_TRACE_COMMOVERHEAD
         double startT = CmiWallTimer();
@@ -678,8 +738,13 @@ static void ReleasePostedMessages(void) {
                 // which is freed in the above code (either ONESIDED_BUFFER_DIRECT_RECV or
                 // ONESIDED_BUFFER_DIRECT_SEND)
             }
+            #if CMK_CUDA
+            else if(msg_tmp->type == DEVICE_SEND_OP || msg_tmp->type == DEVICE_RECV_OP) {
+                // TODO: check if we can remove this (empty on purpose: skips the CmiFree below)
+            }
+            #endif
             else
 #endif
             {
               CmiFree(msg_tmp->msg);
             }
@@ -815,6 +880,7 @@ static int PumpMsgs(void) {
             CmiAbort("MPI_Iprobe failed\n");
 
         if (!flg) break;
+        int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         CONDITIONAL_TRACE_USER_EVENT(70); /* MPI_Iprobe related user event */
         
         recd = 1;
@@ -824,7 +890,7 @@ static int PumpMsgs(void) {
 #if USE_ASYNC_RECV_FUNC
         if(nbytes >= IRECV_MSG_THRESHOLD) doSyncRecv = 0;
 #endif        
-        if(doSyncRecv){
+        if(doSyncRecv) {
             START_EVENT();
             if (MPI_SUCCESS != MPI_Recv(msg,nbytes,MPI_BYTE,sts.MPI_SOURCE,sts.MPI_TAG, charmComm,&sts))
                 CmiAbort("PumpMsgs: MPI_Recv failed!\n");            
@@ -846,7 +912,7 @@ static int PumpMsgs(void) {
 
 #endif /*end of !MPI_POST_RECV and !USE_MPI_CTRLMSG_SCHEME*/
 
-		if(doSyncRecv){
+		if (doSyncRecv) {
 			MACHSTATE2(3,"PumpMsgs recv one from node:%d to rank:%d", sts.MPI_SOURCE, CMI_DEST_RANK(msg));
 			CMI_CHECK_CHECKSUM(msg, nbytes);
 	#if CMK_ERROR_CHECKING
@@ -1048,6 +1114,13 @@ static void PumpMsgsBlocking(void) {
     handleOneRecvedMsg(nbytes, msg);
 }
 
+#if CMK_CUDA
+    #include <map>
+    #include <vector>
+    std::vector<std::pair<MPI_Request, DeviceRdmaOp*>> rdma_requests; // in-flight MPI_Rget requests
+    // Number of outstanding device RDMA gets per source MPI rank
+    std::map<int, int> access_epochs;
+#endif
 
 #if CMK_SMP
 
@@ -1055,32 +1128,41 @@ static void PumpMsgsBlocking(void) {
 static int SendMsgBuf(void) {
     SMSG_LIST *msg_tmp;
     char *msg;
-    int node, rank, size;
+    // int node, rank, size;
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     int i;
     int sent = 0;
 
-#if CMI_EXERT_SEND_CAP || CMI_DYNAMIC_EXERT_CAP
-    int sentCnt = 0;
-#endif
-
-#if CMI_DYNAMIC_EXERT_CAP
-    dynamicSendCap = CMI_DYNAMIC_MAXCAPSIZE;
-#endif
-
     MACHSTATE(2,"SendMsgBuf begin {");
-#if MULTI_SENDQUEUE
-    for (i=0; i<_Cmi_mynodesize+1; i++) { /* subtle: including comm thread */
-        if (!PCQueueEmpty(procState[i].postMsgBuf)) {
-            msg_tmp = (SMSG_LIST *)PCQueuePop(procState[i].postMsgBuf);
-#else
     /* single message sending queue */
     /* CmiLock(postMsgBufLock); */
     msg_tmp = (SMSG_LIST *)PCQueuePop(postMsgBuf);
     /* CmiUnlock(postMsgBufLock); */
     while (NULL != msg_tmp) {
-#endif
-
 #if CMK_ONESIDED_IMPL
+#if CMK_CUDA
+            // Device ops are queued by LrtsSendDevice/LrtsRecvDevice as bare
+            // malloc'd SMSG_LIST entries and are not passed on to the MPI send
+            // routines below, so each entry is released here once handled.
+            if (msg_tmp->type == DEVICE_SEND_OP) {
+                if (MPI_Win_attach(globalDevWin, msg_tmp->ptr, msg_tmp->device_size) != MPI_SUCCESS)
+                    CmiAbort("MPI_Win_attach failed\n");
+                free(msg_tmp);
+            } else if(msg_tmp->type == DEVICE_RECV_OP) {
+                if (access_epochs[msg_tmp->op->src_mpi_rank] == 0)
+                    MPI_Win_lock(MPI_LOCK_SHARED, msg_tmp->op->src_mpi_rank, 0, globalDevWin);
+                access_epochs[msg_tmp->op->src_mpi_rank]++;
+                MPI_Request req;
+                int result = MPI_Rget((void*)msg_tmp->op->dest_ptr, msg_tmp->op->size, MPI_BYTE,
+                msg_tmp->op->src_mpi_rank, (MPI_Aint)(msg_tmp->op->tag), msg_tmp->op->size,
+                MPI_BYTE, globalDevWin, &req);
+                if (result != MPI_SUCCESS)
+                    CmiAbort("LrtsRecvDevice: MPI_Get failed!\n");
+                rdma_requests.push_back({req, msg_tmp->op});
+                free(msg_tmp);
+            } else
+#endif
             if(msg_tmp->type == ONESIDED_BUFFER_DIRECT_RECV || msg_tmp->type == ONESIDED_BUFFER_DIRECT_SEND) {
                 NcpyOperationInfo *ncpyOpInfo = (NcpyOperationInfo *)(msg_tmp->ref);
                 MPISendOrRecvOneBuffer(msg_tmp, ncpyOpInfo->tag);
@@ -1091,24 +1168,8 @@ static int SendMsgBuf(void) {
                 MPISendOneMsg(msg_tmp);
             }
             sent=1;
-
-#if CMI_EXERT_SEND_CAP
-            if (++sentCnt == SEND_CAP) break;
-#elif CMI_DYNAMIC_EXERT_CAP
-            if (++sentCnt >= dynamicSendCap) break;
-            if (CpvAccess(MsgQueueLen) > CMI_DYNAMIC_OUTGOING_THRESHOLD)
-                dynamicSendCap = CMI_DYNAMIC_SEND_CAPSIZE;
-#endif
-
-#if ! MULTI_SENDQUEUE
-            /* CmiLock(postMsgBufLock); */
             msg_tmp = (SMSG_LIST *)PCQueuePop(postMsgBuf);
-            /* CmiUnlock(postMsgBufLock); */
-#endif
         }
-#if MULTI_SENDQUEUE
-    }
-#endif
     MACHSTATE(2,"}SendMsgBuf end ");
     return sent;
 }
@@ -1144,6 +1205,73 @@ static double sendtime = 0.0;
 
 #endif //end of CMK_SMP
 
+#if CMK_CUDA
+
+/* Polls the outstanding device MPI_Rget requests. For every request that has
+ * completed it decrements the epoch count of the source rank (unlocking the
+ * window when the count reaches zero) and delivers the completion: in SMP
+ * builds by pushing a DeviceRdmaOpMsg_ to the destination PE, otherwise by
+ * calling CmiInvokeRecvHandler() directly. Finished entries are then erased
+ * from rdma_requests. */
+void processRdmaRequests() {
+    const int n = (int)rdma_requests.size();
+    if (n == 0) return;
+
+    // Scratch arrays for MPI_Testsome; static so the storage is reused between
+    // calls and only grows when more requests are outstanding.
+    static std::vector<MPI_Request> requests(10);
+    static std::vector<int> indices(10);
+    static std::vector<MPI_Status> statuses(10);
+
+    if ((size_t)n > requests.size()) {
+        requests.resize(n); indices.resize(n); statuses.resize(n);
+    }
+
+    for (int i = 0; i < n; i++)
+        requests[i] = rdma_requests[i].first;
+
+    int outcount;
+    if (MPI_Testsome(n, requests.data(), &outcount, indices.data(),
+                     statuses.data()) != MPI_SUCCESS)
+        CmiAbort("processRdmaRequests: MPI_Testsome failed!\n");
+
+    if (outcount == MPI_UNDEFINED || outcount == 0)
+        return;
+
+    for (int i = 0; i < outcount; i++) {
+        const int idx = indices[i];
+        DeviceRdmaOp *op = rdma_requests[idx].second;
+
+        access_epochs[op->src_mpi_rank]--;
+        if (access_epochs[op->src_mpi_rank] == 0)
+            MPI_Win_unlock(op->src_mpi_rank, globalDevWin);
+
+#if CMK_SMP
+        // Hand the completion to the destination PE as a Converse message.
+        DeviceRdmaOpMsg_* conv_msg =
+            (DeviceRdmaOpMsg_*)CmiAlloc(sizeof(DeviceRdmaOpMsg_));
+        conv_msg->op = op;
+        CmiSetHandler(conv_msg, deviceRecvCallbackHandler);
+        CmiPushPE(CmiRankOf(op->dest_pe), conv_msg);
+#else
+        CmiInvokeRecvHandler(op);
+#endif
+
+        // Mark the entry as finished so the erase below removes it.
+        rdma_requests[idx].first = MPI_REQUEST_NULL;
+    }
+
+    rdma_requests.erase(
+        std::remove_if(rdma_requests.begin(),
+                       rdma_requests.end(),
+                       [](const std::pair<MPI_Request, DeviceRdmaOp*> &e) {
+                           return e.first == MPI_REQUEST_NULL;
+                       }),
+        rdma_requests.end());
+}
+
+#endif
+
 void LrtsAdvanceCommunication(int whenidle) {
 #if REPORT_COMM_METRICS
     double t1, t2, t3, t4;
@@ -1158,10 +1333,10 @@ void LrtsAdvanceCommunication(int whenidle) {
 #endif
 
     ReleasePostedMessages();
+    
 #if REPORT_COMM_METRICS
     t3 = CmiWallTimer();
 #endif
-
     SendMsgBuf();
 
 #if REPORT_COMM_METRICS
@@ -1171,6 +1346,10 @@ void LrtsAdvanceCommunication(int whenidle) {
     sendtime += (t4-t3);
 #endif
 
+#if CMK_CUDA
+    processRdmaRequests();
+#endif
+
 #else /* non-SMP case */
     ReleasePostedMessages();
 
@@ -1185,6 +1364,10 @@ void LrtsAdvanceCommunication(int whenidle) {
     releasetime += (t2-t1);
 #endif
 
+#if CMK_CUDA
+    processRdmaRequests();
+#endif
+
 #endif /* end of #if CMK_SMP */
 }
 /* ######End of functions related with communication progress ###### */
@@ -1322,6 +1505,9 @@ void LrtsExit(int exitcode) {
       sigaction(SIGINT, &signal_int, NULL);
 #else
       signal(SIGINT, signal_int);
+#endif
+#if CMK_CUDA
+      LrtsCleanupRMA();
 #endif
       MPI_Finalize();
 #endif
@@ -1388,6 +1574,12 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) {
     char** largv=*argv;
     int tagUbGetResult;
     void *tagUbVal;
+    char* arg_nodelist;
+
+    /*if (CmiGetArgStringDesc(argv, "++nodelist", &arg_nodelist, "nodelist"))
+    {
+        print_nodelist(arg_nodelist);
+    }*/
 
     if (CmiGetArgFlag(largv, "+comm_thread_only_recv")) {
 #if CMK_SMP
@@ -1518,6 +1710,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) {
       if (newpe == -1) {
           MPI_Barrier(charmComm);
           //MPI_Barrier(charmComm);
+          LrtsCleanupRMA();
           MPI_Finalize();
           exit(0);
       }
@@ -1681,6 +1874,12 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID) {
     rdmaTagLock = CmiCreateLock();
 #endif
 #endif
+
+#if CMK_CUDA
+    LrtsInitRMA();
+    CpvInitialize(int, tag_counter);
+    CpvAccess(tag_counter) = 0;
+#endif
 }
 
 INLINE_KEYWORD void LrtsNotifyIdle(void) {}
@@ -1818,6 +2017,7 @@ void LrtsPostCommonInit(int everReturn) {
 /* ######End of functions related with starting programs###### */
 
 /***********************************************************************
+ *
  *
  * Abort function:
  *
@@ -1989,6 +2189,56 @@ int CmiBarrierZero(void) {
 }
 
 
+#if CMK_SHRINK_EXPAND
+/* Shrink/expand shutdown path: tears down Converse on every PE. When
+ * get_shrinkexpand_exit() is set, PE 0 also records the new process count,
+ * sends the pending CCS reply and exits with code 100. */
+void ConverseCleanup(void)
+{
+  MACHSTATE(2,"ConverseCleanup {");
+
+  #if (CMK_SMP && !CMK_SMP_NO_COMMTHD)
+    CmiAbort(" ConverseCleanup called in SMP. CmiBarrier needs to be called on comm thread as well! Right now, this hangs. Remove this abort when SMP support implemented.\n");
+  #endif
+  CmiBarrier(); // TODO: for smp, this must also be called on comm thread. otherwise, hangs
+
+#if CMK_USE_SYSVSHM
+	CmiExitSysvshm();
+#elif CMK_USE_PXSHM
+	CmiExitPxshm();
+#endif
+  ConverseCommonExit();               /* should be called by every rank */
+  CmiNodeBarrier();        /* single node SMP, make sure every rank is done */
+
+  if (get_shrinkexpand_exit() && CmiMyPe() == 0) {
+    // Write the post-restart process count to <basedir>/numRestartProcs.txt. NOTE(review): assumes _shrinkexpand_basedir is non-NULL - confirm.
+
+    std::string path = std::string(_shrinkexpand_basedir) + "/numRestartProcs.txt";
+    FILE *fp = fopen(path.c_str(), "w");
+    if (fp != NULL) {
+      CmiPrintf("Charm> Writing numProcessAfterRestart %i to %s\n", numProcessAfterRestart, path.c_str());
+      fprintf(fp, "%d", numProcessAfterRestart);
+      fclose(fp);
+    } else {
+      perror("Error opening file");
+    }
+
+    // Use the new synchronous reply function. This blocks until the reply is
+    // sent and acknowledged by charmrun, robustly fixing the race condition.
+    CcsSendDelayedReplyAndTerm(shrinkExpandreplyToken, 0, 0);
+
+    CmiBarrier();
+    ConverseExit(100);  // non-default exit code marks the restart request
+
+  } else {
+    // Every other PE, or PE 0 when no restart exit is requested: sync and exit.
+    CmiBarrier();
+    ConverseExit();
+  }
+}
+#endif
+
+
 #if CMK_MEM_CHECKPOINT || CMK_MESSAGE_LOGGING
 
 void mpi_restart_crashed(int pe, int rank)
@@ -2057,6 +2307,7 @@ void CkDieNow(void)
     }
     MPI_Barrier(charmComm);
  //   MPI_Barrier(charmComm);
+    LrtsCleanupRMA();
     MPI_Finalize();
     exit(0);
 #endif
@@ -2357,6 +2608,63 @@ void CmiSetupMachineRecvBuffersUser(void)
 }
 /*=======End of Msg Histogram or Dynamic Post-Recv Related Funcs======*/
 
+#if CMK_CUDA
+
+#include <map>
+std::map<std::pair<void*, size_t>, uint64_t> cache_window;
+
+// Attaches a buffer to globalDevWin once per (ptr, size) and returns its tag.
+void LrtsSendDevice(int dest_mpi_rank, int src_mpi_rank, const void*& ptr, size_t size, uint64_t& tag) {
+    auto cached = cache_window.find(std::make_pair((void*)ptr, size));
+    if (cached != cache_window.end()) { tag = cached->second; return; }
+#if CMK_SMP
+    // Queue the attach; SendMsgBuf() calls MPI_Win_attach when it dequeues this.
+    SMSG_LIST *attach_req = (SMSG_LIST *) malloc(sizeof(SMSG_LIST));
+    attach_req->ptr = (void*)(ptr);
+    attach_req->device_size = size;
+    attach_req->type = DEVICE_SEND_OP;
+    attach_req->dest_mpi_rank = dest_mpi_rank;
+    attach_req->tag = (uint64_t)(void*)(ptr);
+    PCQueuePush(postMsgBuf,(char *)attach_req);
+#else
+    if (MPI_Win_attach(globalDevWin, (void*)ptr, size) != MPI_SUCCESS) CmiAbort("MPI_Win_attach failed\n");
+#endif
+    // The tag is the buffer's virtual address; remember it for this (ptr, size).
+    tag = (uint64_t)(void*)(ptr);
+    cache_window[{(void*)ptr, size}] = tag;
+}
+
+std::map<int, bool> handler_registered;
+
+// Starts the get described by op: queued for SendMsgBuf() in SMP builds, issued here otherwise.
+// NOTE(review): handler_registered and deviceRecvCallbackHandler are plain globals written without a lock - confirm this is safe in SMP mode.
+void LrtsRecvDevice(DeviceRdmaOp* op, DeviceRecvType type)
+{
+#if CMK_SMP
+    if(handler_registered[CmiMyPe()] == false) {
+        deviceRecvCallbackHandler = CmiRegisterHandler((CmiHandler) deviceRecvCallback);
+        handler_registered[CmiMyPe()] = true;
+    }
+    SMSG_LIST *msg_tmp = (SMSG_LIST *) malloc(sizeof(SMSG_LIST));
+    msg_tmp->op = op;
+    msg_tmp->type = DEVICE_RECV_OP;
+    PCQueuePush(postMsgBuf,(char *)msg_tmp);
+#else
+    if (access_epochs[op->src_mpi_rank] == 0 &&
+        MPI_Win_lock(MPI_LOCK_SHARED, op->src_mpi_rank, 0, globalDevWin) != MPI_SUCCESS)
+        CmiAbort("LrtsRecvDevice: MPI_Win_lock failed!\n");
+    access_epochs[op->src_mpi_rank]++;
+    MPI_Request req;
+    if (MPI_Rget((void*)op->dest_ptr, op->size, MPI_BYTE, op->src_mpi_rank,
+                 (MPI_Aint)(op->tag), op->size, MPI_BYTE, globalDevWin, &req) != MPI_SUCCESS)
+        CmiAbort("LrtsRecvDevice: MPI_Rget failed!\n");
+    // Tracked until processRdmaRequests() sees the request complete.
+    rdma_requests.push_back({req, op});
+#endif
+}
+
+#endif // CMK_CUDA
+
 
 /*@}*/
 
diff --git a/src/arch/netlrts/machine-eth.C b/src/arch/netlrts/machine-eth.C
index 3f89d2c9c2..4d68d01c2f 100644
--- a/src/arch/netlrts/machine-eth.C
+++ b/src/arch/netlrts/machine-eth.C
@@ -92,12 +92,14 @@ int CheckSocketsReady(int withDelayMs)
   }
   
   CmiStdoutCheck(CMK_PIPE_SUB);
-  if (Cmi_charmrun_fd!=-1) 
+  if (Cmi_charmrun_fd!=-1) {
 	  ctrlskt_ready_read = CMK_PIPE_CHECKREAD(Cmi_charmrun_fd);
+  }
   if (dataskt!=-1) {
-	dataskt_ready_read = CMK_PIPE_CHECKREAD(dataskt);
-	if (dataWrite)
-		dataskt_ready_write = CMK_PIPE_CHECKWRITE(dataskt);
+	  dataskt_ready_read = CMK_PIPE_CHECKREAD(dataskt);
+    if (dataWrite) {
+      dataskt_ready_write = CMK_PIPE_CHECKWRITE(dataskt);
+    }
   }
   return nreadable;
 }
diff --git a/src/arch/netlrts/machine.C b/src/arch/netlrts/machine.C
index fdb0d72403..1ebda70353 100644
--- a/src/arch/netlrts/machine.C
+++ b/src/arch/netlrts/machine.C
@@ -251,7 +251,7 @@ int _kq = -1;
 #if CMK_SHRINK_EXPAND
 extern void resumeAfterRealloc(void);
 extern char willContinue;
-extern int mynewpe;
+int mynewpe=0;
 extern int numProcessAfterRestart;
 CcsDelayedReply shrinkExpandreplyToken;
 extern char *_shrinkexpand_basedir;
@@ -598,7 +598,6 @@ int    Cmi_isOldProcess = 0; // means this process was already there
 static int    Cmi_mynewpe = 0;
 static int    Cmi_oldpe = 0;
 static int    Cmi_newnumnodes = 0;
-int    Cmi_myoldpe = 0;
 static int Cmi_charmrun_assigned_pe;
 #endif
 
@@ -653,11 +652,17 @@ static void parse_netstart(void)
   int nread;
   int port;
   ns = getenv("NETSTART");
+  int dummy;
+#if CMK_SHRINK_EXPAND
+  int* ptr = Cmi_isOldProcess == 1 ? &dummy : &Lrts_myNode;
+#else
+  int* ptr = &Lrts_myNode;
+#endif
   if (ns!=0) 
   {/*Read values set by Charmrun*/
         char Cmi_charmrun_name[1024];
         nread = sscanf(ns, "%d%s%d%d%d",
-                 &Lrts_myNode,
+                 ptr,
                  Cmi_charmrun_name, &Cmi_charmrun_port,
                  &Cmi_charmrun_pid, &port);
 	Cmi_charmrun_IP=skt_lookup_ip(Cmi_charmrun_name);
@@ -666,11 +671,9 @@ static void parse_netstart(void)
                 fprintf(stderr,"Error parsing NETSTART '%s'\n",ns);
                 exit(1);
         }
+    
 #if CMK_SHRINK_EXPAND
     Cmi_charmrun_assigned_pe = Lrts_myNode;
-    if (Cmi_isOldProcess) {
-      Cmi_myoldpe = Lrts_myNode;
-    }
 #endif
     if (getenv("CmiLocal") != NULL) {      /* ++local */
           /* CmiMyLocalRank is used for setting default cpu affinity */
@@ -1395,8 +1398,10 @@ static void open_charmrun_socket(void)
   dataskt=skt_datagram(&dataport, Cmi_os_buffer_size);
 #endif
   MACHSTATE2(5,"skt_connect at dataskt:%d Cmi_charmrun_port:%d",dataskt, Cmi_charmrun_port);
+  //printf("skt_connect at dataskt:%d Cmi_charmrun_port:%d",dataskt, Cmi_charmrun_port);
   Cmi_charmrun_fd = skt_connect(Cmi_charmrun_IP, Cmi_charmrun_port, 1800);
   MACHSTATE2(5,"Opened connection to charmrun at socket %d, dataport=%d", Cmi_charmrun_fd, dataport);
+  //printf("Opened connection to charmrun at socket %d, dataport=%d", Cmi_charmrun_fd, dataport);
   skt_tcp_no_nagle(Cmi_charmrun_fd);
 }
 
@@ -1434,7 +1439,9 @@ static void send_singlenodeinfo(void)
   memset(&me, 0, sizeof(me));
 
 #if CMK_SHRINK_EXPAND
-  me.nodeNo = ChMessageInt_new(Cmi_charmrun_assigned_pe);
+  me.nodeNo = ChMessageInt_new(Lrts_myNode);
+  //if (Cmi_isOldProcess && ChMessageInt(me.nodeNo) == 3)
+  //  exit(1);
 #else
   me.nodeNo = ChMessageInt_new(Lrts_myNode);
 #endif
@@ -1454,6 +1461,8 @@ static void send_singlenodeinfo(void)
      use non-locking version */
   ctrl_sendone_nolock("initnode", (const char *)&me, sizeof(me), NULL, 0);
   MACHSTATE1(5, "send initnode - dataport:%d", dataport);
+  //fprintf(stderr, "send initnode - dataport:%d", dataport);
+  //fflush(stderr);
 
   MACHSTATE(3, "initnode sent");
 }
@@ -1798,7 +1807,7 @@ void LrtsPostCommonInit(int everReturn)
         CmiAbort("Charm++ Fatal Error: interrupt mode does not work with default system memory allocator. Run with +netpoll to disable the interrupt.");
     }
 #endif
-  }       
+  }
 
 #if MEMORYUSAGE_OUTPUT
   memoryusage_counter = 0;
@@ -1904,7 +1913,7 @@ void ConverseCleanup(void)
 
   if (CmiMyPe() == 0) {
     if (willContinue) {
-      CcsSendDelayedReply(shrinkExpandreplyToken, 0, 0); //reply to CCS client
+      //CcsSendDelayedReply(shrinkExpandreplyToken, 0, 0); //reply to CCS client
       // wait for this message to receive, hack
       // TODO: figure out why this is important
       usleep(500);
@@ -2146,14 +2155,16 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
 #if CMK_SHRINK_EXPAND
   if (Cmi_isOldProcess == 1) {
     Lrts_myNode = Cmi_mynewpe;
-    Cmi_myoldpe = Cmi_oldpe;
+    Cmi_charmrun_assigned_pe = Lrts_myNode;
+    //if (Cmi_isOldProcess && Lrts_myNode == 3)
+    //  exit(1);
     Lrts_numNodes = Cmi_newnumnodes;
   }
 #endif
 
     /* NOTE: can not acutally call timer before timerInit ! GZ */
 #if CMK_SHRINK_EXPAND
-  MACHSTATE3(2,"After reorg  %d %d %d \n", Cmi_oldpe, Lrts_myNode, Lrts_numNodes);
+  CmiPrintf("After reorg  %d %d %d \n", Cmi_oldpe, Lrts_myNode, Lrts_numNodes);
 #endif
   MACHSTATE2(5,"Init: (netpoll=%d), (idlepoll=%d)",Cmi_netpoll,Cmi_idlepoll);
 
diff --git a/src/arch/ucx/charmrun b/src/arch/ucx/charmrun
index 80a1fc23ee..fde570824f 100755
--- a/src/arch/ucx/charmrun
+++ b/src/arch/ucx/charmrun
@@ -70,6 +70,10 @@ do
 		args+=("$1" "$2")
 		shift
 		;;
+	++nodelist)
+		machinefile="$2"
+		shift
+		;;
 	++quiet)
 		QUIET=1
 		;;
@@ -114,6 +118,15 @@ done
 
 args+=("${charm_args[@]}")
 
+# Map ++nodelist onto launcher host options. mpirun keeps honoring the legacy
+# MPI_MACHINEFILE variable; srun treats --nodelist as a file only if it has a '/'.
+mpirun_machinefile_opt=()
+srun_nodelist_opt=()
+mpirun_hostfile="${machinefile:-$MPI_MACHINEFILE}"
+[[ -n "$mpirun_hostfile" ]] && mpirun_machinefile_opt=(-machinefile "$mpirun_hostfile")
+[[ -z "$machinefile" || "$machinefile" == */* ]] || machinefile="./$machinefile"
+[[ -n "$machinefile" ]] && srun_nodelist_opt=(--nodelist="$machinefile")
+
 if [[ "$DEBUG" = '1' || "$DEBUG_NO_PAUSE" = '1' ]]
 then
   if [[ -z "$DEBUGGER" ]]
@@ -167,7 +180,7 @@ then
     else
  	#someday this should be pmix, but our pmix launcher needs some work
 	# on machines like NCSA Delta this is the most robust solution known to me
-	runCmd srun --mpi=pmi2 -n "$nodes" "${args[@]}"
+	runCmd srun --mpi=pmi2 -n "$nodes" --exact "${args[@]}"
     fi	
 elif [[ -n "$PBS_NODEFILE" ]]
 then
@@ -386,16 +399,13 @@ then
   mpirun_cmd="$(command -v mpirun 2>/dev/null)"
   if [[ -n "$mpirun_cmd" ]]
   then
-    [[ -n "$MPI_MACHINEFILE" ]] && args=(-machinefile "$MPI_MACHINEFILE" "${args[@]}")
     setarch_cmd="$(command -v setarch 2>/dev/null)"
     if [[ -n "$setarch_cmd" && -x "$setarch_cmd" ]]
     then
-      # Disables randomization of the virtual address  space  (turns  on
-      #          ADDR_NO_RANDOMIZE).
       cur_arch="$(uname -m)"
-      runCmd "$setarch_cmd" "$cur_arch" -R mpirun -np "$nodes" "${args[@]}"
+      runCmd "$setarch_cmd" "$cur_arch" -R mpirun "${mpirun_machinefile_opt[@]}" -np "$nodes" "${args[@]}"
     else
-      runCmd mpirun -np "$nodes" "${args[@]}"
+      runCmd mpirun "${mpirun_machinefile_opt[@]}" -np "$nodes" "${args[@]}"
     fi
   else
     mpiexec_cmd="$(command -v mpiexec 2>/dev/null)"
@@ -412,7 +422,7 @@ then
   #mpirun is checked before srun to support the Bridges supercomputer at PSC
   #as srun has a known issue and fails to successfully launch the parallel job.
   #This is required to run the nightly ofi autobuild.
-  runCmd srun -n "$nodes" -c $(( ppn + 1 )) "${args[@]}"
+  runCmd srun "${srun_nodelist_opt[@]}" -n "$nodes" -c $(( ppn + 1 )) "${args[@]}"
 else
   echo "No job launcher found! (tried aprun, mpirun and srun)"
   exit 1
diff --git a/src/arch/ucx/machine.C b/src/arch/ucx/machine.C
index c8bc798a34..fbbe775a83 100644
--- a/src/arch/ucx/machine.C
+++ b/src/arch/ucx/machine.C
@@ -10,6 +10,9 @@
 #include <string>
 
 #include "converse.h"
+#include "conv-ccs.h"
+#include "ccs-server.h"
+#include "ckrescale.h"
 #include "cmirdmautils.h"
 #include "machine.h"
 #include "pcqueue.h"
@@ -30,6 +33,13 @@
 #include "runtime-pmix.C"
 #endif
 
+#if CMK_SHRINK_EXPAND
+CcsDelayedReply shrinkExpandreplyToken;
+extern int numProcessAfterRestart;
+extern char *_shrinkexpand_basedir;
+int mynewpe=0;
+#endif
+
 #define CmiSetMsgSize(msg, sz)    ((((CmiMsgHeaderBasic *)msg)->size) = (sz))
 
 #define UCX_MSG_PROBE_THRESH            32768
@@ -279,7 +289,7 @@ void LrtsInit(int *argc, char ***argv, int *numNodes, int *myNodeID)
     ucp_worker_params_t wParams;
     ucs_status_t status;
     int ret;
-
+    
     ret = runtime_init(myNodeID, numNodes);
     UCX_CHECK_PMI_RET(ret, "runtime_init");
 
@@ -768,6 +778,42 @@ void LrtsExit(int exitcode)
     }
 }
 
+// Releases the UCX state held in ucxCtx (receive requests, worker, context,
+// endpoint table) and then finalizes the process-management runtime.
+void LrtsCleanup()
+{
+    int ret;
+
+    UCX_LOG(4, "LrtsCleanup");
+
+    LrtsAdvanceCommunication(0);
+
+    // Free the message buffer of each request in rxReqs, then cancel and release it
+    for (int i = 0; i < ucxCtx.numRxReqs; ++i) {
+        UcxRequest *req = ucxCtx.rxReqs[i];
+        CmiFree(req->msgBuf);
+        ucp_request_cancel(ucxCtx.worker, req);
+        ucp_request_free(req);
+    }
+
+    ucp_worker_destroy(ucxCtx.worker);
+    ucp_cleanup(ucxCtx.context);
+
+    CmiFree(ucxCtx.eps);
+    CmiFree(ucxCtx.rxReqs);
+#if CMK_SMP
+    PCQueueDestroy(ucxCtx.txQueue);
+#endif
+
+    if(!CharmLibInterOperate || userDrivenMode) {
+        ret = runtime_barrier();
+        UCX_CHECK_PMI_RET(ret, "runtime_barrier");
+
+        ret = runtime_fini();
+        UCX_CHECK_PMI_RET(ret, "runtime_fini");
+    }
+}
+
 #if CMK_MACHINE_PROGRESS_DEFINED
 void CmiMachineProgressImpl()
 {
@@ -777,6 +823,45 @@ void CmiMachineProgressImpl()
 }
 #endif
 
+
+#if CMK_SHRINK_EXPAND
+// Shrink/expand teardown for the UCX layer: finishes Converse on this PE, has
+// PE 0 record numProcessAfterRestart when a shrink/expand exit was requested,
+// then leaves through ConverseExit.
+void ConverseCleanup(void)
+{
+  MACHSTATE(2,"ConverseCleanup {");
+
+  CmiBarrier();
+
+#if CMK_USE_SYSVSHM
+  CmiExitSysvshm();
+#elif CMK_USE_PXSHM
+  CmiExitPxshm();
+#endif
+  ConverseCommonExit();               /* should be called by every rank */
+  CmiNodeBarrier();        /* single node SMP, make sure every rank is done */
+
+  const bool writesRestartInfo = get_shrinkexpand_exit() && CmiMyPe() == 0;
+  if (writesRestartInfo) {
+    // The file is written and closed before this PE enters the final barrier
+    std::string path = std::string(_shrinkexpand_basedir) + "/numRestartProcs.txt";
+    FILE *fp = fopen(path.c_str(), "w");
+    if (fp == NULL) {
+      fprintf(stderr, "ConverseCleanup: cannot write %s\n", path.c_str());
+    } else {
+      fprintf(fp, "%d", numProcessAfterRestart);
+      fclose(fp);
+    }
+  }
+
+  CmiBarrier();
+  MACHSTATE(2,"} ConverseCleanup");
+  if (writesRestartInfo) ConverseExit(100); // NOTE(review): 100 presumably tells the launcher to relaunch - confirm
+  else ConverseExit();
+}
+#endif
+
 // In CMK_SMP, this is called by worker thread
 void LrtsPostNonLocal()
 {
@@ -840,7 +925,7 @@ void UcxRecvDeviceCompleted(void* request, ucs_status_t status,
   }
 }
 
-void LrtsSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag) {
+void LrtsSendDevice(int dest_pe, int src_pe, const void*& ptr, size_t size, uint64_t& tag) {
   // FIXME: Is this tag generation OK?
   tag = ((uint64_t)CpvAccess(tag_counter)++ << (UCX_TAG_PE_BITS + UCX_TAG_MSG_BITS)) | (CmiMyPe() << UCX_TAG_MSG_BITS) | UCX_MSG_TAG_DEVICE;
 #if CMK_SMP
diff --git a/src/arch/util/machine-broadcast.C b/src/arch/util/machine-broadcast.C
index bcca7da608..49473a44e0 100644
--- a/src/arch/util/machine-broadcast.C
+++ b/src/arch/util/machine-broadcast.C
@@ -295,10 +295,10 @@ void CmiSyncBroadcastFn1(int size, char *msg) {
     }
     #endif
 	
-    for ( i=mype+1; i<_Cmi_numpes; i++ )
+    for ( int i=mype+1; i<_Cmi_numpes; i++ )
         CmiSyncSendFn(i, size, msg) ;
 	
-    for ( i=0; i<mype; i++ )
+    for ( int i=0; i<mype; i++ )
         CmiSyncSendFn(i, size, msg) ;
 #endif
 }
diff --git a/src/arch/util/machine-common-core.C b/src/arch/util/machine-common-core.C
index b24117428e..5403fad78b 100644
--- a/src/arch/util/machine-common-core.C
+++ b/src/arch/util/machine-common-core.C
@@ -1214,6 +1214,8 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
     MSG_STATISTIC = CmiGetArgFlag(argv, "+msgstatistic");
 #endif
 
+    Cmi_argvcopy = CmiCopyArgs(argv);
+
   if (CmiGetArgFlagDesc(argv,"++quiet","Omit non-error runtime messages")) {
     quietModeRequested = quietMode = 1;
   }
@@ -1409,7 +1411,6 @@ if (  MSG_STATISTIC)
 
     _Cmi_numpes = _Cmi_numnodes * _Cmi_mynodesize;
     Cmi_nodestart = _Cmi_mynode * _Cmi_mynodesize;
-    Cmi_argvcopy = CmiCopyArgs(argv);
     Cmi_argv = argv;
     Cmi_startfn = fn;
     Cmi_usrsched = usched;
@@ -1588,7 +1589,6 @@ static void ConverseRunPE(int everReturn) {
     CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdCondFn)CmiNotifyStillIdle, NULL);
 #endif
 
-
     LrtsPostCommonInit(everReturn);
 
 #if CMK_SMP && CMK_LEVERAGE_COMMTHREAD
@@ -1948,6 +1948,7 @@ static char *CopyMsg(char *msg, int len) {
         CmiAbort("Error: out of memory in machine layer\n");
     }
 #endif
+    // FIXME shouldn't this be len+header size??
     memcpy(copy, msg, len);
     return copy;
 }
diff --git a/src/arch/util/machine-rdma.h b/src/arch/util/machine-rdma.h
index 3b17df5f47..50cab5817c 100644
--- a/src/arch/util/machine-rdma.h
+++ b/src/arch/util/machine-rdma.h
@@ -1,6 +1,11 @@
 #ifndef _MACHINE_RDMA_H_
 #define _MACHINE_RDMA_H_
 
+/* Enable GPU-aware communication by default, but keep a value already set by the build configuration. */
+#ifndef CMK_GPU_COMM
+#define CMK_GPU_COMM 1
+#endif
+
 /* Support for Nocopy Direct API */
 void LrtsSetRdmaBufferInfo(void *info, const void *ptr, int size, unsigned short int mode);
 void LrtsIssueRget(NcpyOperationInfo *ncpyOpInfo);
@@ -15,15 +20,15 @@ void LrtsInvokeRemoteDeregAckHandler(int pe, NcpyOperationInfo *ncpyOpInfo);
 
 void CmiInvokeNcpyAck(void *ack);
 
-#if CMK_CUDA && CMK_GPU_COMM
+#if (CMK_CUDA || CMK_HIP) && CMK_GPU_COMM
 // Function pointer to acknowledgement handler
 typedef void (*RdmaAckHandlerFn)(void *token);
 
-void LrtsSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag);
+void LrtsSendDevice(int dest_rank, int src_rank, const void*& ptr, size_t size, uint64_t& tag);
 void LrtsRecvDevice(DeviceRdmaOp* op, DeviceRecvType type);
 
 void CmiInvokeRecvHandler(void* data);
-#endif // CMK_CUDA && CMK_GPU_COMM
+#endif // (CMK_CUDA || CMK_HIP) && CMK_GPU_COMM
 
 int CmiGetRdmaCommonInfoSize();
 #endif
diff --git a/src/arch/util/proc_management/runtime-pmix.C b/src/arch/util/proc_management/runtime-pmix.C
index 1f980c5b47..ded13b5ff9 100644
--- a/src/arch/util/proc_management/runtime-pmix.C
+++ b/src/arch/util/proc_management/runtime-pmix.C
@@ -84,12 +84,6 @@ int runtime_kvs_put(const char *k, const void *v, int vlen)
         return -2;
     }
 
-    if (PMIX_SUCCESS != (ret = PMIx_Commit())) {
-        fprintf(stderr, "Client ns %s rank %d: PMIx_Commit failed: %d\n",
-                myproc.nspace, myproc.rank, ret);
-        return -3;
-    }
-
     return 0;
 }
 
@@ -123,7 +117,19 @@ int runtime_barrier()
 {
     int ret;
 
-    if (PMIX_SUCCESS != (ret = PMIx_Fence(NULL, 0, NULL, 0))) {
+    if (PMIX_SUCCESS != (ret = PMIx_Commit())) {
+        fprintf(stderr, "Client ns %s rank %d: PMIx_Commit failed: %d\n",
+                myproc.nspace, myproc.rank, ret);
+        return -3;
+    }
+    
+    pmix_info_t info;
+    memset(&info, 0, sizeof(info));
+    strncpy(info.key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
+    info.value.type = PMIX_BOOL;
+    info.value.data.flag = 1;
+
+    if (PMIX_SUCCESS != (ret = PMIx_Fence(NULL, 0, &info, 1))) {
         fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n",
                 myproc.nspace, myproc.rank, ret);
         return -1;
diff --git a/src/ck-core/CMakeLists.txt b/src/ck-core/CMakeLists.txt
index fa579d55ed..5cf5261dc3 100644
--- a/src/ck-core/CMakeLists.txt
+++ b/src/ck-core/CMakeLists.txt
@@ -60,7 +60,7 @@ set(ldb-h-sources ../ck-ldb/BaseLB.h ../ck-ldb/CentralLBMsg.h
     ../ck-ldb/LBComm.h
     ../ck-ldb/LBDatabase.h ../ck-ldb/LBManager.h ../ck-ldb/LBMachineUtil.h ../ck-ldb/LBOM.h
     ../ck-ldb/LBObj.h ../ck-ldb/LBSimulation.h
-    ../ck-ldb/MetaBalancer.h ../ck-ldb/MetisLB.h
+    ../ck-ldb/MetaBalancer.h ../ck-ldb/MetisLB.h ../ck-ldb/GreedyCentralLB.h
     ../ck-ldb/RandomForestModel.h
     ../ck-ldb/RecBipartLB.h
     ../ck-ldb/Refiner.h
@@ -83,8 +83,8 @@ add_dependencies(ckmain ck)
 # CommonLBs used to be a subset of EveryLB, but they were unified as
 # part of the LB refactor. However, maintain both targets for
 # backwards compatibility and for possible future divergence.
-add_library(moduleCommonLBs ${ldb-cxx-sources} ${ldb-h-sources} ../ck-ldb/MetisLB.C ../ck-ldb/MetisLB.h)
-add_library(moduleEveryLB ${ldb-cxx-sources} ${ldb-h-sources} ../ck-ldb/MetisLB.C ../ck-ldb/MetisLB.h)
+add_library(moduleCommonLBs ${ldb-cxx-sources} ${ldb-h-sources} ../ck-ldb/MetisLB.C ../ck-ldb/MetisLB.h ../ck-ldb/GreedyCentralLB.C ../ck-ldb/GreedyCentralLB.h ../ck-ldb/GreedyRefineCentralLB.C ../ck-ldb/GreedyRefineCentralLB.h)
+add_library(moduleEveryLB ${ldb-cxx-sources} ${ldb-h-sources} ../ck-ldb/MetisLB.C ../ck-ldb/MetisLB.h ../ck-ldb/GreedyCentralLB.C ../ck-ldb/GreedyCentralLB.h ../ck-ldb/GreedyRefineCentralLB.C ../ck-ldb/GreedyRefineCentralLB.h)
 configure_file(../ck-ldb/libmoduleCommonLBs.dep ${CMAKE_BINARY_DIR}/lib/ COPYONLY)
 configure_file(../ck-ldb/libmoduleEveryLB.dep ${CMAKE_BINARY_DIR}/lib/ COPYONLY)
 add_dependencies(moduleCommonLBs ck ckmetis)
@@ -108,6 +108,10 @@ add_library(moduleTreeLB ../ck-ldb/TreeLB.C)
 add_dependencies(moduleTreeLB ck)
 add_library(moduleRecBipartLB ../ck-ldb/RecBipartLB.C ../ck-ldb/RecBipartLB.h)
 add_dependencies(moduleRecBipartLB ck)
+add_library(moduleGreedyCentralLB ../ck-ldb/GreedyCentralLB.C ../ck-ldb/GreedyCentralLB.h)
+add_dependencies(moduleGreedyCentralLB ck)
+add_library(moduleGreedyRefineCentralLB ../ck-ldb/GreedyRefineCentralLB.C ../ck-ldb/GreedyRefineCentralLB.h)
+add_dependencies(moduleGreedyRefineCentralLB ck)
 add_library(moduleScotchLB EXCLUDE_FROM_ALL ../ck-ldb/ScotchLB.C ../ck-ldb/ScotchLB.h)
 configure_file(../ck-ldb/libmoduleScotchLB.dep ${CMAKE_BINARY_DIR}/lib/ COPYONLY)
 add_dependencies(moduleScotchLB ck)
@@ -133,6 +137,10 @@ foreach(filename ${ck-h-sources} ${ldb-h-sources})
 endforeach()
 
 target_include_directories(ck PRIVATE . ../ck-ldb ../ck-perf ../ck-cp ../util/topomanager ../conv-perf)
+if(BUILD_CUDA AND CUPTI_LIBRARY)
+  target_include_directories(ck PRIVATE "${CUDA_DIR}/extras/CUPTI/include")
+  target_link_libraries(ck ${CUPTI_LIBRARY})
+endif()
 target_include_directories(ckmain PRIVATE . ../ck-ldb ../ck-perf ../ck-cp ../util/topomanager)
 
 # Fortran interface
diff --git a/src/ck-core/ck.C b/src/ck-core/ck.C
index 0ad1f22db2..f5b246fa25 100644
--- a/src/ck-core/ck.C
+++ b/src/ck-core/ck.C
@@ -175,6 +175,7 @@ static void CkChareThreadListener_suspend(CkThreadListener *l) {
 }
 
 static void CkChareThreadListener_resume(CkThreadListener *l) {
+  // printf("[PE %d] CkChareThreadListener_resume: obj=%p\n", CkMyPe(), ((CkChareThreadListener*)l)->obj);
   CkCallstackPush(((CkChareThreadListener *)l)->obj);
 }
 
@@ -470,7 +471,7 @@ void CkSectionID::pup(PUP::er &p) {
 
 /**** Tiny random API routines */
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 void CUDACallbackManager(void *fn, void *msg) {
   if (fn) {
     ((CkCallback*)fn)->send(msg);
@@ -574,6 +575,7 @@ int CkGetArgc(void) {
 }
 
 Chare *CkActiveObj(void) {
+  // printf("[PE %d] getting active: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size());
   auto &objs = *(&CkpvAccess(runningObjs));
   if (objs.empty()) {
     return nullptr;
@@ -584,10 +586,12 @@ Chare *CkActiveObj(void) {
 
 inline void _pushObj(Chare *obj) {
   CkpvAccess(runningObjs).emplace_back(obj);
+  // printf("[PE %d] pushObj: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size());
 }
 
 inline Chare *_popObj(void) {
   auto &objs = *(&CkpvAccess(runningObjs));
+  // printf("[PE %d] popobj: stack size now %zu\n", CkMyPe(), CkpvAccess(runningObjs).size());
   if (objs.empty()) {
     return nullptr;
   } else {
@@ -620,10 +624,16 @@ void CkCallstackPush(Chare *obj) {
 
 // removes all instances of ( obj ) from the stack
 void CkCallstackUnwind(Chare *obj) {
+  // printf("[%d] removing all instances of obj %p\n",CkMyPe(), obj);
+
   CkAssertMsg(obj != nullptr, "expected a valid object!");
   auto &objs = *(&CkpvAccess(runningObjs));
   auto start = std::begin(objs);
   auto end = std::end(objs);
+  //   for(auto it=start;it!=end;it++)
+  // {
+  //   printf("objects still in the stack are %p\n", *it);
+  // }
   // ensures that all copies of the object are null'd
   while (end != (start = std::find(start, end, obj))) {
     *start = nullptr;
@@ -2639,14 +2649,14 @@ void CkArrayExtSend_multi(int aid, int *idx, int ndims, int epIdx, int num_bufs,
 
 
 // HAPI support
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 #include "hapi.h"
 #endif
 
 void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid) 
 {
-  #if CMK_CUDA
-  cudaStream_t stream_ptr = (cudaStream_t)stream;
+  #if CMK_CUDA || CMK_HIP
+  hapiStream_t stream_ptr = (hapiStream_t)stream;
   CkCallback callback(cb, (void *) fid);
   
   hapiAddCallback(stream_ptr, callback, NULL);
diff --git a/src/ck-core/ckcallback.C b/src/ck-core/ckcallback.C
index fc5aea7758..337a9255c1 100644
--- a/src/ck-core/ckcallback.C
+++ b/src/ck-core/ckcallback.C
@@ -401,7 +401,7 @@ void CkCallback::send(void *msg,int opts) const
 		break;
 	case sendArray: //Send message to an array element
 		if (!msg) msg=CkAllocSysMsg();
-                if (d.array.hasRefnum) CkSetRefNum(msg, d.array.refnum);
+    if (d.array.hasRefnum) CkSetRefNum(msg, d.array.refnum);
 		CkSetMsgArrayIfNotThere(msg, policy);
 		CkSendMsgArray(d.array.ep, msg, d.array.id, d.array.idx.asChild(), opts);
 		break;
diff --git a/src/ck-core/ckcheckpoint.C b/src/ck-core/ckcheckpoint.C
index 06f2b9cbdf..ca9c1f8de2 100644
--- a/src/ck-core/ckcheckpoint.C
+++ b/src/ck-core/ckcheckpoint.C
@@ -16,8 +16,11 @@ More documentation goes here...
 #include <sstream>
 using std::ostringstream;
 #include <errno.h>
+#include <fstream>
+#include <cstring>
 #include "charm++.h"
 #include "ck.h"
+#include "ckrescale.h"
 #include "ckcheckpoint.h"
 #include "CkCheckpoint.decl.h"
 #include <sys/stat.h>
@@ -36,23 +39,6 @@ void noopit(const char*, ...)
 CkGroupID _sysChkptWriteMgr;
 CkGroupID _sysChkptMgr;
 
-struct GroupInfo
-{
-  CkGroupID gID;
-  int MigCtor;
-  std::string name;
-  bool present;
-
-  void pup(PUP::er& p)
-  {
-    p | gID;
-    p | MigCtor;
-    p | name;
-    p | present;
-  }
-};
-
-bool _inrestart = false;
 bool _restarted = false;
 int _oldNumPes = 0;
 bool _chareRestored = false;
@@ -60,10 +46,10 @@ double chkptStartTimer = 0;
 #if CMK_SHRINK_EXPAND
 int originalnumGroups = -1;
 extern int Cmi_isOldProcess;
-extern int Cmi_myoldpe;
 extern char *_shrinkexpand_basedir;
 #endif
 
+
 // Required for broadcasting RO Data after recovering from failure
 #if CMK_SMP
 extern std::atomic<UInt> numZerocopyROops;
@@ -99,14 +85,15 @@ private:
 public:
         ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
         void addLocation(CkLocation &loc) {
-                CkArrayIndex idx=loc.getIndex();
-		CkGroupID gID = locMgr->ckGetGroupID();
-                CmiUInt8 id = loc.getID();
-		p|gID;	    // store loc mgr's GID as well for easier restore
-                p|idx;
-                p|id;
-	        p|loc;
-		//CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
+          CkArrayIndex idx=loc.getIndex();
+          //CkPrintf("[%d] Packing index dim = %i, %s\n", CkMyPe(), idx.dimension, idx2str(idx));
+          CkGroupID gID = locMgr->ckGetGroupID();
+          CmiUInt8 id = loc.getID();
+          p|gID;	    // store loc mgr's GID as well for easier restore
+          p|idx;
+          p|id;
+          p|loc;
+		      //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
         }
 };
 
@@ -148,7 +135,7 @@ static void bdcastROGroupData(void){
 	CkPupROData(ps);
 	int ROSize = ps.size();
 
-	CkPupGroupData(ps1);
+	//CkPupGroupData(ps1);
 	int GroupSize = ps1.size();
 
 	char *msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + 2*sizeof(int) + ps.size() + ps1.size());
@@ -164,7 +151,7 @@ static void bdcastROGroupData(void){
 	PUP::toMem pp((char *)payloadOffset, PUP::er::IS_CHECKPOINT);
 	CkPupROData(pp);
 
-	CkPupGroupData(pp);
+	//CkPupGroupData(pp);
 
 	CmiSetHandler(msg, _ROGroupRestartHandlerIdx);
 	CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes + 2*sizeof(int) + pp.size(), msg);
@@ -258,6 +245,36 @@ public:
       CProxy_CkCheckpointMgr(_sysChkptMgr)[index].Checkpoint(dirname, cb, requestStatus);
   }
 
+  // Rescale (shrink/expand) counterpart of Checkpoint(): raises the shrink/expand
+  // exit flag, keeps a copy of the PE availability map, then starts the writers.
+  void RescaleCheckpoint(const char* dirname, CkCallback cb, std::vector<char> avail,
+    bool requestStatus = false, int writersPerNode = 0)
+  {
+    // If currently checkpointing, drop new requests
+    if (inProgress) return;
+    inProgress = true;
+    numComplete = 0;
+    set_shrinkexpand_exit(true); // flag that a shrink/expand is in progress
+
+    if (writersPerNode > 0) numWriters = std::min(writersPerNode, nodeSize);
+
+    // Save params for future invocations
+    this->dirname = dirname;
+    this->cb = cb;
+    this->requestStatus = requestStatus;
+#if CMK_SHRINK_EXPAND
+    if (CkMyPe() != 0)
+    { // zero-filled and size-clamped: a short avail vector is never read past its end (missing PEs read as 0)
+      se_avail_vector = (char*) calloc(CkNumPes(), sizeof(char));
+      if (se_avail_vector == NULL) CkAbort("RescaleCheckpoint: out of memory\n");
+      if (!avail.empty()) memcpy(se_avail_vector, avail.data(), std::min<size_t>(avail.size(), CkNumPes()));
+    }
+#endif
+    // Kick off the first numWriters PEs to start checkpointing
+    for (index = firstPE; index < firstPE + numWriters; index++)
+      CProxy_CkCheckpointMgr(_sysChkptMgr)[index].Checkpoint(dirname, cb, requestStatus);
+  }
+
   void FinishedCheckpoint()
   {
     numComplete++;
@@ -295,109 +312,135 @@ public:
 
 // broadcast
 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback cb, bool _requestStatus){
+#if CMK_SHRINK_EXPAND
+  // NOTE(review): assumes a null se_avail_vector means no rescale is pending, so every PE writes - confirm
+  std::vector<char> avail = se_avail_vector ? std::vector<char>(se_avail_vector, se_avail_vector + CkNumPes())
+                                            : std::vector<char>(CkNumPes(), 1);
+#endif
+  int chckPtId = CmiPhysicalRank(CmiMyPe()); // NOTE(review): if this is a per-host rank, PEs on different hosts share file indices - confirm
 	chkptStartTimer = CmiWallTimer();
-	requestStatus = _requestStatus;
-	// make dir on all PEs in case it is a local directory
-	CmiMkdir(dirname);
-
-	// Create partition directories (if applicable)
-	ostringstream dirPath;
-	dirPath << dirname;
-	if (CmiNumPartitions() > 1) {
-		addPartitionDirectory(dirPath);
-		CmiMkdir(dirPath.str().c_str());
-	}
-
-	// Due to file system issues we have observed, divide checkpoints
-	// into subdirectories to avoid having too many files in a single directory.
-	// Nodegroups should be checked separately since they could go into
-	// different subdirectory.
-
-	// Save current path for later use with nodegroups
-	ostringstream dirPathNode;
-	dirPathNode << dirPath.str();
-
-	// Create subdirectories
-	int mySubDir = CkMyPe() / SUBDIR_SIZE;
-	dirPath << "/sub" << mySubDir;
-	CmiMkdir(dirPath.str().c_str());
-
-	// Create Nodegroup subdirectory if needed
-	if (CkMyRank() == 0) {
-		int mySubDirNode = CkMyNode() / SUBDIR_SIZE;
-		if (mySubDirNode != mySubDir) {
-			dirPathNode << "/sub" << mySubDirNode;
-			CmiMkdir(dirPathNode.str().c_str());
-		}
-	}
-
-	bool success = true;
-	if (CkMyPe() == 0) {
+  
 #if CMK_SHRINK_EXPAND
-    if (pending_realloc_state == REALLOC_IN_PROGRESS) {
-      // After restarting from this AtSync checkpoint, resume execution along the
-      // normal path (i.e. whatever the user defined as ResumeFromSync.)
-      CkCallback resumeFromSyncCB(CkIndex_LBManager::ResumeClients(), _lbmgr);
-      success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus);
-    } else
+  if (avail[CkMyPe()])
 #endif
-    {
-      success &= checkpointOne(dirname, cb, requestStatus);
+  {
+    requestStatus = _requestStatus;
+    // make dir on all PEs in case it is a local directory
+    CmiMkdir(dirname);
+
+    // Create partition directories (if applicable)
+    ostringstream dirPath;
+    dirPath << dirname;
+    if (CmiNumPartitions() > 1) {
+      addPartitionDirectory(dirPath);
+      CmiMkdir(dirPath.str().c_str());
     }
-  }
 
-#ifndef CMK_CHARE_USE_PTR
-  // only create chare checkpoint file if this PE actually has data
-  if (CkpvAccess(chare_objs).size() > 0 || CkpvAccess(vidblocks).size() > 0)
-  {
-    // save plain singleton chares into Chares.dat
-    FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", CkMyPe());
-    PUP::toDisk pChares(fChares, PUP::er::IS_CHECKPOINT);
-    CkPupChareData(pChares);
-    if (pChares.checkError()) success = false;
-    if (CmiFclose(fChares) != 0) success = false;
-  }
-#endif
+    // Due to file system issues we have observed, divide checkpoints
+    // into subdirectories to avoid having too many files in a single directory.
+    // Nodegroups should be checked separately since they could go into
+    // different subdirectory.
+
+    // Save current path for later use with nodegroups
+    ostringstream dirPathNode;
+    dirPathNode << dirPath.str();
+
+    // Create subdirectories
+    int mySubDir = chckPtId / SUBDIR_SIZE;
+    dirPath << "/sub" << mySubDir;
+    CmiMkdir(dirPath.str().c_str());
+
+    // Create Nodegroup subdirectory if needed
+    if (CkMyRank() == 0) {
+      int mySubDirNode = CkMyNode() / SUBDIR_SIZE;
+      if (mySubDirNode != mySubDir) {
+        dirPathNode << "/sub" << mySubDirNode;
+        CmiMkdir(dirPathNode.str().c_str());
+      }
+    }
 
-  // save groups into Groups.dat
-  // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed),
-  // groups(PUP'ed)
-  FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", CkMyPe());
-  PUP::toDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT);
-  CkPupGroupData(pGroups);
-  if (pGroups.checkError()) success = false;
-  if (CmiFclose(fGroups) != 0) success = false;
-
-  // save nodegroups into NodeGroups.dat
-  // content of the file: numNodeGroups, GroupInfo[numNodeGroups],
-  // _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
-  if (CkMyRank() == 0)
-  {
-    FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", CkMyNode());
-    PUP::toDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT);
-    CkPupNodeGroupData(pNodeGroups);
-    if (pNodeGroups.checkError()) success = false;
-    if (CmiFclose(fNodeGroups) != 0) success = false;
-  }
+    bool success = true;
+    if (CkMyPe() == 0) {
+      
+  #if CMK_SHRINK_EXPAND
+      if (pending_realloc_state == SHRINK_IN_PROGRESS) {
+        CkPrintf("Shrink in progress on PE%i\n", CkMyPe());
+        // After restarting from this AtSync checkpoint, resume execution along the
+        // normal path (i.e. whatever the user defined as ResumeFromSync.)
+        CkCallback resumeFromSyncCB(CkIndex_LBManager::ResumeClients(), _lbmgr);
+        success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus);
+      } else if (pending_realloc_state == EXPAND_IN_PROGRESS) {
+        CkPrintf("Expand in progress on PE%i\n", CkMyPe());
+        CkCallback resumeFromSyncCB(CkIndex_LBManager::StartLB(), CProxy_LBManager(_lbmgr)[0]);
+        success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus);
+      } else
+  #endif
+      {
+        success &= checkpointOne(dirname, cb, requestStatus);
+      }
+    }
+    
+  #if CMK_SHRINK_EXPAND
+    pending_realloc_state = NO_REALLOC;
+  #endif
+
+  #ifndef CMK_CHARE_USE_PTR
+    // only create chare checkpoint file if this PE actually has data
+    if (CkpvAccess(chare_objs).size() > 0 || CkpvAccess(vidblocks).size() > 0)
+    {
+      // save plain singleton chares into Chares.dat
+      FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", chckPtId);
+      PUP::toDisk pChares(fChares, PUP::er::IS_CHECKPOINT);
+      CkPupChareData(pChares);
+      if (pChares.checkError()) success = false;
+      if (CmiFclose(fChares) != 0) success = false;
+    }
+  #endif
 
-  // DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
-  FILE* datFile = openCheckpointFile(dirname, "arr", "wb", CkMyPe());
-  PUP::toDisk p(datFile, PUP::er::IS_CHECKPOINT);
-  CkPupArrayElementsData(p);
-  if (p.checkError()) success = false;
-  if (CmiFclose(datFile) != 0) success = false;
-
-#if ! CMK_DISABLE_SYNC
-#if CMK_HAS_SYNC_FUNC
-        sync();
-#elif CMK_HAS_SYNC
-	system("sync");
-#endif
-#endif
-	chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE;
-	restartCB = cb;
-	DEBCHK("[%d]restartCB installed\n",CkMyPe());
+    // save groups into Groups.dat
+    // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed),
+    // groups(PUP'ed)
+    FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", chckPtId);
+    PUP::toDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT);
+    CkPupGroupData(pGroups);
+    if (pGroups.checkError()) success = false;
+    if (CmiFclose(fGroups) != 0) success = false;
 
+    // save nodegroups into NodeGroups.dat
+    // content of the file: numNodeGroups, GroupInfo[numNodeGroups],
+    // _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
+    if (CkMyRank() == 0)
+    {
+      FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", 0);
+      PUP::toDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT);
+      CkPupNodeGroupData(pNodeGroups);
+      if (pNodeGroups.checkError()) success = false;
+      if (CmiFclose(fNodeGroups) != 0) success = false;
+    }
+    //std::vector<char> avail_vector;
+    //get_avail_vector(avail_vector);
+    //if (pending_realloc_state == REALLOC_IN_PROGRESS && static_cast<bool>(avail_vector[CkMyPe()]))
+    //{
+      //printf("[%d] Writing array checkpoint\n", CkMyPe());
+      
+      FILE* datFile = openCheckpointFile(dirname, "arr", "wb", chckPtId);
+      PUP::toDisk p(datFile, PUP::er::IS_CHECKPOINT);
+      CkPupArrayElementsData(p);
+      if (p.checkError()) success = false;
+      if (CmiFclose(datFile) != 0) success = false;
+    //}
+
+  #if ! CMK_DISABLE_SYNC
+  #if CMK_HAS_SYNC_FUNC
+          sync();
+  #elif CMK_HAS_SYNC
+    system("sync");
+  #endif
+  #endif
+    chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE;
+    restartCB = cb;
+    DEBCHK("[%d]restartCB installed\n",CkMyPe());
+  }
 	// Use barrier instead of contribute here:
 	// barrier is stateless and multiple calls to it do not overlap.
 	barrier(CkCallback(CkReductionTarget(CkCheckpointMgr, SendRestartCB), 0, thisgroup));
@@ -441,7 +484,7 @@ void CkPupROData(PUP::er &p)
 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
 {
 	int nMains=_mainTable.size();
-	DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
+	//CkPrintf("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
 	for(int i=0;i<nMains;i++){  /* Create all mainchares */
 		const auto& chareIdx = _mainTable[i]->chareIdx;
 		ChareInfo *entry = _chareTable[chareIdx];
@@ -449,11 +492,14 @@ void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
 		if(entryMigCtor!=-1) {
 			Chare* obj;
 			if (p.isUnpacking()) {
-				DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, entry->size);
+				//CkPrintf("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, entry->size);
 				obj = CkAllocateChare(chareIdx);
+        //CkPrintf("Allocated mainchare %s\n", entry->name);
 				_mainTable[i]->setObj(obj);
+        //CkPrintf("Set mainchare %s\n", entry->name);
 				//void *m = CkAllocSysMsg();
 				CkInvokeEP(obj, entryMigCtor, args);
+        //CkPrintf("Invoked migration constructor for mainchare %s\n", entry->name);
 			}
 			else 
 			 	obj = (Chare *)_mainTable[i]->getObj();
@@ -548,6 +594,8 @@ void CkPupChareData(PUP::er &p)
 
 typedef void GroupCreationFn(CkGroupID groupID, int constructorIdx, envelope *env);
 
+
+
 static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *objectTable,
                               unsigned int &numObjects, int constructionMsgType,
                               GroupCreationFn creationFn
@@ -559,7 +607,7 @@ static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *obj
     numGroups = idTable->size();
   }
   p|numGroups;
-  DEBCHK("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
+  CkPrintf("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
 
   std::vector<GroupInfo> tmpInfo(numGroups);
   if (!p.isUnpacking()) {
@@ -618,21 +666,20 @@ static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *obj
   }
 }
 
-void CkPupGroupData(PUP::er &p
-  )
+void CkPupGroupData(PUP::er &p)
 {
-        CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable),
-                          CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup
-                         );
+  CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable),
+    CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup
+  );
 }
 
 void CkPupNodeGroupData(PUP::er &p
   )
 {
           CkPupPerPlaceData(p, &CksvAccess(_nodeGroupIDTable),
-                            CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups),
-                            NodeBocInitMsg, &CkCreateLocalNodeGroup
-                           );
+                           CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups),
+                           NodeBocInitMsg, &CkCreateLocalNodeGroup
+                          );
 }
 
 // handle chare array elements for this processor
@@ -640,7 +687,7 @@ void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
 {
  	int i;
 	// safe in both packing/unpacking at this stage
-        int numGroups = CkpvAccess(_groupIDTable)->size();
+  int numGroups = CkpvAccess(_groupIDTable)->size();
 
 	// number of array elements on this processor
 	int numElements = 0;
@@ -656,25 +703,25 @@ void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
 	if (!p.isUnpacking())
 	{
 	  // let CkLocMgr iterate over and store every array element
-          CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
-        }
+    CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
+  }
 	else {
 	  // loop and create all array elements ourselves
 	  //CkPrintf("total chare array cnts: %d\n", numElements);
 	  for (int i=0; i<numElements; i++) {
-		CkGroupID gID;
-		CkArrayIndex idx;
-                CmiUInt8 id;
-		p|gID;
-                p|idx;
-                p|id;
-		CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
-		if (notifyListeners){
-		  mgr->resume(idx, id, p, true);
-		}
-                else{
-		  mgr->restore(idx, id, p);
-		}
+      CkGroupID gID;
+      CkArrayIndex idx;
+      CmiUInt8 id;
+      p|gID;
+      p|idx;
+      p|id;
+      //CkPrintf("[%d] Unpacked dim = %i: %s\n", CkMyPe(), idx.dimension, idx2str(idx));
+      CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
+      if (notifyListeners){
+        mgr->resume(idx, id, p, true);
+      } else{
+        mgr->restore(idx, id, p);
+      }
 	  }
 	}
 	// finish up
@@ -713,7 +760,7 @@ void CkPupProcessorData(PUP::er &p)
     CkPupChareData(p);
 
     // save groups 
-    CkPupGroupData(p);
+    //CkPupGroupData(p);
 
     // save nodegroups
     if(CkMyRank()==0) {
@@ -812,109 +859,119 @@ void CkStartCheckpoint(const char* dirname, const CkCallback& cb, bool requestSt
       .Checkpoint(dirname, cb, requestStatus, writersPerNode);
 }
 
+// Starts a rescale (shrink/expand) checkpoint into dirname; a no-op unless CMK_SHRINK_EXPAND.
+void CkStartRescaleCheckpoint(const char* dirname, const CkCallback& cb,
+  std::vector<char> avail, bool requestStatus, int writersPerNode)
+{
+#if CMK_SHRINK_EXPAND
+  // Reject an unusable callback before any global state is modified.
+  if (cb.isInvalid())
+    CkAbort("callback after checkpoint is not set properly");
+  if (cb.containsPointer())
+    CkAbort("Cannot restart from a callback based on a pointer");
+  if (CkMyPe() != 0)
+  {
+    if (avail.size() < (size_t)CkNumPes())  // the memcpy below reads CkNumPes() entries
+      CkAbort("CkStartRescaleCheckpoint: avail has fewer than CkNumPes() entries");
+    se_avail_vector = (char*) malloc(CkNumPes() * sizeof(char));
+    memcpy(se_avail_vector, avail.data(), CkNumPes() * sizeof(char));
+    CkPrintf("[%d] se_avail_vector copied\n", CkMyPe());
+  }
+  CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
+  // hand over to checkpoint managers for per-processor checkpointing
+  CProxy_CkCheckpointWriteMgr(_sysChkptWriteMgr)
+      .RescaleCheckpoint(dirname, cb, avail, requestStatus, writersPerNode);
+#endif
+}
+
 /**
   * Restart: There's no such object as restart manager is created
   *          because a group cannot restore itself anyway.
   *          The mechanism exists as converse code and get invoked by
   *          broadcast message.
   **/
-
 CkCallback globalCb;
-void CkRestartMain(const char* dirname, CkArgMsg *args){
-	int i;
-	
-        if (CmiMyRank() == 0) {
-          _inrestart = true;
-          _restarted = true;
-          CkMemCheckPT::inRestarting = true;
-        }
+void CkRecvGroupROData(char* msg)
+{
+  char* origMsg = msg;
+  msg = msg + CmiMsgHeaderSizeBytes;
+  int dirSize = *reinterpret_cast<int*>(msg);
+  msg += sizeof(int);
+  std::string dirname(msg, dirSize);
+  msg += dirSize;
+  int ROsize = *reinterpret_cast<int*>(msg);
+  msg += sizeof(int);
+
+  //CkPrintf("dirname = %s, groupsize = %i\n", dirname.c_str(), groupSize);
+  PUP::fromMem bRO(msg, PUP::er::IS_CHECKPOINT);
 
-	// restore readonlys
-	FILE* fRO = openCheckpointFile(dirname, "RO", "rb", -1);
-	int _numPes = -1;
-	PUP::fromDisk pRO(fRO, PUP::er::IS_CHECKPOINT);
-	pRO|_numPes;
+  int _numPes = -1;
+  bRO|_numPes;
 	int _numNodes = -1;
-	pRO|_numNodes;
-	pRO|globalCb;
-	if (CmiMyRank() == 0) CkPupROData(pRO);
+	bRO|_numNodes;
+	bRO|globalCb;
+	/*if (CmiMyRank() == 0)*/ CkPupROData(bRO);
 	bool requestStatus = false;
-	pRO|requestStatus;
-	CmiFclose(fRO);
-	DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
-        _oldNumPes = _numPes;
-
-	CmiNodeBarrier();
-
-        // Restore mainchares on PE 0
-        if (CkMyPe() == 0)
-        {
-          FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb");
-          if (fMain)
-          {
-            PUP::fromDisk pMain(fMain, PUP::er::IS_CHECKPOINT);
-            CkPupMainChareData(pMain, args);
-            CmiFclose(fMain);
-            DEBCHK("[%d]CkRestartMain: mainchares restored\n", CkMyPe());
-          }
-        }
+	bRO|requestStatus;
+
+  CkPrintf("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
+
+  msg += ROsize;
+
+  if (CkMyPe() >= _numPes) {
+    PUP::fromMem bGroups(msg, PUP::er::IS_CHECKPOINT);
+    CkPupGroupData(bGroups);
+  }
 
 #ifndef CMK_CHARE_USE_PTR
-        // restore chares only when number of pes is the same
-        if (CkNumPes() == _numPes)
-        {
-          // A chare checkpoint file only exists when the PE actually contained singleton
-          // chares at checkpoint time, so check to see if the file exists before trying
-          // to restore
-          std::string filename = getCheckpointFileName(dirname, "Chares", CkMyPe());
-          FILE* fChares = CmiFopen(filename.c_str(), "rb");
-          if (fChares)
-          {
-            PUP::fromDisk pChares(fChares, PUP::er::IS_CHECKPOINT);
-            CkPupChareData(pChares);
-            CmiFclose(fChares);
-            _chareRestored = true;
-          }
-        }
+  // restore chares only when number of pes is the same
+  if (CkNumPes() == _numPes)
+  {
+    // A chare checkpoint file only exists when the PE actually contained singleton
+    // chares at checkpoint time, so check to see if the file exists before trying
+    // to restore
+    std::string filename = getCheckpointFileName(dirname.c_str(), "Chares", CkMyPe());
+    FILE* fChares = CmiFopen(filename.c_str(), "rb");
+    if (fChares)
+    {
+      PUP::fromDisk pChares(fChares, PUP::er::IS_CHECKPOINT);
+      CkPupChareData(pChares);
+      CmiFclose(fChares);
+      _chareRestored = true;
+    }
+  }
 #endif
-
-	// restore groups
-	// content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
-	// restore from PE0's copy if shrink/expand
-	FILE* fGroups = openCheckpointFile(dirname, "Groups", "rb",
-                                     (CkNumPes() == _numPes) ? CkMyPe() : 0);
-	PUP::fromDisk pGroups(fGroups, PUP::er::IS_CHECKPOINT);
-    CkPupGroupData(pGroups);
-	CmiFclose(fGroups);
-
-	// restore nodegroups
-	// content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
-	if(CkMyRank()==0){
-                FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "rb",
-                                                       (CkNumNodes() == _numNodes) ? CkMyNode() : 0);
-                PUP::fromDisk pNodeGroups(fNodeGroups, PUP::er::IS_CHECKPOINT);
-        CkPupNodeGroupData(pNodeGroups);
-		CmiFclose(fNodeGroups);
-	}
+  CmiFree(origMsg);
 
 	// for each location, restore arrays
 	//DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
-	DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
-	if(CkMyPe() < _numPes) 	// in normal range: restore, otherwise, do nothing
-          for (i=0; i<_numPes;i++) {
-            if (i%CkNumPes() == CkMyPe()) {
-              FILE *datFile = openCheckpointFile(dirname, "arr", "rb", i);
-	      PUP::fromDisk  p(datFile, PUP::er::IS_CHECKPOINT);
-	      CkPupArrayElementsData(p);
-	      CmiFclose(datFile);
-            }
-	  }
+	
+	if(CkMyPe() < _numPes) {	// in normal range: restore, otherwise, do nothing
+    int rank = CmiPhysicalRank(CmiMyPe());
+    CkPrintf("[%d]CkRestartMain: restoring array elements from physical rank %d\n", CkMyPe(), rank);
+
+    FILE* groupFile = openCheckpointFile(dirname.c_str(), "Groups", "rb", rank);
+    PUP::fromDisk bGroups(groupFile, PUP::er::IS_CHECKPOINT);
+    CkPupGroupData(bGroups);
+    CmiFclose(groupFile);
+
+    if(CmiMyRank()==0) {
+      FILE* nodeGroupFile = openCheckpointFile(dirname.c_str(), "NodeGroups", "rb", 0);
+      PUP::fromDisk bNodeGroups(nodeGroupFile, PUP::er::IS_CHECKPOINT);
+      CkPupNodeGroupData(bNodeGroups);
+      CmiFclose(nodeGroupFile);
+    }
+
+    FILE *datFile = openCheckpointFile(dirname.c_str(), "arr", "rb", rank);
+    PUP::fromDisk  p(datFile, PUP::er::IS_CHECKPOINT);
+    CkPupArrayElementsData(p);
+    CmiFclose(datFile);
+  }
 
-        _inrestart = false;
+  set_in_restart(false);
+
+  if (CmiMyRank()==0) _initDone();  // this rank will trigger other ranks
 
-   	if (CmiMyRank()==0) _initDone();  // this rank will trigger other ranks
-   	//_initDone();
-	CkMemCheckPT::inRestarting = false;
 	if(CkMyPe()==0) {
 		CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
 		if(requestStatus)
@@ -927,49 +984,117 @@ void CkRestartMain(const char* dirname, CkArgMsg *args){
 		  globalCb.send();
 		}
 	}
+  
+  if (CmiMyRank() == 0) CkMemCheckPT::inRestarting = false;
+
+  if (CmiMyPe() == 0) {
+    CkPrintf("Restore from disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
+  }
 }
 
+void CkRestartMain(const char* dirname, CkArgMsg *args){
 #if CMK_SHRINK_EXPAND
-// after resume and getting message
-void CkResumeRestartMain(char * msg) {
-  int i;
-  char filename[1024];
-  const char * dirname = "";
-  _inrestart = true;
-  _restarted = true;
-  CkMemCheckPT::inRestarting = true;
-  CmiPrintf("[%d]CkResumeRestartMain: Inside Resume Restart\n",CkMyPe());
-  CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
+  chkptStartTimer = CmiWallTimer();
+	int i;
+	
+  if (CmiMyRank() == 0) {
+    set_in_restart(true);
+    _restarted = true;
+    CkMemCheckPT::inRestarting = true;
+  }
 
-  int _numPes = -1;
-  if(CkMyPe()!=0) {
-    PUP::fromMem pRO((char *)(msg+CmiMsgHeaderSizeBytes+2*sizeof(int)), PUP::er::IS_CHECKPOINT);
+  // Restore mainchares on PE 0
+  if (CkMyPe() == 0)
+  {
+    FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb");
+    if (fMain)
+    {
+      PUP::fromDisk pMain(fMain, PUP::er::IS_CHECKPOINT);
+      CkPupMainChareData(pMain, args);
+      CmiFclose(fMain);
+      DEBCHK("[%d]CkRestartMain: mainchares restored\n", CkMyPe());
+    }
+  }
 
-    CkPupROData(pRO);
-    CmiPrintf("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
+  if (CkMyPe() == 0)
+  {
+    std::string dirnameStr(dirname);
+    int strLen = dirnameStr.size();
+
+    std::string ROFileName = getCheckpointFileName(dirname, "RO", -1);
+    std::ifstream ROFile(ROFileName, std::ios::binary | std::ios::ate);
+    std::streamsize ROSize = ROFile.tellg();
+    ROFile.seekg(0, std::ios::beg);
+    
+    // Check for and exclude EOF character if present
+    if (ROSize > 0) {
+      ROFile.seekg(-1, std::ios::end);
+      char lastChar;
+      ROFile.get(lastChar);
+      if (lastChar == EOF || lastChar == '\0') {
+        ROSize--;
+      }
+      ROFile.seekg(0, std::ios::beg);
+    }
 
-    CkPupGroupData(pRO);
-    CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
-  }
+    //CkPrintf("GroupMetadataSize = %lld\n", (long long)GroupMetadataSize);
 
-  CmiFree(msg);
-  CmiNodeBarrier();
-  if(Cmi_isOldProcess) {
-    /* CmiPrintf("[%d] For shrinkexpand newpe=%d, oldpe=%d \n",Cmi_myoldpe, CkMyPe(), Cmi_myoldpe); */
-    // non-shrink files would be empty since LB would take care
-    FILE *datFile = openCheckpointFile(dirname, "arr", "rb", Cmi_myoldpe);
-    PUP::fromDisk  p(datFile, PUP::er::IS_CHECKPOINT);
-    CkPupArrayElementsData(p);
-    CmiFclose(datFile);
+    std::string GroupFilename = getCheckpointFileName(dirname, "Groups", 0);
+    std::ifstream GroupFile(GroupFilename, std::ios::binary | std::ios::ate);
+    std::streamsize GroupSize = GroupFile.tellg();
+    GroupFile.seekg(0, std::ios::beg);
+
+    // Check for and exclude EOF character if present
+    if (GroupSize > 0) {
+      GroupFile.seekg(-1, std::ios::end);
+      char lastChar;
+      GroupFile.get(lastChar);
+      if (lastChar == EOF || lastChar == '\0') {
+        GroupSize--;
+      }
+      GroupFile.seekg(0, std::ios::beg);
+    }
+
+    char* msg = (char*) CmiAlloc(ROSize + GroupSize + 2 * sizeof(int) + strLen + CmiMsgHeaderSizeBytes);
+    char* buffer = msg + CmiMsgHeaderSizeBytes;
+    // Payload layout after the header: int dirLen | dirname bytes | int ROSize | RO bytes | Group bytes.
+    // ROSize is a std::streamsize, so narrow it to int before copying sizeof(int) bytes of it.
+    const int roBytes = static_cast<int>(ROSize);
+    std::memcpy(buffer, &strLen, sizeof(int));  buffer += sizeof(int);
+    std::memcpy(buffer, dirname, strLen);       buffer += strLen;
+    std::memcpy(buffer, &roBytes, sizeof(int)); buffer += sizeof(int);
+
+    ROFile.read(buffer, ROSize);
+    buffer += ROSize;
+
+    GroupFile.read(buffer, GroupSize);
+    buffer += GroupSize;
+
+    CmiSetHandler(msg, _shrinkExpandRestartHandlerIdx);
+
+    CmiSyncBroadcastAllAndFree(ROSize + GroupSize + 2 * sizeof(int) + strLen + CmiMsgHeaderSizeBytes, msg);
+
+    //CkPrintf("PE %i at barrier\n", CkMyPe());
+    //CmiBarrier();
   }
-  _initDone();
-  _inrestart = false;
-  CkMemCheckPT::inRestarting = false;
-  if(CkMyPe()==0) {
-    CmiPrintf("[%d]CkResumeRestartMain done. sending out callback.\n",CkMyPe());
-    CkPrintf("Restart from shared memory  finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
-    globalCb.send();
+
+   	//_initDone();
+#endif
+}
+
+#if CMK_SHRINK_EXPAND
+// NOTE - This function doesn't appear to be called anywhere
+// after resume and getting message
+void CkResumeRestartMain(char * msg) {
+}
+
+// Returns CkMyPe() minus the number of lower-numbered PEs whose avail[] entry is 0.
+int GetNewPeNumber(std::vector<char> avail){
+  const int myPe = CkMyPe();
+  int newPe = myPe;  // lowered by one for each qualifying entry below this PE
+  for (int pe = 0; pe < myPe; ++pe)
+    if (avail[pe] == 0) --newPe;
+  return newPe;
+}
 #endif
 
diff --git a/src/ck-core/ckcheckpoint.ci b/src/ck-core/ckcheckpoint.ci
index 7abf2d2c0f..30b03d197e 100644
--- a/src/ck-core/ckcheckpoint.ci
+++ b/src/ck-core/ckcheckpoint.ci
@@ -2,11 +2,13 @@ module CkCheckpoint {
   readonly CkGroupID _sysChkptWriteMgr;
   readonly CkGroupID _sysChkptMgr;
   extern module CkCheckpointStatus;
-  nodegroup [migratable] CkCheckpointWriteMgr {
+  group [migratable] CkCheckpointWriteMgr {
     entry CkCheckpointWriteMgr(void);
-    entry [exclusive] void Checkpoint(char dirname[strlen(dirname) + 1], CkCallback cb,
+    entry void Checkpoint(char dirname[strlen(dirname) + 1], CkCallback cb,
                           bool requestStatus, int writersPerNode);
-    entry [exclusive] void FinishedCheckpoint(void);
+    entry void RescaleCheckpoint(char dirname[strlen(dirname) + 1], CkCallback cb,
+                          std::vector<char> avail, bool requestStatus, int writersPerNode);
+    entry void FinishedCheckpoint(void);
   };
   group [migratable] CkCheckpointMgr {
 	entry CkCheckpointMgr(void);
diff --git a/src/ck-core/ckcheckpoint.h b/src/ck-core/ckcheckpoint.h
index 463c672865..c5cceb9c14 100644
--- a/src/ck-core/ckcheckpoint.h
+++ b/src/ck-core/ckcheckpoint.h
@@ -42,6 +42,29 @@ restarting of Charm++ programs. ...
     }	\
   }
 
+  #if CMK_SHRINK_EXPAND
+  extern char* se_avail_vector;
+  #endif
+
+//int   _shrinkExpandRestartHandlerIdx;
+// Group descriptor with four pup'ed fields; CkPupPerPlaceData builds a std::vector of these.
+// pup() serializes gID, MigCtor, name and present in that order. (Field meanings: TODO confirm.)
+struct GroupInfo
+{
+  CkGroupID gID;
+  int MigCtor;
+  std::string name;
+  bool present;
+
+  void pup(PUP::er& p)
+  {
+    p | gID;
+    p | MigCtor;
+    p | name;
+    p | present;
+  }
+};
+
 // utility functions to pup system global tables
 void CkPupROData(PUP::er &p);
 void CkPupMainChareData(PUP::er &p, CkArgMsg *args);
@@ -51,22 +74,36 @@ void CkPupNodeGroupData(PUP::er &p);
 void CkPupArrayElementsData(PUP::er &p, int notifyListeners=1);
 void CkPupProcessorData(PUP::er &p);
 void CkRemoveArrayElements();
+void CkRecvGroupROData(char* msg);
 //void CkTestArrayElements();
 
 // If writersPerNode <= 0 the number of writers is unchanged, if > 0, then set to
 // min(writersPerNode, CkMyNodeSize())
 void CkStartCheckpoint(const char* dirname, const CkCallback& cb,
                        bool requestStatus = false, int writersPerNode = 0);
+void CkStartRescaleCheckpoint(const char* dirname, const CkCallback& cb, 
+  std::vector<char> avail, bool requestStatus = false, int writersPerNode = 0);
 void CkRestartMain(const char* dirname, CkArgMsg* args);
+
 #if CMK_SHRINK_EXPAND
+int GetNewPeNumber(std::vector<char> avail);
 void CkResumeRestartMain(char *msg);
 #endif
+
 #if __FAULT__
 int  CkCountArrayElements();
 #endif
 
 #if CMK_SHRINK_EXPAND
-enum realloc_state : uint8_t { NO_REALLOC=0, REALLOC_MSG_RECEIVED=1, REALLOC_IN_PROGRESS=2 };
+enum realloc_state : uint8_t 
+{
+  NO_REALLOC=0, 
+  SHRINK_MSG_RECEIVED=1 << 0, 
+  EXPAND_MSG_RECEIVED=1 << 1,
+  SHRINK_IN_PROGRESS=1 << 2,
+  EXPAND_IN_PROGRESS=1 << 3
+};
+
 extern realloc_state pending_realloc_state;
 extern CkGroupID _lbmgr;
 #endif
diff --git a/src/ck-core/cklocation.C b/src/ck-core/cklocation.C
index 83ffe65615..2bb8efecb7 100644
--- a/src/ck-core/cklocation.C
+++ b/src/ck-core/cklocation.C
@@ -23,6 +23,15 @@
 #include <stdarg.h>
 #include <vector>
 
+#if CMK_CUDA || CMK_HIP
+
+#include "hapi.h"
+#include "gpumanager.h"
+
+CsvExtern(GPUManager, gpu_manager);
+
+#endif // CMK_CUDA || CMK_HIP
+
 #if CMK_LBDB_ON
 #  include "LBManager.h"
 #  include "MetaBalancer.h"
@@ -83,15 +92,22 @@ int _messageBufferingThreshold;
 #  if CMK_GLOBAL_LOCATION_UPDATE
 void UpdateLocation(MigrateInfo& migData)
 {
+  // CmiPrintf("calls update location\n");
   CkGroupID locMgrGid = ck::ObjID(migData.obj.id).getCollectionID();
-  if (locMgrGid.idx == 0)
-  {
-    return;
-  }
-
   CkLocMgr* localLocMgr = (CkLocMgr*)CkLocalBranch(locMgrGid);
-  // CkLocMgr only uses element IDs, so extract just that part from the ObjID
-  localLocMgr->updateLocation(ck::ObjID(migData.obj.id).getElementID(), migData.to_pe);
+  CkLocCache *cache = (CkLocCache *)CkLocalBranch(localLocMgr->getLocationCache());
+
+  CmiUInt8 elementID = ck::ObjID(migData.obj.id).getElementID();
+  CkArrayIndex idx = localLocMgr->lookupIdx(elementID);
+
+  CkLocEntry entry;
+  entry.id = elementID;
+  entry.pe = migData.to_pe;
+  entry.epoch = cache->getEpoch(elementID) + 1;
+
+  // CkPrintf("[%d] UpdateLocation: obj id=%llu from_pe=%d to_pe=%d epoch=%d\n",
+  //          CkMyPe(), entry.id, migData.from_pe, entry.pe, entry.epoch);
+  localLocMgr->updateLocation(idx, entry);
 }
 #  endif
 
@@ -1824,9 +1840,15 @@ void CkMigratable::UserSetLBLoad()
 
 #if CMK_LBDB_ON  // For load balancing:
 // user can call this helper function to set obj load (for model-based lb)
-void CkMigratable::setObjTime(double cputime) { myRec->setObjTime(cputime); }
+void CkMigratable::setObjTime(double cputime) { 
+  myRec->setObjTime(cputime); }
 double CkMigratable::getObjTime() { return myRec->getObjTime(); }
 
+void CkMigratable::setObjGPUTime(double gputime) {
+  myRec->setObjGPUTime(gputime);
+}
+double CkMigratable::getObjGPUTime() { return myRec->getObjGPUTime(); }
+
 #  if CMK_LB_USER_DATA
 /**
  * Use this method to set user specified data to the lbdatabase.
@@ -1936,12 +1958,21 @@ void CkMigratable::AtSync(int waitForMigration)
   if (usesAutoMeasure == false)
     UserSetLBLoad();
 
+  #if CMK_CUDA || CMK_HIP
+  PUP::sizer ps(PUP::er::IS_MIGRATION);
+  this->virtual_pup(ps);
+  // printf("[%d] gpu pup size %ld\n",CkMyPe(), ps.gpu_size() );
+  setGPUPupSize(ps.gpu_size());
+  #endif
   if (_lb_psizer_on || _lb_args.metaLbOn())
   {
     PUP::sizer ps(PUP::er::IS_MIGRATION);
     this->virtual_pup(ps);
     if (_lb_psizer_on)
+    {
       setPupSize(ps.size());
+    }
+    //TODO: check if this is correct after gpuPUP size
     if (_lb_args.metaLbOn())
       myRec->getMetaBalancer()->SetCharePupSize(ps.size());
   }
@@ -2040,6 +2071,9 @@ void CkMigratable::setMigratable(int migratable) { myRec->setMigratable(migratab
 
 void CkMigratable::setPupSize(size_t obj_pup_size) { myRec->setPupSize(obj_pup_size); }
 
+void CkMigratable::setGPUPupSize(size_t obj_gpu_pup_size) { myRec->setGPUPupSize(obj_gpu_pup_size); }
+
+
 void CkMigratable::CkAddThreadListeners(CthThread tid, void* msg)
 {
   Chare::CkAddThreadListeners(tid, msg);  // for trace
@@ -2049,6 +2083,8 @@ void CkMigratable::CkAddThreadListeners(CthThread tid, void* msg)
 #else
 void CkMigratable::setObjTime(double cputime) {}
 double CkMigratable::getObjTime() { return 0.0; }
+void CkMigratable::setObjGPUTime(double gputime) {}
+double CkMigratable::getObjGPUTime() { return 0.0; }
 
 #  if CMK_LB_USER_DATA
 void* CkMigratable::getObjUserData(int idx) { return NULL; }
@@ -2129,13 +2165,23 @@ void CkLocRec::stopTiming(int ignore_running)
   if (!ignore_running)
     running = false;
 }
-void CkLocRec::setObjTime(double cputime) { lbmgr->EstObjLoad(ldHandle, cputime); }
+void CkLocRec::setObjTime(double cputime) { 
+  lbmgr->EstObjLoad(ldHandle, cputime); }
 double CkLocRec::getObjTime()
 {
   LBRealType walltime, cputime;
   lbmgr->GetObjLoad(ldHandle, walltime, cputime);
   return walltime;
 }
+void CkLocRec::setObjGPUTime(double gputime) {
+  lbmgr->EstObjGPULoad(ldHandle, gputime);
+}
+double CkLocRec::getObjGPUTime()
+{
+  LBRealType gputime;
+  lbmgr->GetObjGPULoad(ldHandle, gputime);
+  return gputime;
+}
 #  if CMK_LB_USER_DATA
 void* CkLocRec::getObjUserData(int idx) { return lbmgr->GetDBObjUserData(ldHandle, idx); }
 #  endif
@@ -2273,6 +2319,11 @@ void CkLocRec::setPupSize(size_t obj_pup_size)
   lbmgr->setPupSize(ldHandle, obj_pup_size);
 }
 
+void CkLocRec::setGPUPupSize(size_t obj_gpu_pup_size)
+{
+  lbmgr->setGPUPupSize(ldHandle, obj_gpu_pup_size);
+}
+
 #endif
 
 // Call ckDestroy for each record, which deletes the record, and ~CkLocRec()
@@ -2363,8 +2414,10 @@ void CkLocCache::requestLocation(CmiUInt8 id, const int peToTell)
 
 void CkLocCache::updateLocation(const CkLocEntry& newEntry)
 {
+  // printf("[%d] updateLocation: id=%llu pe=%d epoch=%d\n", CmiMyPe(), newEntry.id, newEntry.pe, newEntry.epoch);
   CkAssert(newEntry.pe != -1);
   CkLocEntry& oldEntry = locMap[newEntry.id];
+  // printf("[%d] updateLocation: oldEntry.epoch=%d\n", CmiMyPe(), oldEntry.epoch);
   if (newEntry.epoch > oldEntry.epoch)
   {
     oldEntry = newEntry;
@@ -2376,6 +2429,8 @@ void CkLocCache::recordEmigration(CmiUInt8 id, int pe)
 {
   LocationMap::iterator itr = locMap.find(id);
 
+  // printf("[%d] recordEmigration: id=%llu itr->second.pe=%d pe=%d\n", CmiMyPe(), id, itr->second.pe, pe);
+
   CkAssert(itr != locMap.end());
   CkAssert(itr->second.pe == CkMyPe());
 
@@ -2947,6 +3002,44 @@ void CkLocMgr::migratableList(CkLocRec* rec, std::vector<CkMigratable*>& list)
   }
 }
 
+// Reports whether a transfer from srcPe to dstPe crosses physical nodes: false when the
+// two PEs share a logical node or a physical node, true otherwise.
+// NOTE(review): both range checks accept pe == CmiNumPes(); confirm that bound is intended.
+bool did_inter_node_gpudirect_rdma(int srcPe, int dstPe) {
+  CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes()));
+  CmiEnforce((dstPe >= 0) && (dstPe <= CmiNumPes()));
+
+  // Compare logical nodes first so the physical-node query is skipped when they match.
+  const bool sameLogicalNode = (CmiNodeOf(srcPe) == CmiNodeOf(dstPe));
+  if (sameLogicalNode) return false;
+  return !CmiPeOnSamePhysicalNode(srcPe, dstPe);
+}
+
+#if CMK_CUDA || CMK_HIP
+void CkLocMgr::sendGPUMsg(CmiUInt8 id)
+{
+  auto gpuData = sendGPUBuffers[id];
+  sendGPUBuffers.erase(id);
+  thisProxy[gpuData.toPe].immigrateGPU(id, gpuData.size, CkDeviceBuffer(gpuData.data, gpuData.size,
+    CkCallbackResumeThread()));
+
+  if(did_inter_node_gpudirect_rdma(CkMyPe(), gpuData.toPe)) {
+    GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+    if(csv_gpu_manager.use_shm) {
+      DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
+  #if CMK_SMP
+      CmiLock(dm->lock);
+  #endif
+    dm->free_comm_buffer((size_t)((char*)gpuData.data - (char*)dm->comm_buffer->base_ptr));
+  #if CMK_SMP
+      CmiUnlock(dm->lock);
+  #endif
+    }
+  }
+  //CkPrintf("PE %d sent GPU msg of size %zu for id %llu\n", CkMyPe(), gpuData.size, id);
+}
+#endif
+
 /// Migrate this local element away to another processor.
 void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
 {
@@ -2972,12 +3065,16 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
   callMethod(rec, &CkMigratable::ckAboutToMigrate);
 
   // First pass: find size of migration message
-  size_t bufSize;
-  {
-    PUP::sizer p(PUP::er::IS_MIGRATION);
-    pupElementsFor(p, rec, CkElementCreation_migrate);
-    bufSize = p.size();
-  }
+  size_t bufSize, gpuBufSize;
+  PUP::sizer p(PUP::er::IS_MIGRATION);
+  pupElementsFor(p, rec, CkElementCreation_migrate);
+  bufSize = p.size();
+
+  gpuBufSize = 0;
+#if CMK_CUDA || CMK_HIP
+  gpuBufSize = p.gpu_size();
+#endif
+
 #if CMK_ERROR_CHECKING
   if (bufSize > std::numeric_limits<int>::max())
   {
@@ -2986,6 +3083,8 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
   }
 #endif
 
+
+  void* gpuMsg = nullptr;
   // Allocate and pack into message
   CkArrayElementMigrateMessage* msg =
       new (bufSize, 0) CkArrayElementMigrateMessage(idx, id,
@@ -2995,10 +3094,30 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
                                                     false,
 #endif
                                                     bufSize, managers.size(),
-                                                    cache->getEpoch(id) + 1);
+                                                    cache->getEpoch(id) + 1,
+                                                    gpuBufSize > 0);
 
   {
-    PUP::toMem p(msg->packData, PUP::er::IS_MIGRATION);
+#if CMK_CUDA || CMK_HIP
+    if (gpuBufSize > 0) {
+      GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+      if(csv_gpu_manager.use_shm) {
+        DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
+#if CMK_SMP
+        CmiLock(dm->lock);
+#endif
+        gpuMsg = dm->alloc_comm_buffer(gpuBufSize, false);
+        if (gpuMsg == nullptr) {
+          CkAbort("PE %d, device %d: Not enough memory on device Load balance buffer (%zu free)",
+              CkMyPe(), dm->global_index, dm->get_lb_buffer_free_size());
+        }
+#if CMK_SMP
+        CmiUnlock(dm->lock);
+#endif
+      }
+    }
+#endif
+    PUP::toMem p(msg->packData, gpuMsg, PUP::er::IS_MIGRATION);
     p.becomeDeleting();
     pupElementsFor(p, rec, CkElementCreation_migrate);
     if (p.size() != bufSize)
@@ -3013,6 +3132,13 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
 
   DEBM((AA "Migrated index size %s to %d \n" AB, idx2str(idx), toPe));
 
+#if CMK_CUDA || CMK_HIP
+  // Ensure all device-to-device copies from PUP packing are complete before
+  // destroying elements, since cudaMemcpy(D2D) can be async in CUDA 12.x.
+  if (gpuBufSize > 0)
+    hapiDeviceSynchronize();
+#endif
+
   thisProxy[toPe].immigrate(msg);
 
   duringMigration = true;
@@ -3024,9 +3150,16 @@ void CkLocMgr::emigrate(CkLocRec* rec, int toPe)
 
   cache->recordEmigration(id, toPe);
   informHome(idx, toPe);
+#if CMK_CUDA || CMK_HIP
+  if (gpuBufSize > 0)
+  {
+    sendGPUBuffers[id] = GPUMigrateData(toPe, gpuBufSize, gpuMsg);
+    thisProxy[CkMyPe()].sendGPUMsg(id);
+  }
+#endif
 
 #if !CMK_LBDB_ON && CMK_GLOBAL_LOCATION_UPDATE
-  DEBM((AA "Global location update. idx %s "
+  CmiPrintf((AA "Global location update. idx %s "
            "assigned to %d \n" AB,
         idx2str(idx), toPe));
   thisProxy.updateLocation(id, toPe);
@@ -3047,14 +3180,65 @@ void CkLocMgr::metaLBCallLB(CkLocRec* rec)
 }
 #endif
 
+#if CMK_CUDA || CMK_HIP
+void CkLocMgr::immigrateGPU(CmiUInt8& id, int& size, char* &data, CkDeviceBufferPost* post)
+{
+  //CkPrintf("PE %d allocating GPU memory size %d for id %llu\n", CkMyPe(), size, id);
+  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+  if(csv_gpu_manager.use_shm) {
+    DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
+#if CMK_SMP
+    CmiLock(dm->lock);
+#endif
+    data = (char*)(dm->alloc_comm_buffer(size, false));
+    if (data == nullptr) {
+      CkAbort("PE %d, device %d: Not enough memory on device Load balance buffer (%zu free)",
+          CkMyPe(), dm->global_index, dm->get_lb_buffer_free_size());
+    }
+#if CMK_SMP
+    CmiUnlock(dm->lock);
+#endif
+  }
+  hapiDeviceSynchronize();
+  receivedDeviceMsgs[id] = data;
+  post[0].hapi_stream = (hapiStream_t) 0;
+}
+
+void CkLocMgr::immigrateGPU(CmiUInt8 id, int size, char* data)
+{
+  void* dataPtr = receivedDeviceMsgs[id];
+  receivedDeviceMsgs.erase(id);
+  bufferedDeviceMigrateMsgs[id] = dataPtr;
+  if (bufferedHostMigrateMsgs.find(id) != bufferedHostMigrateMsgs.end())
+  {
+    immigrate(bufferedHostMigrateMsgs[id]);
+    bufferedHostMigrateMsgs.erase(id);
+  }
+}
+#endif
+
 /**
   Migrating array element is arriving on this processor.
 */
 void CkLocMgr::immigrate(CkArrayElementMigrateMessage* msg)
 {
+  void* gpuMsg = nullptr;
+  if (msg->hasGPUMsg)
+  {
+    auto it = bufferedDeviceMigrateMsgs.find(msg->id);
+
+    if (it == bufferedDeviceMigrateMsgs.end())
+    {
+      bufferedHostMigrateMsgs[msg->id] = msg;
+      return;
+    }
+    
+    gpuMsg = it->second;
+  }
+
   const CkArrayIndex& idx = msg->idx;
 
-  PUP::fromMem p(msg->packData, PUP::er::IS_MIGRATION);
+  PUP::fromMem p(msg->packData, gpuMsg, PUP::er::IS_MIGRATION);
 
   if (msg->nManagers < managers.size())
     CkAbort("Array element arrived from location with fewer managers!\n");
@@ -3067,15 +3251,35 @@ void CkLocMgr::immigrate(CkArrayElementMigrateMessage* msg)
     return;
   }
 
+  if (msg->hasGPUMsg)
+    bufferedDeviceMigrateMsgs.erase(msg->id);
+
   insertID(idx, msg->id);
 
   // Create a record for this element
-  CkLocRec* rec =
-      createLocal(idx, true, msg->ignoreArrival, false /* home told on departure */, msg->epoch);
+  CkLocRec* rec = elementNrec(msg->id);
+  if (rec == nullptr)
+      rec = createLocal(idx, true, msg->ignoreArrival, false /* home told on departure */, msg->epoch);
 
   CmiAssert(CpvAccess(newZCPupGets).empty());  // Ensure that vector is empty
   // Create the new elements as we unpack the message
   pupElementsFor(p, rec, CkElementCreation_migrate);
+  // GPU-only epilogue: hapi.h is included only under CMK_CUDA || CMK_HIP, so hapi calls stay inside the guard.
+#if CMK_CUDA || CMK_HIP
+  hapiDeviceSynchronize();
+  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+  if (gpuMsg != nullptr && csv_gpu_manager.use_shm) {  // nothing to release when no GPU payload arrived
+    DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
+#if CMK_SMP
+    CmiLock(dm->lock);
+#endif
+    dm->free_comm_buffer((size_t)((char*)gpuMsg - (char*)dm->comm_buffer->base_ptr));
+#if CMK_SMP
+    CmiUnlock(dm->lock);
+#endif
+  }
+#endif
+
   bool zcRgetsActive = !CpvAccess(newZCPupGets).empty();
   if (zcRgetsActive)
   {
diff --git a/src/ck-core/cklocation.ci b/src/ck-core/cklocation.ci
index 30d1d158c6..5e51ee93bc 100644
--- a/src/ck-core/cklocation.ci
+++ b/src/ck-core/cklocation.ci
@@ -1,3 +1,5 @@
+#include "conv-mach-opt.h"
+
 module CkLocation {
   extern module CkMarshall;
 
@@ -14,6 +16,10 @@ module CkLocation {
   group [migratable] CkLocMgr {
     entry CkLocMgr(CkArrayOptions opts);
     entry [expedited] void immigrate(CkArrayElementMigrateMessage *msg);
+#if CMK_CUDA || CMK_HIP
+    entry [expedited, threaded] void sendGPUMsg(CmiUInt8 id);
+    entry [expedited] void immigrateGPU(CmiUInt8 id, int size, nocopydevice char data[size]);
+#endif
     entry [expedited] void requestLocation(const CkArrayIndex& idx, int peToTell);
     entry [expedited] void updateLocation(const CkArrayIndex& idx, const CkLocEntry& e);
     entry void reclaimRemote(const CkArrayIndex& idx, int deletedOnPe);
diff --git a/src/ck-core/cklocation.h b/src/ck-core/cklocation.h
index 380db7570a..b636f23d91 100644
--- a/src/ck-core/cklocation.h
+++ b/src/ck-core/cklocation.h
@@ -92,12 +92,13 @@ class CkArrayElementMigrateMessage : public CMessage_CkArrayElementMigrateMessag
 {
 public:
   CkArrayElementMigrateMessage(CkArrayIndex idx_, CmiUInt8 id_, bool ignoreArrival_,
-                               int length_, int nManagers_, int epoch_)
+                               int length_, int nManagers_, int epoch_, bool hasGPUMsg_ = false)
       : idx(idx_),
         id(id_),
         ignoreArrival(ignoreArrival_),
+        hasGPUMsg(hasGPUMsg_),
         length(length_),
         nManagers(nManagers_),
         epoch(epoch_)
   {
   }
@@ -105,6 +106,7 @@ class CkArrayElementMigrateMessage : public CMessage_CkArrayElementMigrateMessag
   CkArrayIndex idx;    // Array index that is migrating
   CmiUInt8 id;         // ID of the elements with this index in this collection
   bool ignoreArrival;  // if to inform LB of arrival
+  bool hasGPUMsg;      // if a device (GPU) migrate message accompanies this one
   int length;          // Size in bytes of the packed data
   int nManagers;       // Number of associated array managers
   int epoch;
@@ -220,6 +222,17 @@ CkpvExtern(int, CkSaveRestorePrefetch);
 
 #include "ckmigratable.h"
 
+class GPUMigrateData
+{
+public:
+  int toPe;
+  int size;
+  void* data;
+
+  GPUMigrateData() : toPe(-1), size(0), data(nullptr) {}
+  GPUMigrateData(int toPe_, int size_, void* data_) : toPe(toPe_), size(size_), data(data_) {}
+};
+
 /********************** CkLocMgr ********************/
 /// A tiny class for detecting heap corruption
 class CkMagicNumber_impl
@@ -418,6 +431,13 @@ class CkLocMgr : public IrrGroup
   // Immigration messages which are waiting for all array managers to be ready
   std::list<CkArrayElementMigrateMessage*> pendingImmigrate;
 
+  std::unordered_map<CmiUInt8, GPUMigrateData> sendGPUBuffers;
+  std::unordered_map<CmiUInt8, CkArrayElementMigrateMessage*> bufferedHostMigrateMsgs;
+  std::unordered_map<CmiUInt8, void*> bufferedDeviceMigrateMsgs;
+  std::unordered_map<CmiUInt8, void*> sentDeviceMsgs;
+
+  std::unordered_map<CmiUInt8, void*> receivedDeviceMsgs;
+
   // The mapping of index to ID is either done via compression or an explicit map,
   // depending on if the bounds of this array are compressible into a 64bit ID.
   CkArrayIndex bounds;
@@ -524,7 +544,7 @@ class CkLocMgr : public IrrGroup
 
   CmiUInt8 lookupID(const CkArrayIndex& idx) const
   {
-    CkAssert(checkInBounds(idx));
+    //CkAssert(checkInBounds(idx));
     if (compressor)
     {
       const CmiUInt8 home = homePe(idx);
@@ -550,7 +570,7 @@ class CkLocMgr : public IrrGroup
   // TODO: This should be better
   bool lookupID(const CkArrayIndex& idx, CmiUInt8& id) const
   {
-    CkAssert(checkInBounds(idx));
+    //CkAssert(checkInBounds(idx));
     if (compressor)
     {
       const CmiUInt8 home = homePe(idx);
@@ -691,6 +711,11 @@ class CkLocMgr : public IrrGroup
 
   // Communication:
   void immigrate(CkArrayElementMigrateMessage* msg);
+#if CMK_CUDA || CMK_HIP
+  void sendGPUMsg(CmiUInt8 id);
+  void immigrateGPU(CmiUInt8& id, int& size, char* &data, CkDeviceBufferPost* post);
+  void immigrateGPU(CmiUInt8 id, int size, char* data);
+#endif
   void requestLocation(CmiUInt8 id);
   void requestLocation(const CkArrayIndex& idx);
   bool requestLocation(const CkArrayIndex& idx, int peToTell);
diff --git a/src/ck-core/cklocrec.h b/src/ck-core/cklocrec.h
index 8528aafcd2..df3271d9fe 100644
--- a/src/ck-core/cklocrec.h
+++ b/src/ck-core/cklocrec.h
@@ -49,6 +49,10 @@ class CkLocRec {
   void stopTiming(int ignore_running=0);
   void setObjTime(double cputime);
   double getObjTime();
+
+  void setObjGPUTime(double gputime);
+  double getObjGPUTime();
+
   void *getObjUserData(int idx);
 #else
   inline void startTiming(int ignore_running=0) {  }
@@ -70,6 +74,7 @@ class CkLocRec {
   void recvMigrate(int dest);
   void setMigratable(int migratable);	/// set migratable
   void setPupSize(size_t obj_pup_size);
+  void setGPUPupSize(size_t obj_gpu_pup_size);
   void AsyncMigrate(bool use);
   bool isAsyncMigrate()   { return asyncMigrate; }
   void ReadyMigrate(bool ready) { readyMigrate = ready; } ///called from user
diff --git a/src/ck-core/ckmemcheckpoint.C b/src/ck-core/ckmemcheckpoint.C
index b691fca2e3..1232b47946 100644
--- a/src/ck-core/ckmemcheckpoint.C
+++ b/src/ck-core/ckmemcheckpoint.C
@@ -678,7 +678,8 @@ static inline void _handleProcData(PUP::er &p)
 #endif
 
     // save groups into Groups.dat
-    CkPupGroupData(p);
+    //std::vector<GroupInfo> groupMetadata = CkPupGroupMetadata(p);
+    //CkPupGroupData(p, groupMetadata.size(), groupMetadata);
 
     // save nodegroups into NodeGroups.dat
     if(CkMyRank()==0) CkPupNodeGroupData(p);
diff --git a/src/ck-core/ckmigratable.h b/src/ck-core/ckmigratable.h
index d0e9a96eeb..7cec2884ad 100644
--- a/src/ck-core/ckmigratable.h
+++ b/src/ck-core/ckmigratable.h
@@ -80,6 +80,8 @@ class CkMigratable : public Chare {
   virtual void UserSetLBLoad(void);  /// user define this when setLBLoad is true
   void setObjTime(double cputime);
   double getObjTime();
+  void setObjGPUTime(double cputime);
+  double getObjGPUTime();
 #if CMK_LB_USER_DATA
   void *getObjUserData(int idx);
 #endif
@@ -96,6 +98,7 @@ class CkMigratable : public Chare {
   void ckFinishConstruction(int epoch = -1);
   void setMigratable(int migratable);
   void setPupSize(size_t obj_pup_size);
+  void setGPUPupSize(size_t obj_gpu_pup_size);
 #else
   void AtSync(int waitForMigration=1) { ResumeFromSync();}
   void setMigratable(int migratable)  { }
diff --git a/src/ck-core/ckrdmadevice.C b/src/ck-core/ckrdmadevice.C
index 806ef289e0..4b9ea79967 100644
--- a/src/ck-core/ckrdmadevice.C
+++ b/src/ck-core/ckrdmadevice.C
@@ -27,7 +27,7 @@
  *    ordering between these data transfers. Because multiple PEs can be mapped
  *    to the same GPU and hence concurrently request allocations from the same
  *    device communication buffer, a thread-safe allocator using the buddy
- *    allocation algorithm was implemented. The allocator first calls cudaMalloc
+ *    allocation algorithm was implemented. The allocator first calls hapiMalloc
  *    to obtain a relatively large chunk of memory and then services allocation
  *    and deallocation requests from PEs that are mapped to its GPU device.
  *    The buddy algorithm was used to minimize the external fragmentation that
@@ -50,25 +50,123 @@
 #include "ck.h"
 #include "ckrdmadevice.h"
 
-#if CMK_CUDA
+#define CMK_GPU_COMM 1
+
+#if CMK_CUDA || CMK_HIP
+
+CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) {
+  CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes()));
+  CmiEnforce((dstPe >= 0) && (dstPe <= CmiNumPes()));
+
+  if (CmiNodeOf(srcPe) == CmiNodeOf(dstPe)) {
+    // Same logical node
+    return CmiNcpyModeDevice::MEMCPY;
+  } else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) {
+    // Different logical nodes, same physical node
+    return CmiNcpyModeDevice::IPC;
+  } else {
+    // Different physical nodes, requires GPUDirect RDMA
+    return CmiNcpyModeDevice::RDMA;
+  }
+}
 
 #include "hapi.h"
 #include "gpumanager.h"
 
 CsvExtern(GPUManager, gpu_manager);
+CpvExtern(int, my_device_id);
+
+// void CkRdmaDeviceRecvHandler(void* data)
+// {
+//   DeviceRdmaOp* op = (DeviceRdmaOp*)data;
+//   DeviceRdmaInfo* info = op->info;
+
+//   // Invoke source callbacks
+//   if (op->src_cb) {
+//     int rank;
+//     CkCallback* cb = (CkCallback*)op->src_cb;
+//     cb->send();
+//     delete cb;
+//   }
+
+//   // Update counter (there may be multiple buffers in transit)
+//   info->counter++;
+
+//   // Check if all buffers have been received
+//   // If so, invoke regular entry method
+//   if (info->counter == info->n_ops) {
+//     QdCreate(1);
+
+//     enqueueNcpyMessage(op->dest_pe, info->msg);
+
+//     // Free RDMA metadata
+//     CmiFree(info);
+//   }
+// }
+
+struct LoopBackMsg {
+  char header[CmiMsgHeaderSizeBytes];
+  void* msg;
+};
+
+extern "C" {
+  void* loopback_bridge(void* arg) {  // Converse handler: re-runs the RDMA ack on the destination PE
+    QdProcess(1);  // balances the QdCreate(1) issued when this message was forwarded
+    LoopBackMsg* recv_msg = (LoopBackMsg*)arg;
+    CkRdmaDeviceRecvHandler(recv_msg->msg);
+    CmiFree(recv_msg->msg);  // copy made by CkRdmaDeviceRecvHandler before forwarding
+    CmiFree(recv_msg);
+    return NULL;
+  }
+  int loopback_handler;
+}
 
-// Invoked when a GPU buffer arrives on the receiver
-#if !CMK_GPU_COMM
-void CkRdmaDeviceRecvHandler(void* data, void* msg)
-#else
 void CkRdmaDeviceRecvHandler(void* data)
-#endif
 {
-#if CMK_GPU_COMM
-  // Process QD to mark completion of buffer transfer
+  NcpyOperationInfo *ncpy_op_info = (NcpyOperationInfo *)data;
+  DeviceRdmaOp* op = (DeviceRdmaOp*)(ncpy_op_info->deviceRdmaOpInfo);
+
+  if(op->dest_pe != CmiMyPe()) {
+        int infoSize = ncpy_op_info->ncpyOpInfoSize;
+        NcpyOperationInfo* copy = (NcpyOperationInfo*)CmiAlloc(infoSize);
+        memcpy(copy, ncpy_op_info, infoSize);
+
+        LoopBackMsg* conv_msg = (LoopBackMsg*)CmiAlloc(sizeof(LoopBackMsg));
+        conv_msg->msg = copy;
+
+        QdCreate(1);
+        CmiSetHandler(conv_msg, loopback_handler);
+        CmiPushPE(CmiRankOf(op->dest_pe), conv_msg);
+        return;
+  }
+
   QdProcess(1);
-#endif
+  DeviceRdmaInfo* info = op->info;
+
+  // Invoke source callbacks
+  if (op->src_cb) {
+    CkCallback* cb = (CkCallback*)op->src_cb;
+    cb->send();
+    delete cb;
+  }
 
+  // Update counter (there may be multiple buffers in transit)
+  info->counter++;
+
+  // Check if all buffers have been received
+  // If so, invoke regular entry method
+  if (info->counter == info->n_ops) {
+    QdCreate(1);
+
+    enqueueNcpyMessage(op->dest_pe, info->msg);
+
+    // Free RDMA metadata
+    // CmiFree(info);
+  }
+}
+// Invoked when a GPU buffer arrives on the receiver
+void CkRdmaDeviceRecvHandler(void* data, void* msg)
+{
   DeviceRdmaOp* op = (DeviceRdmaOp*)data;
   DeviceRdmaInfo* info = op->info;
 
@@ -105,12 +203,12 @@ void CkDevicePersistent::init() {
 
 void CkDevicePersistent::open() {
   // Create a CUDA IPC handle for inter-process communication
-  hapiCheck(cudaIpcGetMemHandle(&cuda_ipc_handle, (void*)ptr));
+  hapiCheck(hapiIpcGetMemHandle(&hapi_ipc_handle, (void*)ptr));
 }
 
 void CkDevicePersistent::close() {
   // Close the CUDA IPC handle if it was opened
-  hapiCheck(cudaIpcCloseMemHandle(ipc_ptr));
+  hapiCheck(hapiIpcCloseMemHandle(ipc_ptr));
 }
 
 void CkDevicePersistent::set_msg(void* msg) {
@@ -122,7 +220,7 @@ void CkDevicePersistent::pup(PUP::er& p) {
   p|cnt;
   p|pe;
   p|cb;
-  p((char*)&cuda_ipc_handle, sizeof(cuda_ipc_handle));
+  p((char*)&hapi_ipc_handle, sizeof(hapi_ipc_handle));
 }
 
 CkDeviceStatus CkDevicePersistent::get(CkDevicePersistent& src) {
@@ -135,24 +233,24 @@ CkDeviceStatus CkDevicePersistent::get(CkDevicePersistent& src) {
 
   // Perform get
   if (mode == CkNcpyModeDevice::MEMCPY) {
-    cudaMemcpyAsync((void*)ptr, src.ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream);
+    hapiMemcpyAsync((void*)ptr, src.ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream);
   } else if (mode == CkNcpyModeDevice::IPC) {
     if (!src.ipc_open) {
-      hapiCheck(cudaIpcOpenMemHandle(&src.ipc_ptr, src.cuda_ipc_handle,
-            cudaIpcMemLazyEnablePeerAccess));
+      hapiCheck(hapiIpcOpenMemHandle(&src.ipc_ptr, src.hapi_ipc_handle,
+            hapiIpcMemLazyEnablePeerAccess));
       src.ipc_open = true;
     }
-    cudaMemcpyAsync((void*)ptr, src.ipc_ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream);
+    hapiMemcpyAsync((void*)ptr, src.ipc_ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream);
   } else {
     CkAbort("Persistant GPU messaging is currently not supported for inter-node messages");
   }
 
   // Set callbacks to be invoked once get is complete
   if (src.cb.type != CkCallback::ignore) {
-    hapiAddCallback(cuda_stream, src.cb, src.cb_msg);
+    hapiAddCallback(hapi_stream, src.cb, src.cb_msg);
   }
   if (cb.type != CkCallback::ignore) {
-    hapiAddCallback(cuda_stream, cb, cb_msg);
+    hapiAddCallback(hapi_stream, cb, cb_msg);
   }
 
   return CkDeviceStatus::incomplete;
@@ -168,24 +266,24 @@ CkDeviceStatus CkDevicePersistent::put(CkDevicePersistent& dst) {
 
   // Perform put
   if (mode == CkNcpyModeDevice::MEMCPY) {
-    cudaMemcpyAsync((void*)dst.ptr, ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream);
+    hapiMemcpyAsync((void*)dst.ptr, ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream);
   } else if (mode == CkNcpyModeDevice::IPC) {
     if (!dst.ipc_open) {
-      hapiCheck(cudaIpcOpenMemHandle(&dst.ipc_ptr, dst.cuda_ipc_handle,
-            cudaIpcMemLazyEnablePeerAccess));
+      hapiCheck(hapiIpcOpenMemHandle(&dst.ipc_ptr, dst.hapi_ipc_handle,
+            hapiIpcMemLazyEnablePeerAccess));
       dst.ipc_open = true;
     }
-    cudaMemcpyAsync(dst.ipc_ptr, ptr, cnt, cudaMemcpyDeviceToDevice, cuda_stream);
+    hapiMemcpyAsync(dst.ipc_ptr, ptr, cnt, hapiMemcpyDeviceToDevice, hapi_stream);
   } else {
     CkAbort("Persistant GPU messaging is not yet supported for inter-node messages");
   }
 
   // Set callbacks to be invoked once get is complete
   if (cb.type != CkCallback::ignore) {
-    hapiAddCallback(cuda_stream, cb, cb_msg);
+    hapiAddCallback(hapi_stream, cb, cb_msg);
   }
   if (dst.cb.type != CkCallback::ignore) {
-    hapiAddCallback(cuda_stream, dst.cb, dst.cb_msg);
+    hapiAddCallback(hapi_stream, dst.cb, dst.cb_msg);
   }
 
   return CkDeviceStatus::incomplete;
@@ -193,6 +291,17 @@ CkDeviceStatus CkDevicePersistent::put(CkDevicePersistent& dst) {
 
 /****************************** Recv Entry Method API ******************************/
 
+// Returns the local rank of the logical node (process) that the given PE belongs to
+static inline int CmiNodeRankLocal(int pe) {
+  // Logical node index % Number of logical nodes per physical node
+  return CmiNodeOf(pe) % (CmiNumNodes() / CmiNumPhysicalNodes());
+}
+
+// Returns the local rank of the logical node that I belong to
+static inline int CmiMyNodeRankLocal() {
+  return CmiNodeRankLocal(CmiMyPe());
+}
+
 // Invoked after post entry method
 void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs) {
   // Change message header to invoke regular entry method
@@ -211,13 +320,12 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS
 
   CkDeviceBuffer source;
 
-#if !CMK_GPU_COMM
   // Machine layer does not support GPU-aware communication
   GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
 
   // Find which mode of transfer should be used
+  // CmiPrintf("[%d] CkRdmaDeviceOnSender: src_pe=%d, dst_pe=%d\n", CkMyPe(), env->getSrcPe(), CkMyPe());
   CkNcpyModeDevice mode = findTransferModeDevice(env->getSrcPe(), CkMyPe());
-#endif
 
   // Allocate and fill in metadata for this zerocopy operation
   void* rdma_data = CmiAlloc(sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * numops);
@@ -238,18 +346,20 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS
     // Store information about this buffer
     DeviceRdmaOp& save_op = *(DeviceRdmaOp*)((char*)rdma_data
         + sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * i);
-    save_op.dest_pe = CkMyPe();
+    save_op.dest_pe  = source.dest_pe;
     save_op.dest_ptr = arrPtrs[i];
     save_op.size = (size_t)arrSizes[i];
     save_op.info = rdma_info;
     save_op.src_cb = (source.cb.type != CkCallback::ignore) ? new CkCallback(source.cb) : nullptr;
     save_op.dst_cb = nullptr;
 
-#if !CMK_GPU_COMM
     // Machine layer does not support GPU-aware communication
     // Check if destination PE is correct
     // TODO: Handle this case instead of aborting
+    // Chare* obj = CkActiveObj();
+    // CmiUInt8 id = obj->id; 
     if (source.dest_pe != CkMyPe()) {
+      CmiPrintf("Current PE %d does not match the destination PE %d and sender determined to be %d\n", CkMyPe(), source.dest_pe, env->getSrcPe());
       CkAbort("Current PE does not match the destination PE determined by the sender. "
           "Please enable CMK_GLOBAL_LOCATION_UPDATE.");
     }
@@ -261,64 +371,58 @@ void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrS
     if (mode == CkNcpyModeDevice::MEMCPY) {
       // Source and destination PEs are in the same process (logical node)
       // Directly invoke memcpy from source buffer to destination buffer
-      hapiCheck(cudaMemcpyAsync((void*)dest.ptr, source.ptr, dest.cnt,
-            cudaMemcpyDeviceToDevice, postStructs[i].cuda_stream));
+      hapiCheck(hapiMemcpyAsync((void*)dest.ptr, source.ptr, dest.cnt,
+            hapiMemcpyDeviceToDevice, postStructs[i].hapi_stream));      
     } else if (mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) {
       // Inter-process using shared memory optimizations
       // Use optimiziations with POSIX shared memory
-      cuda_ipc_device_info& device_info =
-        csv_gpu_manager.cuda_ipc_device_infos[source.device_idx];
+      hapi_ipc_device_info& device_info =
+        csv_gpu_manager.hapi_ipc_device_infos[source.device_idx];
 
-      // 1. Make user-provided stream wait for IPC event using cudaStreamWaitEvent
+      // 1. Make user-provided stream wait for IPC event using hapiStreamWaitEvent
       //    (source buffer to device comm buffer on source)
-      hapiCheck(cudaStreamWaitEvent(postStructs[i].cuda_stream,
+      hapiCheck(hapiStreamWaitEvent(postStructs[i].hapi_stream,
             device_info.src_event_pool[source.event_idx], 0));
 
-      // 2. Invoke cudaMemcpyAsync (from source device comm buffer to destination buffer)
-      hapiCheck(cudaMemcpyAsync((void*)dest.ptr,
+      // 2. Invoke hapiMemcpyAsync (from source device comm buffer to destination buffer)
+      hapiCheck(hapiMemcpyAsync((void*)dest.ptr,
             (void*)((char*)device_info.buffer + source.comm_offset),
-            dest.cnt, cudaMemcpyDeviceToDevice, postStructs[i].cuda_stream));
+            dest.cnt, hapiMemcpyDeviceToDevice, postStructs[i].hapi_stream));
 
       // 3. Record IPC event so that the sender can query it for freeing
       //    device comm buffer and corresponding pair of CUDA IPC events
-      hapiCheck(cudaEventRecord(device_info.dst_event_pool[source.event_idx],
-            postStructs[i].cuda_stream));
+      hapiCheck(hapiEventRecord(device_info.dst_event_pool[source.event_idx],
+            postStructs[i].hapi_stream));
 
       // 4. Set flag in shared memory so that the sender can start querying
       //    completion of the IPC event
-      cuda_ipc_event_shared* shm_event_shared =
-        (cuda_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr
+      hapi_ipc_event_shared* shm_event_shared =
+        (hapi_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr
             + csv_gpu_manager.shm_chunk_size * source.device_idx
-            + sizeof(cudaIpcMemHandle_t)) + source.event_idx;
-      pthread_mutex_lock(&shm_event_shared->lock);
-      shm_event_shared->dst_flag = true;
-      pthread_mutex_unlock(&shm_event_shared->lock);
+            + sizeof(hapiIpcMemHandle_t)) + source.event_idx;
+      __atomic_store_n(&shm_event_shared->dst_flag, 1, __ATOMIC_RELEASE);
     } else {
+      // CmiPrintf("it should never be called during intra node\n");
+#if CMK_GPU_COMM
+      // Machine layer supports GPU-aware communication
+      QdCreate(1);
+      CmiSetDirectNcpyAckHandler(CkRdmaDeviceRecvHandler);
+      CmiNcpyBuffer lci_dest_ncpy_buffer(arrPtrs[i], (size_t)arrSizes[i], (void*)(&save_op));
+      lci_dest_ncpy_buffer.rdmaGet(source.lci_ncpy_buffer, 0, nullptr, nullptr);
+      continue;
+#else
       // Handle all other cases (basic inter-process and inter-node)
       // Transfer the received/unpacked data on host to the destination device buffer
       // FIXME: Print warning that this is slow?
       CkAssert(source.data_stored);
-      hapiCheck(cudaMemcpyAsync((void*)dest.ptr, source.data, dest.cnt,
-            cudaMemcpyHostToDevice, postStructs[i].cuda_stream));
+      hapiCheck(hapiMemcpyAsync((void*)dest.ptr, source.data, dest.cnt,
+            hapiMemcpyHostToDevice, postStructs[i].hapi_stream));
+#endif
     }
 
     // Add source callback for polling, so that it can be invoked once the transfer is complete
-    hapiAddCallback(postStructs[i].cuda_stream, CkCallback(CkRdmaDeviceRecvHandler, &save_op));
-#else
-    // Machine layer supports GPU-aware communication
-    save_op.tag = source.tag;
-#endif // CMK_GPU_COMM
+    hapiAddCallback(postStructs[i].hapi_stream, CkCallback(CkRdmaDeviceRecvHandler, &save_op));
   }
-
-#if CMK_GPU_COMM
-  // Post ucp_tag_recv_nb's to receive GPU data
-  for (int i = 0; i < numops; i++) {
-    DeviceRdmaOp* save_op = (DeviceRdmaOp*)((char*)rdma_data
-        + sizeof(DeviceRdmaInfo) + sizeof(DeviceRdmaOp) * i);
-    QdCreate(1);
-    CmiRecvDevice(save_op, DEVICE_RECV_TYPE_CHARM);
-  }
-#endif
 }
 
 // Unused, left for future reference
@@ -340,48 +444,43 @@ int CkRdmaGetDestPEChare(int dest_pe, void* obj_ptr) {
 }
 */
 
-static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset) {
-  int pool_size = CsvAccess(gpu_manager).cuda_ipc_event_pool_size_pe;
+static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset, int cpv_my_device_id) {
+  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+  int pool_size = csv_gpu_manager.hapi_ipc_event_pool_size_pe;
   int pool_start = CkMyRank() * pool_size;
-  int device_index = dm->global_index;
-  cuda_ipc_device_info& my_device_info = CsvAccess(gpu_manager).cuda_ipc_device_infos[device_index];
+  hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id];
 
   // Free IPC events that are complete
   // TODO: Don't do this every time but only when the event pool is somewhat empty
   for (int i = pool_start; i < pool_start + pool_size; i++) {
     int& event_flag = my_device_info.event_pool_flags[i];
-    cudaEvent_t& ev = my_device_info.dst_event_pool[i];
+    hapiEvent_t& ev = my_device_info.dst_event_pool[i];
     size_t& buff_offset = my_device_info.event_pool_buff_offsets[i];
     // For a used event, check if it's complete and mark as free if so
     if (event_flag != 0) {
       // Check in shared memory if receiver has invoked the memcpy from
       // the device comm buffer on sender to destination buffer
-      cuda_ipc_event_shared* shm_event_shared =
-        (cuda_ipc_event_shared*)((char*)CsvAccess(gpu_manager).shm_ptr
-            + CsvAccess(gpu_manager).shm_chunk_size * device_index
-            + sizeof(cudaIpcMemHandle_t)) + i;
-      bool can_query = false;
-      pthread_mutex_lock(&shm_event_shared->lock);
-      if (shm_event_shared->dst_flag == true) {
-        shm_event_shared->dst_flag = false;
-        can_query = true;
-      }
-      pthread_mutex_unlock(&shm_event_shared->lock);
+      hapi_ipc_event_shared* shm_event_shared =
+        (hapi_ipc_event_shared*)((char*)csv_gpu_manager.shm_ptr
+            + csv_gpu_manager.shm_chunk_size * (csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id)
+            + sizeof(hapiIpcMemHandle_t)) + i;
+      bool can_query = __atomic_load_n(&shm_event_shared->dst_flag, __ATOMIC_ACQUIRE);
 
       // If the receiver has invoked the memcpy,
       // the sender can query the event for completion
       if (can_query) {
-        if (cudaEventQuery(ev) == cudaSuccess) {
+        if (hapiEventQuery(ev) == hapiSuccess) {
           // Event completion means that the transfer from source device comm buffer
           // to dest buffer is complete, so free the allocated block
           if (event_flag == 1) {
             dm->free_comm_buffer(buff_offset);
           } else {
-            CkAbort("Retrieved cudaSuccess for a free IPC event");
+            CkAbort("Retrieved hapiSuccess for a free IPC event");
           }
 
           // Mark event as free
           event_flag = 0;
+          __atomic_store_n(&shm_event_shared->dst_flag, 0, __ATOMIC_RELEASE);
         }
       }
     }
@@ -389,11 +488,11 @@ static int findFreeIpcEvent(DeviceManager* dm, const size_t comm_offset) {
 
   // Allocate CUDA IPC events from the pool
   // Two events are used per message:
-  // 1) Recorded by the sender after 'source buffer -> device comm buffer' cudaMemcpy.
+  // 1) Recorded by the sender after 'source buffer -> device comm buffer' hapiMemcpy.
   //    Can be used by the sender to determine if the sender buffer is free for reuse.
-  //    It is also used by the receiver to create a dependency for the second cudaMemcpy
+  //    It is also used by the receiver to create a dependency for the second hapiMemcpy
   //    ('device comm buffer -> dest buffer')
-  // 2) Recorded by the receiver after 'device comm buffer -> dest buffer' cudaMemcpy.
+  // 2) Recorded by the receiver after 'device comm buffer -> dest buffer' hapiMemcpy.
   //    It is used by the sender to determine when the allocated block on
   //    device comm buffer and IPC events can be freed.
   for (int i = pool_start; i < pool_start + pool_size; i++) {
@@ -414,38 +513,51 @@ void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers) {
   // TODO: Need to handle the case where the destination PE could be wrong
   //       (due to migration, etc.). Currently the code relies on a global
   //       location update after migration (with CMK_GLOBAL_LOCATION_UPDATE).
-#if !CMK_GPU_COMM
-  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
-
-  // Determine transfer mode (intra-process, inter-process, inter-node)
+  // CmiPrintf("[%d] CkRdmaDeviceOnSender: src_pe=%d, dst_pe=%d\n", CkMyPe(), CkMyPe(), dest_pe);
   CkNcpyModeDevice transfer_mode = findTransferModeDevice(CkMyPe(), dest_pe);
 
   // Store destination PE in the metadata message
   // FIXME: Not necessary? save_op.dest_pe is set to CkMyPe() on the receiver
   for (int i = 0; i < numops; i++) {
     buffers[i]->dest_pe = dest_pe;
+    buffers[i]->dest_mpi_rank = CmiNodeOf(dest_pe);
+    buffers[i]->src_pe = CmiMyPe();
+    buffers[i]->src_mpi_rank = CmiNodeOf(CmiMyPe());
   }
-
-  if (transfer_mode == CkNcpyModeDevice::MEMCPY) {
-    // Don't need to do anything for intra-process
+  if(transfer_mode == CkNcpyModeDevice::MEMCPY)
+  { // Intra-process: only drain each source stream before returning
+    for (int i = 0; i < numops; i++)
+      hapiCheck(hapiStreamSynchronize(buffers[i]->hapi_stream));
     return;
-  } else if (transfer_mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) {
+  }
+
+  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+  //int cpv_my_device_id = CmiMyRank() % csv_gpu_manager.device_count;
+  int cpv_my_device_id = CpvAccess(my_device_id);
+
+  if(transfer_mode == CkNcpyModeDevice::IPC && csv_gpu_manager.use_shm) {
     // Use optimizations with POSIX shaerd memory
     // Allocate blocks on device comm buffer
     DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
 
     for (int i = 0; i < numops; i++) {
+      bool is_lb_buffer = ( (size_t)((char*)(buffers[i]->ptr) - (char*)(dm->comm_buffer->base_ptr)) < dm->comm_buffer->total_size );
 #if CMK_SMP
       CmiLock(dm->lock);
 #endif
-      void* alloc_comm_buffer = dm->alloc_comm_buffer(buffers[i]->cnt);
-      if (alloc_comm_buffer == nullptr) {
-        CkAbort("PE %d, device %d: Not enough memory on device communication buffer (%zu free)",
-            CkMyPe(), dm->global_index, dm->get_comm_buffer_free_size());
+      void* alloc_comm_buffer;
+      if(is_lb_buffer) {
+        alloc_comm_buffer = const_cast<void*>(buffers[i]->ptr);
+      } else {
+        alloc_comm_buffer = dm->alloc_comm_buffer(buffers[i]->cnt);
+        if (alloc_comm_buffer == nullptr) {
+          CkAbort("PE %d, device %d: Not enough memory on device communication buffer (%zu free)",
+              CkMyPe(), dm->global_index, dm->get_comm_buffer_free_size());
+        }
       }
       buffers[i]->comm_offset = (char*)alloc_comm_buffer - (char*)dm->comm_buffer->base_ptr;
-      buffers[i]->device_idx = dm->global_index;
-      buffers[i]->event_idx = findFreeIpcEvent(dm, buffers[i]->comm_offset);
+      buffers[i]->device_idx = (csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id);
+      buffers[i]->event_idx = findFreeIpcEvent(dm, buffers[i]->comm_offset, cpv_my_device_id);
       // Abort if no free IPC event was found
       // FIXME: Instead of aborting, we can maybe create IPC events on demand
       // (although they probably cannot be shared through the shared memory
@@ -458,34 +570,36 @@ void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers) {
 #endif
 
       // Initiate transfer from source buffer to device comm buffer
-      hapiCheck(cudaMemcpyAsync(alloc_comm_buffer, buffers[i]->ptr, buffers[i]->cnt,
-            cudaMemcpyDeviceToDevice, buffers[i]->cuda_stream));
+      if(!is_lb_buffer) {
+        hapiCheck(hapiMemcpyAsync(alloc_comm_buffer, buffers[i]->ptr, buffers[i]->cnt,
+              hapiMemcpyDeviceToDevice, buffers[i]->hapi_stream));
+      }
 
       // Record event
-      cuda_ipc_device_info& my_device_info = csv_gpu_manager.cuda_ipc_device_infos[dm->global_index];
-      hapiCheck(cudaEventRecord(my_device_info.src_event_pool[buffers[i]->event_idx], buffers[i]->cuda_stream));
+      hapi_ipc_device_info& my_device_info = csv_gpu_manager.hapi_ipc_device_infos[(csv_gpu_manager.device_count * CmiMyNodeRankLocal() + cpv_my_device_id)];
+      hapiCheck(hapiEventRecord(my_device_info.src_event_pool[buffers[i]->event_idx], buffers[i]->hapi_stream));
     }
   } else {
+#if !CMK_GPU_COMM
     // Use a naive host-staged mechanism
     // Allocate temporary host buffers and copy source buffers
     for (int i = 0; i < numops; i++) {
       buffers[i]->data_stored = true;
-      hapiCheck(cudaMallocHost(&buffers[i]->data, buffers[i]->cnt));
-      hapiCheck(cudaMemcpyAsync(buffers[i]->data, buffers[i]->ptr, buffers[i]->cnt,
-            cudaMemcpyDeviceToHost, buffers[i]->cuda_stream));
+      hapiCheck(hapiMallocHost(&buffers[i]->data, buffers[i]->cnt));
+      hapiCheck(hapiMemcpyAsync(buffers[i]->data, buffers[i]->ptr, buffers[i]->cnt,
+            hapiMemcpyDeviceToHost, buffers[i]->hapi_stream));
     }
 
     // Wait for the copies to finish
     for (int i = 0; i < numops; i++) {
-      hapiCheck(cudaStreamSynchronize(buffers[i]->cuda_stream));
+      hapiCheck(hapiStreamSynchronize(buffers[i]->hapi_stream));
     }
-  }
 #else
-  // Post ucp_tag_send_nb's to send GPU data. When receiver receives the metadata,
-  // it should post ucp_tag_recv_nb's to receive the GPU data.
   for (int i = 0; i < numops; i++) {
-    CmiSendDevice(dest_pe, buffers[i]->ptr, buffers[i]->cnt, buffers[i]->tag);
+    hapiCheck(hapiStreamSynchronize(buffers[i]->hapi_stream));  // source data must be complete before the buffer is exposed
+    buffers[i]->lci_ncpy_buffer = CmiNcpyBuffer(buffers[i]->ptr, buffers[i]->cnt);
+  }
+#endif
   }
-#endif // CMK_GPU_COMM
 }
 #endif // CMK_CUDA
diff --git a/src/ck-core/ckrdmadevice.h b/src/ck-core/ckrdmadevice.h
index c9f97c4a72..b2beea5b35 100644
--- a/src/ck-core/ckrdmadevice.h
+++ b/src/ck-core/ckrdmadevice.h
@@ -4,8 +4,8 @@
 #include "ckcallback.h"
 #include "conv-rdmadevice.h"
 
-#if CMK_CUDA
-#include <cuda_runtime.h>
+#if CMK_CUDA || CMK_HIP
+#include "hapi_portable.h"
 
 #define CkNcpyModeDevice CmiNcpyModeDevice
 #define CkDeviceStatus CmiDeviceStatus
@@ -15,9 +15,9 @@ struct CkDevicePersistent {
   size_t cnt;
   CkCallback cb;
   void* cb_msg;
-  cudaStream_t cuda_stream;
+  hapiStream_t hapi_stream;
   int pe;
-  cudaIpcMemHandle_t cuda_ipc_handle;
+  hapiIpcMemHandle_t hapi_ipc_handle;
   void* ipc_ptr;
   bool ipc_open; // Used only by the remote chare
 
@@ -34,15 +34,15 @@ struct CkDevicePersistent {
     init();
   }
 
-  explicit CkDevicePersistent(const void* ptr_, size_t cnt_, cudaStream_t cuda_stream_)
+  explicit CkDevicePersistent(const void* ptr_, size_t cnt_, hapiStream_t hapi_stream_)
     : ptr(ptr_), cnt(cnt_), cb(CkCallback(CkCallback::ignore)),
-      cuda_stream(cuda_stream_) {
+      hapi_stream(hapi_stream_) {
     init();
   }
 
   explicit CkDevicePersistent(const void* ptr_, size_t cnt_, const CkCallback& cb_,
-      cudaStream_t cuda_stream_)
-    : ptr(ptr_), cnt(cnt_), cb(cb_), cuda_stream(cuda_stream_) {
+      hapiStream_t hapi_stream_)
+    : ptr(ptr_), cnt(cnt_), cb(cb_), hapi_stream(hapi_stream_) {
     init();
   }
 
@@ -62,10 +62,10 @@ struct CkDevicePersistent {
 
 struct CkDeviceBufferPost {
   // CUDA stream for device transfers
-  cudaStream_t cuda_stream;
+  hapiStream_t hapi_stream;
 
   // Use per-thread stream by default
-  CkDeviceBufferPost() : cuda_stream(cudaStreamPerThread) {}
+  CkDeviceBufferPost() : hapi_stream(hapiStreamPerThread) {}
 };
 
 class CkDeviceBuffer : public CmiDeviceBuffer {
@@ -85,14 +85,14 @@ class CkDeviceBuffer : public CmiDeviceBuffer {
     cb = cb_;
   }
 
-  explicit CkDeviceBuffer(const void* ptr_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, 0) {
+  explicit CkDeviceBuffer(const void* ptr_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, 0) {
     cb = CkCallback(CkCallback::ignore);
-    cuda_stream = cuda_stream_;
+    hapi_stream = hapi_stream_;
   }
 
-  explicit CkDeviceBuffer(const void* ptr_, const CkCallback& cb_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, 0) {
+  explicit CkDeviceBuffer(const void* ptr_, const CkCallback& cb_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, 0) {
     cb = cb_;
-    cuda_stream = cuda_stream_;
+    hapi_stream = hapi_stream_;
   }
 
   explicit CkDeviceBuffer(const void* ptr_, size_t cnt_) : CmiDeviceBuffer(ptr_, cnt_) {
@@ -103,14 +103,14 @@ class CkDeviceBuffer : public CmiDeviceBuffer {
     cb = cb_;
   }
 
-  explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, cnt_) {
+  explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, cnt_) {
     cb = CkCallback(CkCallback::ignore);
-    cuda_stream = cuda_stream_;
+    hapi_stream = hapi_stream_;
   }
 
-  explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, const CkCallback& cb_, cudaStream_t cuda_stream_) : CmiDeviceBuffer(ptr_, cnt_) {
+  explicit CkDeviceBuffer(const void* ptr_, size_t cnt_, const CkCallback& cb_, hapiStream_t hapi_stream_) : CmiDeviceBuffer(ptr_, cnt_) {
     cb = cb_;
-    cuda_stream = cuda_stream_;
+    hapi_stream = hapi_stream_;
   }
 
   void pup(PUP::er &p) {
@@ -121,14 +121,16 @@ class CkDeviceBuffer : public CmiDeviceBuffer {
   friend void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs);
 };
 
-#if !CMK_GPU_COMM
-void CkRdmaDeviceRecvHandler(void* data, void* msg);
-#else
 void CkRdmaDeviceRecvHandler(void* data);
-#endif
+void CkRdmaDeviceRecvHandler(void* data, void* msg);
 void CkRdmaDeviceIssueRgets(envelope *env, int numops, void **arrPtrs, int *arrSizes, CkDeviceBufferPost *postStructs);
 void CkRdmaDeviceOnSender(int dest_pe, int numops, CkDeviceBuffer** buffers);
 
+extern "C" {
+  void* loopback_bridge(void* arg);
+  extern int loopback_handler;
+}
+
 #endif // CMK_CUDA
 
 #endif // _CKRDMADEVICE_H_
diff --git a/src/ck-core/ckreduction.C b/src/ck-core/ckreduction.C
index c4f6e33f56..ead38b3de1 100644
--- a/src/ck-core/ckreduction.C
+++ b/src/ck-core/ckreduction.C
@@ -51,6 +51,7 @@ waits for the migrant contributions to straggle in.
 
 #include "charm++.h"
 #include "ck.h"
+#include "ckrescale.h"
 
 #include "pathHistory.h"
 
@@ -83,7 +84,6 @@ waits for the migrant contributions to straggle in.
 #define INT_MAX 2147483647
 #endif
 
-extern bool _inrestart;
 #if CMK_CHARM4PY
 //define a global instance of CkReductionTypesExt for external access
 CkReductionTypesExt charm_reducers;
@@ -92,7 +92,7 @@ extern int (*PyReductionExt)(char**, int*, int, char**);
 
 Group::Group():thisIndex(CkMyPe())
 {
-	if (_inrestart) CmiAbort("A Group object did not call the migratable constructor of its base class!");
+	if (get_in_restart()) CmiAbort("A Group object did not call the migratable constructor of its base class!");
 
 	creatingContributors();
 	contributorStamped(&reductionInfo);
diff --git a/src/ck-core/init.C b/src/ck-core/init.C
index 3190ea818c..b6ea5e6f37 100644
--- a/src/ck-core/init.C
+++ b/src/ck-core/init.C
@@ -65,6 +65,7 @@ never be excluded...
 
 #include "ckcheckpoint.h"
 #include "ck.h"
+#include "ckrescale.h"
 #include "trace.h"
 #include "ckrdma.h"
 #include "CkCheckpoint.decl.h"
@@ -78,7 +79,8 @@ never be excluded...
 #include "TreeLB.h"
 #endif
 
-#if CMK_CUDA
+#define CMK_GPU_COMM 1 // NOTE(review): forced on in this file ahead of the GPU #if blocks below; confirm the build configuration does not define it too and that other files see the same value
+#if CMK_CUDA || CMK_HIP
 #include "hapi_impl.h"
 #include "ckrdmadevice.h"
 
@@ -155,10 +157,12 @@ int   _infoIdx;
 int   _charmHandlerIdx;
 int   _initHandlerIdx;
 int   _roRestartHandlerIdx;
+int   _shrinkExpandRestartHandlerIdx;
 int   _bocHandlerIdx;
 int   _qdHandlerIdx;
 int   _qdCommHandlerIdx;
 int   _triggerHandlerIdx;
+
 bool  _mainDone = false;
 CksvDeclare(bool, _triggersSent);
 
@@ -683,15 +687,16 @@ static void _exitHandler(envelope *env)
       }
       else
         CmiFree(env);
-#if CMK_SHRINK_EXPAND
-      ConverseCleanup();
-#endif
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
       // Clean up HAPI
       hapiExit();
 #endif
 
+#if CMK_SHRINK_EXPAND
+      ConverseCleanup();
+#endif
+
       //everyone exits here - there may be issues with leftover messages in the queue
 #if !CMK_WITH_STATS && !CMK_WITH_WARNINGS
       DEBUGF(("[%d] Calling converse exit from ReqStatMsg \n",CkMyPe()));
@@ -773,7 +778,7 @@ static inline void _processBufferedBocInits(void)
     envelope *env = inits[i];
     if(env==0) {
 #if CMK_SHRINK_EXPAND
-      if(_inrestart){
+      if(get_in_restart()){
         CkPrintf("_processBufferedBocInits: empty message in restart, ignoring\n");
         break;
       }
@@ -1445,6 +1450,7 @@ void _initCharm(int unused_argc, char **argv)
 #if CMK_SHRINK_EXPAND
 	// for shrink expand cleanup
 	CmiAssignOnce(&_ROGroupRestartHandlerIdx, CkRegisterHandler(_ROGroupRestartHandler));
+	CmiAssignOnce(&_shrinkExpandRestartHandlerIdx, CkRegisterHandler(CkRecvGroupROData));
 #endif
 
 	_infoIdx = CldRegisterInfoFn((CldInfoFn)_infoFn);
@@ -1481,8 +1487,8 @@ void _initCharm(int unused_argc, char **argv)
 	// Set the ack handler function used for the direct nocopy api
 	CmiSetDirectNcpyAckHandler(CkRdmaDirectAckHandler);
 
-#if CMK_CUDA && CMK_GPU_COMM
-	CmiRdmaDeviceRecvInit(CkRdmaDeviceRecvHandler);
+#if (CMK_CUDA || CMK_HIP) && CMK_GPU_COMM
+  loopback_handler = CmiRegisterHandler((CmiHandler) loopback_bridge);
 #endif
 
 #if CMK_USE_SHMEM
@@ -1691,9 +1697,10 @@ void _initCharm(int unused_argc, char **argv)
         }
     }
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
   // Perform HAPI initialization for GPU support
   hapiInit(argv);
+  //hapiStartMemoryDaemon();
 
   // Initialize Charm++ layer functions
   hapiInvokeCallback = CUDACallbackManager;
@@ -1806,6 +1813,7 @@ void _initCharm(int unused_argc, char **argv)
 		// NOTE: this assumes commthreads will not block from this point on
 	}
 
+
 	DEBUGF(("[%d,%d%.6lf] inCommThread %d\n",CmiMyPe(),CmiMyRank(),CmiWallTimer(),inCommThread));
 	// when I am a communication thread, I don't participate initDone.
         if (inCommThread) {
@@ -1829,7 +1837,6 @@ void _initCharm(int unused_argc, char **argv)
                 readKillFile();                                        
         }
 #endif
-
 }
 
 int charm_main(int argc, char **argv)
diff --git a/src/ck-core/init.h b/src/ck-core/init.h
index cf2a383b2b..12b73a3abd 100644
--- a/src/ck-core/init.h
+++ b/src/ck-core/init.h
@@ -137,6 +137,7 @@ extern int     _charmHandlerIdx;
 extern int     _roRestartHandlerIdx;     /* for checkpoint/restart */
 #if CMK_SHRINK_EXPAND
 extern int     _ROGroupRestartHandlerIdx;     /* for checkpoint/restart */
+extern int     _shrinkExpandRestartHandlerIdx;
 #endif
 extern int     _bocHandlerIdx;
 extern int     _qdHandlerIdx;
diff --git a/src/ck-ldb/BaseLB.h b/src/ck-ldb/BaseLB.h
index e7b6683d7f..2c057bac3b 100644
--- a/src/ck-ldb/BaseLB.h
+++ b/src/ck-ldb/BaseLB.h
@@ -53,9 +53,17 @@ class BaseLB: public CBase_BaseLB
     // double utilization;
     int pe;			// processor id
     bool available;
+#if CMK_CUDA || CMK_HIP
+    size_t gpu_mem_remaining;
+    size_t pool_buff_mem_remaining;
+    uint64_t gpu_device_id;		// GPU device this PE is mapped to (-1 = no GPU)
+#endif
     ProcStats(): n_objs(0), pe_speed(1), total_walltime(0.0), idletime(0.0),
 #if CMK_LB_CPUTIMER
 		 total_cputime(0.0), bg_cputime(0.0),
+#endif
+#if CMK_CUDA || CMK_HIP
+	   	 gpu_device_id(-1), gpu_mem_remaining(0), pool_buff_mem_remaining(0),
 #endif
 	   	 bg_walltime(0.0), pe(-1), available(true) {}
     inline void clearBgLoad() {
@@ -78,7 +86,12 @@ class BaseLB: public CBase_BaseLB
          double dummy;  p|dummy;    // for old format with utilization
       }
       p|available; p|n_objs;
-      if (_lb_args.lbversion()>=2) p|pe; 
+      if (_lb_args.lbversion()>=2) p|pe;
+#if CMK_CUDA || CMK_HIP
+      p|gpu_mem_remaining;
+      p|pool_buff_mem_remaining;
+      p|gpu_device_id;
+#endif
     }
   };
 
diff --git a/src/ck-ldb/CentralLB.C b/src/ck-ldb/CentralLB.C
index dbc3e6a60d..768957e75d 100644
--- a/src/ck-ldb/CentralLB.C
+++ b/src/ck-ldb/CentralLB.C
@@ -10,6 +10,17 @@
 #include "envelope.h"
 #include "CentralLB.h"
 #include "LBSimulation.h"
+#if CMK_CUDA || CMK_HIP
+#if CMK_CUDA
+#include <cupti.h>
+#endif
+#include "gpumanager.h"
+// extern void hapiProcessCuptiBuffers();
+// extern void hapiClearCuptiData();
+CsvExtern(GPUManager, gpu_manager);
+CkpvExtern(int, _lb_obj_index);
+#include "hapi.h"
+#endif
 
 #define  DEBUGF(x)       // CmiPrintf x;
 #define  DEBUG(x)        // x;
@@ -37,14 +48,13 @@ extern "C" void charmrun_realloc(char *s);
 extern char willContinue;
 extern realloc_state pending_realloc_state;
 extern char * se_avail_vector;
-extern "C" int mynewpe;
+extern int mynewpe;
 extern char *_shrinkexpand_basedir;
-int numProcessAfterRestart;
-int mynewpe=0;
+extern int numProcessAfterRestart;
 #endif
 CkGroupID loadbalancer;
 int * lb_ptr;
-bool load_balancer_created;
+extern bool load_balancer_created;
 
 static void lbinit()
 {
@@ -71,6 +81,10 @@ void CentralLB::initLB(const CkLBOptions &opt)
   if (opt.getSeqNo() > 0 || (_lb_args.metaLbOn() && _lb_args.metaLbModelDir() != nullptr))
     turnOff();
 
+  #if (CMK_CUDA || CMK_HIP) && CMK_LB_USER_DATA
+  CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(size_t)); // one size_t per object: gpu allocation size
+  #endif // parenthesised: && binds tighter than ||, so CUDA builds used to skip the CMK_LB_USER_DATA test
+
   stats_msg_count = 0;
   statsMsgsList = NULL;
   statsData = NULL;
@@ -130,11 +144,11 @@ int CentralLB::GetPESpeed()
   return myspeed;
 }
 
-void CentralLB::InvokeLB()
+void CentralLB::CallLB()
 {
-#if CMK_LBDB_ON
+  #if CMK_LBDB_ON
   DEBUGF(("[%d] CentralLB AtSync step %d!!!!!\n",CkMyPe(),step()));
-#if CMK_MEM_CHECKPOINT	
+#if CMK_MEM_CHECKPOINT
   CkSetInLdb();
 #endif
 
@@ -143,12 +157,42 @@ void CentralLB::InvokeLB()
     MigrationDone(0);
     return;
   }
+  
+#if CMK_CUDA || CMK_HIP
+#if CMK_SMP
+  CmiNodeBarrier();  // all ranks of the node arrive here before rank 0 flushes (presumably so no rank is still issuing GPU work; TODO confirm)
+#endif
+  if (CmiMyRank() == 0)
+  {
+#if CMK_CUDA
+    // Synchronous flush of the CUPTI records that are already finished; partial records are not waited for.
+    cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
+    hapiProcessCuptiBuffers();
+#endif
+  }
+#if CMK_SMP
+  CmiNodeBarrier();  // ensure rank 0 finishes buffer processing before other ranks read the map
+#endif
+  // Every PE matches its own objects against the shared per-process CUPTI map
+  lbmgr->SetObjGPULoad(CsvAccess(gpu_manager).cupti_obj_gpu_times_);
+#endif
+
   {
     thisProxy [CkMyPe()].ProcessAtSync();
   }
 #endif
 }
 
+void CentralLB::InvokeLB() {
+  lbmgr->lb_in_progress = true;  // cleared again at the end of ResumeClients(int)
+#if !CMK_SHRINK_EXPAND
+  CallLB();
+#else
+  const CkCallback toCheckForLB(CkReductionTarget(CentralLB, CheckForLB), thisProxy[0]);
+  contribute(toCheckForLB);  // empty reduction whose target, CheckForLB, is element 0
+#endif
+}
+
 void CentralLB::ProcessAtSync()
 {
 #if CMK_LBDB_ON
@@ -306,16 +350,35 @@ void CentralLB::BuildStatsMsg()
   msg->pe_speed = myspeed;
 #endif
 
-  DEBUGF(("Processor %d Total time (wall,cpu) = %f %f Idle = %f Bg = %f %f\n", CkMyPe(),msg->total_walltime,msg->total_cputime,msg->idletime,msg->bg_walltime,msg->bg_cputime));
+#if CMK_CUDA || CMK_HIP
+  // Attach this PE's GPU device id and memory headroom to the stats message.
+  msg->gpu_device_id = hapiMyDevice();
+  size_t freeMem = 0, totalMem = 0;  // zero, never indeterminate, if the query leaves them unwritten
+  hapiMemGetInfo(&freeMem, &totalMem);
+  msg->gpu_mem_remaining = freeMem;
+  GPUManager& csv_gpu_manager = CsvAccess(gpu_manager);
+  if (csv_gpu_manager.use_shm) {
+    // Presumably the free space of this PE's load-balancing buffer pool; verify against DeviceManager.
+    DeviceManager* dm = csv_gpu_manager.device_map[CkMyPe()];
+    msg->pool_buff_mem_remaining = dm->get_lb_buffer_free_size();
+  } else {
+    // NOTE(review): the author marked this branch "should not run"; confirm whether
+    // use_shm can actually be false when statistics are built.
+    msg->pool_buff_mem_remaining = 0;
+  }
+#endif
+
+  DEBUGF(("Processor %d Total time (wall,cpu) = %f Idle = %f Bg = %f\n", CkMyPe(),msg->total_walltime,msg->idletime,msg->bg_walltime));
 
   msg->objData.resize(osz);
   lbmgr->GetObjData(msg->objData.data());
   msg->commData.resize(csz);
   lbmgr->GetCommData(msg->commData.data());
 //  lbmgr->ClearLoads();
-  DEBUGF(("PE %d BuildStatsMsg %d objs, %d comm\n",CkMyPe(),msg->n_objs,msg->n_comm));
+  DEBUGF(("PE %d BuildStatsMsg %zu objs, %zu comm\n",CkMyPe(),msg->objData.size(),msg->commData.size())); // %zu: both counts come from size()
 
   if(CkMyPe() == cur_ld_balancer) {
+    int count_avail = 0;
     lbmgr->get_avail_vector(msg->avail_vector);
     msg->next_lb = LBManagerObj()->new_lbbalancer();
   }
@@ -435,6 +498,11 @@ void CentralLB::depositData(CLBStatsMsg *m)
   procStat.bg_cputime = m->bg_cputime;
 #endif
   procStat.pe_speed = m->pe_speed;
+#if CMK_CUDA || CMK_HIP
+  procStat.gpu_device_id = m->gpu_device_id;
+  procStat.gpu_mem_remaining = m->gpu_mem_remaining;
+  procStat.pool_buff_mem_remaining = m->pool_buff_mem_remaining;
+#endif
 
   //procStat.utilization = 1.0;
   procStat.available = true;
@@ -510,6 +578,11 @@ void CentralLB::ReceiveStats(CkMarshalledCLBStatsMessage &&msg)
       procStat.bg_cputime = m->bg_cputime;
 #endif
       procStat.pe_speed = m->pe_speed;
+#if CMK_CUDA || CMK_HIP
+      procStat.gpu_device_id = m->gpu_device_id;
+      procStat.gpu_mem_remaining = m->gpu_mem_remaining;
+      procStat.pool_buff_mem_remaining = m->pool_buff_mem_remaining;
+#endif
       //procStat.utilization = 1.0;
       procStat.available = true;
       procStat.n_objs = msg_n_objs;
@@ -985,6 +1058,7 @@ void CentralLB::ProcessMigrationDecision() {
 
 void CentralLB::ProcessReceiveMigration()
 {
+  // CmiPrintf("[%d] ProcessReceiveMigration\n", CkMyPe());
 #if CMK_LBDB_ON
 	int i;
         LBMigrateMsg *m = storedMigrateMsg;
@@ -997,6 +1071,7 @@ void CentralLB::ProcessReceiveMigration()
   CmiAssert(migrates_expected <= 0 || migrates_completed == migrates_expected);
   migrates_expected = 0;
   future_migrates_expected = 0;
+  // CmiPrintf("[%d] ProcessReceiveMigration: n_moves=%d\n", CkMyPe(), m->n_moves);
   for(i=0; i < m->n_moves; i++) {
     MigrateInfo& move = m->moves[i];
     const int me = CkMyPe();
@@ -1021,11 +1096,11 @@ void CentralLB::ProcessReceiveMigration()
       else future_migrates_expected++;
     }
     else {
-#if CMK_GLOBAL_LOCATION_UPDATE      
-      UpdateLocation(move); 
-#endif
+      #if CMK_GLOBAL_LOCATION_UPDATE
+      // CmiPrintf("[%d] Updating location for obj id=%llu from %d to %d\n", CkMyPe(), move.obj.id, move.from_pe, move.to_pe);
+        UpdateLocation(move);
+      #endif
     }
-
   }
 
   DEBUGF(("[%d] in ReceiveMigration %d moves expected: %d future expected: %d\n",CkMyPe(),m->n_moves, migrates_expected, future_migrates_expected));
@@ -1049,52 +1124,58 @@ void CentralLB::ProcessReceiveMigration()
 #endif
 }
 
+// Reduction target used by InvokeLB() when shrink/expand is compiled in; the
+// reduction is addressed to element 0. Chooses between servicing a pending
+// expand request and starting an ordinary load-balancing round.
+void CentralLB::CheckForLB() {
+#if CMK_SHRINK_EXPAND
+  // A pending expand is handled first: CheckForRealloc() is called locally and
+  // no load-balancing round is broadcast from here.
+  if (pending_realloc_state == EXPAND_MSG_RECEIVED) {
+    CheckForRealloc();
+    return;
+  }
+#endif
+  // Every other state (and every build without shrink/expand): broadcast
+  // CallLB() to all members of the group.
+  thisProxy.CallLB();
+}
+
 // We assume that bit vector would have been aptly set async by either scheduler or charmrun.
 void CentralLB::CheckForRealloc(){
 #if CMK_SHRINK_EXPAND
-   if(pending_realloc_state == REALLOC_MSG_RECEIVED) {
-        pending_realloc_state = REALLOC_IN_PROGRESS; //in progress
-        CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe());
-        double end_lb_time = CkWallTimer();
-        CkPrintf("CharmLB> %s: PE [%d] step %d finished at %f duration %f s\n\n",
-            lbname, cur_ld_balancer, step()-1, end_lb_time,	end_lb_time-start_lb_time);
-        // do checkpoint
-        CkCallback cb(CkIndex_CentralLB::ResumeFromReallocCheckpoint(), thisProxy[0]);
-        CkStartCheckpoint(_shrinkexpand_basedir, cb);
-    }
-    else{
-        thisProxy.MigrationDoneImpl(1);
-    }
+  if(pending_realloc_state != NO_REALLOC) {
+    pending_realloc_state = (pending_realloc_state == SHRINK_MSG_RECEIVED) ? SHRINK_IN_PROGRESS : EXPAND_IN_PROGRESS; // any state other than SHRINK_MSG_RECEIVED becomes EXPAND_IN_PROGRESS
+    CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe());
+    double end_lb_time = CkWallTimer();
+    CkPrintf("CharmLB> %s: PE [%d] step %d finished at %f duration %f s\n\n",
+        lbname, cur_ld_balancer, step()-1, end_lb_time,	end_lb_time-start_lb_time);
+    // Rescale checkpoint: receives a copy of the first CkNumPes() entries of se_avail_vector; cb targets ResumeFromReallocCheckpoint() on element 0
+    CkCallback cb(CkIndex_CentralLB::ResumeFromReallocCheckpoint(), thisProxy[0]);
+    CkStartRescaleCheckpoint(_shrinkexpand_basedir, cb, 
+      std::vector<char>(se_avail_vector, se_avail_vector + CkNumPes()));
+  } else {
+    thisProxy.MigrationDoneImpl(1); // no reallocation pending: broadcast MigrationDoneImpl(1) to the whole group
+  }
 #endif
 }
 
 void CentralLB::ResumeFromReallocCheckpoint(){
 #if CMK_SHRINK_EXPAND
-    const int count = CkNumPes();
-    std::vector<char> avail(se_avail_vector, se_avail_vector + count);
-    memset(se_avail_vector, 0, sizeof(char) * count);
+    CkPrintf("Resumed from realloc\n");
+    std::vector<char> avail(se_avail_vector, se_avail_vector + CkNumPes()); // copy of the first CkNumPes() availability flags
+    // NOTE(review): se_avail_vector is neither cleared nor freed after this copy; confirm nothing reads the stale contents later
     thisProxy.WillIbekilled(avail, numProcessAfterRestart);
 #endif
 }
 
-
-
-#if CMK_SHRINK_EXPAND
-int GetNewPeNumber(std::vector<char> avail){
-  int mype = CkMyPe();
-  int count =0;
-  for (int i =0; i <mype; i++){
-    if(avail[i] ==0) count++;
-  }
-  return (mype - count);
-}
-#endif
-
 void CentralLB::WillIbekilled(std::vector<char> avail, int newnumProcessAfterRestart){
 #if CMK_SHRINK_EXPAND
  numProcessAfterRestart = newnumProcessAfterRestart;
  mynewpe =  GetNewPeNumber(avail);
+ //CkPrintf("[%d] -> new pe %d\n", CkMyPe(), mynewpe);
  willContinue = avail[CkMyPe()];
+ //CkPrintf("PE%i> Sending start cleanup reduction\n", CkMyPe());
  CkCallback cb(CkIndex_CentralLB::StartCleanup(), thisProxy[0]);
  contribute(cb);
 #endif
@@ -1102,9 +1183,12 @@ void CentralLB::WillIbekilled(std::vector<char> avail, int newnumProcessAfterRes
 
 void CentralLB::StartCleanup(){
 #if CMK_SHRINK_EXPAND
-		CkCleanup();
+  // Target of the reduction contributed in WillIbekilled() (addressed to element 0),
+  // where each PE first records mynewpe and willContinue.
+	CkCleanup();
 #endif
 }
+
 void CentralLB::MigrationDone(int balancing)
 {
 #if CMK_SHRINK_EXPAND
@@ -1116,6 +1200,7 @@ void CentralLB::MigrationDone(int balancing)
     MigrationDoneImpl(balancing);
 #endif
 }
+
 void CentralLB::MigrationDoneImpl (int balancing)
 {
 
@@ -1124,6 +1209,10 @@ void CentralLB::MigrationDoneImpl (int balancing)
   migrates_expected = -1;
   // clear load stats
   if (balancing) lbmgr->ClearLoads();
+#if CMK_CUDA || CMK_HIP
+  if (CmiMyRank() == 0)
+    hapiClearCuptiData();
+#endif
   // Increment to next step
   lbmgr->incStep();
 	DEBUGF(("[%d] Incrementing Step %d \n",CkMyPe(),step()));
@@ -1158,7 +1247,7 @@ void CentralLB::ResumeClients()
 void CentralLB::ResumeClients(int balancing)
 {
 #if CMK_LBDB_ON
-  DEBUGF(("[%d] Resuming clients. balancing:%d.\n",CkMyPe(),balancing));
+  //CkPrintf("[%d] Resuming clients. balancing:%d.\n",CkMyPe(),balancing);
 
   lbmgr->ResumeClients();
   if (balancing)  {
@@ -1169,6 +1258,10 @@ void CentralLB::ResumeClients(int balancing)
       CheckMigrationComplete();
     }
   }
+  lbmgr->lb_in_progress = false;
+
+  if (CkMyPe() == 0)
+    lbmgr->callRealloc();
 #endif
 }
 
@@ -1652,6 +1745,11 @@ CLBStatsMsg::~CLBStatsMsg() {
 void CLBStatsMsg::pup(PUP::er &p) {
   p|from_pe;
   p|pe_speed;
+#if CMK_CUDA || CMK_HIP
+  p|gpu_device_id;
+  p|gpu_mem_remaining;
+  p|pool_buff_mem_remaining;
+#endif
   p|total_walltime;
   p|idletime;
 #if defined(TEMP_LDB)
diff --git a/src/ck-ldb/CentralLB.ci b/src/ck-ldb/CentralLB.ci
index 59694ce685..acab70ec82 100644
--- a/src/ck-ldb/CentralLB.ci
+++ b/src/ck-ldb/CentralLB.ci
@@ -31,11 +31,13 @@ group [migratable] CentralLB : BaseLB {
   entry [reductiontarget] void ProcessReceiveMigration();
   entry [reductiontarget] void ProcessMigrationDecision();
   entry void MissMigrate(int);
+  entry void CallLB();
   entry void CheckForRealloc();
   entry void ResumeFromReallocCheckpoint();
   entry void MigrationDoneImpl(int);
   entry void WillIbekilled(std::vector <char> avail, int);
-  entry void StartCleanup();
+  entry [reductiontarget] void StartCleanup();
+  entry [reductiontarget] void CheckForLB();
 };
 
 };
diff --git a/src/ck-ldb/CentralLB.h b/src/ck-ldb/CentralLB.h
index bbc9a5c140..8a4366b080 100644
--- a/src/ck-ldb/CentralLB.h
+++ b/src/ck-ldb/CentralLB.h
@@ -12,6 +12,7 @@
 #include <vector>
 #include "pup_stl.h"
 #include "manager.h"
+#include "ckcheckpoint.h"
 extern CkGroupID loadbalancer;
 
 void CreateCentralLB();
@@ -96,6 +97,7 @@ class CentralLB : public CBase_CentralLB
   int GetPESpeed();
   inline void setConcurrent(bool c) { concurrent = c; }
 
+  void CallLB();
   void InvokeLB(); // Everything is at the PE barrier
   void ProcessAtSync(void); // Receive a message from AtSync to avoid
                             // making projections output look funny
@@ -121,6 +123,7 @@ class CentralLB : public CBase_CentralLB
   void MissMigrate(int waitForBarrier);
 
   //Shrink-Expand related functions
+  void CheckForLB();
   void CheckForRealloc ();
   void ResumeFromReallocCheckpoint();
   void MigrationDoneImpl (int );
@@ -283,6 +286,11 @@ class CLBStatsMsg {
 
   int from_pe;
   int pe_speed;
+#if CMK_CUDA || CMK_HIP
+  size_t gpu_mem_remaining;
+  size_t pool_buff_mem_remaining;
+  uint64_t gpu_device_id;
+#endif
   LBRealType total_walltime;
   LBRealType idletime;
   LBRealType bg_walltime;
@@ -298,7 +306,11 @@ class CLBStatsMsg {
 
 public:
   CLBStatsMsg(int osz, int csz);
-  CLBStatsMsg(): from_pe(0), pe_speed(0), total_walltime(0.0), idletime(0.0),
+  CLBStatsMsg(): from_pe(0), pe_speed(0),
+#if CMK_CUDA || CMK_HIP
+		 gpu_mem_remaining(0), pool_buff_mem_remaining(0), gpu_device_id(-1), // listed in declaration order; -1 wraps to the largest uint64_t
+#endif
+		 total_walltime(0.0), idletime(0.0),
 		 bg_walltime(0.0),
 #if defined(TEMP_LDB)
 		pe_temp(1.0),
diff --git a/src/ck-ldb/CommonLBs.ci b/src/ck-ldb/CommonLBs.ci
index 436ffa6729..f76b21f308 100644
--- a/src/ck-ldb/CommonLBs.ci
+++ b/src/ck-ldb/CommonLBs.ci
@@ -5,6 +5,8 @@ module CommonLBs {
   extern module DistributedLB;
   extern module MetisLB;
   extern module RecBipartLB;
+  extern module GreedyCentralLB;
+  extern module GreedyRefineCentralLB;
 
   initnode void initCommonLBs(void);
 };
diff --git a/src/ck-ldb/EveryLB.ci b/src/ck-ldb/EveryLB.ci
index a634d9c9e0..4bfc0fafb0 100644
--- a/src/ck-ldb/EveryLB.ci
+++ b/src/ck-ldb/EveryLB.ci
@@ -5,6 +5,8 @@ module EveryLB {
   extern module DistributedLB;
   extern module MetisLB;
   extern module RecBipartLB;
+  extern module GreedyCentralLB;
+  extern module GreedyRefineCentralLB;
 
   initnode void initEveryLB(void);
 };
diff --git a/src/ck-ldb/GreedyCentralLB.C b/src/ck-ldb/GreedyCentralLB.C
new file mode 100644
index 0000000000..e82d0dd626
--- /dev/null
+++ b/src/ck-ldb/GreedyCentralLB.C
@@ -0,0 +1,323 @@
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+/*
+ status:
+  * support processor avail bitvector
+  * support nonmigratable attrib
+      nonmigratable object load is added to its processor's background load
+      and the nonmigratable object is not taken in the objData array
+*/
+
+#include <algorithm>
+#include <unordered_map>
+
+#include "charm++.h"
+
+
+#include "ckgraph.h"
+#include "cklists.h"
+#include "GreedyCentralLB.h"
+#include "conv-mach-cuda.h"
+#include "conv-mach-hip.h"
+
+using namespace std;
+
+extern int quietModeRequested;
+
+CreateLBFunc_Def(GreedyCentralLB, "always assign the heaviest obj onto lightest loaded processor.")
+
+GreedyCentralLB::GreedyCentralLB(const CkLBOptions &opt): CBase_GreedyCentralLB(opt)
+{ // Names the strategy and, on PE 0 only, announces it unless quiet mode was requested.
+  lbname = "GreedyCentralLB";
+  const bool announce = (CkMyPe() == 0) && !quietModeRequested;
+  if (announce) CkPrintf("CharmLB> GreedyCentralLB created.\n");
+}
+
+bool GreedyCentralLB::QueryBalanceNow(int _step)
+{
+  // Always answers true; _step is not consulted.
+  return true;
+}
+
+class GreedyCentralLB::ProcLoadGreater {  // strict "greater" on total load; with the std heap algorithms the lightest processor ends up in front
+  public:
+    bool operator()(const ProcInfo &p1, const ProcInfo &p2) const {  // const: callable through a const comparator
+      return (p1.getTotalLoad() > p2.getTotalLoad());
+    }
+};
+
+class GreedyCentralLB::ObjLoadGreater {  // strict "greater" on computation load; sorting with it puts the heaviest object first
+  public:
+    bool operator()(const CkVertex &v1, const CkVertex &v2) const {  // const: callable through a const comparator
+      return (v1.getCompLoad() > v2.getCompLoad());
+    }
+};
+
+#if CMK_CUDA || CMK_HIP
+// A group of PEs that share the same GPU device.
+// Load balancing reasons at this level: the GPU is the bottleneck,
+// so we distribute objects across GPUs, not across individual PEs.
+struct GPUGroup {
+  uint64_t gpu_id;                 // GPU device id, kept at the width of ProcStats::gpu_device_id (no narrowing)
+  double totalLoad;                // sum of getTotalLoad() over the member PEs, plus every load assigned to the group
+  std::vector<int> pe_indices;     // indices into the procs vector
+};
+#endif
+
+void GreedyCentralLB::work(LDStats* stats)
+{
+  int  obj, objCount, pe;
+  int n_pes = stats->nprocs();
+  int *map = new int[n_pes];
+
+  std::vector<ProcInfo>  procs;
+  for(pe = 0; pe < n_pes; pe++) {
+    map[pe] = -1;
+    if (stats->procs[pe].available) {
+      map[pe] = procs.size();
+      procs.push_back(ProcInfo(pe, stats->procs[pe].bg_walltime, 0.0, stats->procs[pe].pe_speed, true));
+    }
+  }
+
+  // take non migratable object load as background load
+  for (obj = 0; obj < stats->objData.size(); obj++)
+  {
+      LDObjData &oData = stats->objData[obj];
+      if (!oData.migratable)  {
+        int pe = stats->from_proc[obj];
+        pe = map[pe];
+        if (pe==-1)
+          CmiAbort("GreedyCentralLB: nonmigratable object on an unavail processor!\n");
+#if CMK_CUDA || CMK_HIP
+        procs[pe].setOverhead(procs[pe].getOverhead() + std::max(oData.wallTime, oData.gpuTime));
+#else
+        procs[pe].setOverhead(procs[pe].getOverhead() + oData.wallTime);
+#endif
+      }
+  }
+  delete [] map;
+
+  // Add the overhead to the total load
+  for (pe = 0; pe<procs.size(); pe++) {
+    procs[pe].setTotalLoad(procs[pe].getTotalLoad() + procs[pe].getOverhead());
+  }
+
+  // build object array
+  std::vector<CkVertex> objs;
+
+  for(int obj = 0; obj < stats->objData.size(); obj++) {
+    LDObjData &oData = stats->objData[obj];
+    int pe = stats->from_proc[obj];
+    if (!oData.migratable) {
+      if (!stats->procs[pe].available)
+        CmiAbort("GreedyCentralLB cannot handle nonmigratable object on an unavial processor!\n");
+      continue;
+    }
+#if CMK_CUDA || CMK_HIP
+    // Use whichever is the bottleneck: CPU wall time or GPU kernel time
+    double load = std::max(oData.wallTime, oData.gpuTime) * stats->procs[pe].pe_speed;
+    CkPrintf("[%d] GreedyCentralLB obj %d (PE %d): gpuTime=%.6f wallTime=%.6f load=%.6f\n",
+             CkMyPe(), obj, pe, oData.gpuTime, oData.wallTime, load);
+#else
+    double load = oData.wallTime * stats->procs[pe].pe_speed;
+#endif
+    objs.push_back(CkVertex(obj, load, stats->objData[obj].migratable, stats->from_proc[obj]));
+  }
+
+  // max heap of objects (heaviest first)
+  sort(objs.begin(), objs.end(), GreedyCentralLB::ObjLoadGreater());
+
+  if (_lb_args.debug()>1)
+    CkPrintf("[%d] In GreedyCentralLB strategy\n",CkMyPe());
+
+  int nmoves = 0;
+
+#if CMK_CUDA || CMK_HIP
+  // ---- GPU-aware greedy: balance across GPU groups, not individual PEs ----
+
+  // Build GPU groups: map gpu_device_id -> GPUGroup
+  // With typical counts (2-8 GPUs), linear scan beats a heap and avoids
+  // heap-invariant headaches when we update a non-front group in place.
+  std::vector<GPUGroup> gpuGroups;
+  std::unordered_map<uint64_t, int> gpuIdToIdx;  // gpu_device_id -> index in gpuGroups
+  CkPrintf("starting stratergy for GPU\n");
+  for (int i = 0; i < (int)procs.size(); i++) {
+    int real_pe = procs[i].getProcId();
+    uint64_t gpu_id = stats->procs[real_pe].gpu_device_id;
+    printf("gpu_id %lld\n", (long long)gpu_id);  // cast matches the conversion on every ABI; an id of (uint64_t)-1 still prints as -1
+    fflush(stdout);
+
+    auto it = gpuIdToIdx.find(gpu_id);
+    if (it == gpuIdToIdx.end()) {
+      // First PE seen on this device: open a new group seeded with this PE's load.
+      gpuIdToIdx[gpu_id] = gpuGroups.size();
+      GPUGroup g;
+      g.gpu_id = gpu_id;
+      g.totalLoad = procs[i].getTotalLoad();
+      g.pe_indices.push_back(i);
+      gpuGroups.push_back(std::move(g));
+    } else {
+      gpuGroups[it->second].totalLoad += procs[i].getTotalLoad();
+      gpuGroups[it->second].pe_indices.push_back(i);
+    }
+  }
+
+  // Reverse map: real PE -> index in gpuGroups
+  std::unordered_map<int, int> peToGroupIdx;
+  for (int gi = 0; gi < (int)gpuGroups.size(); gi++) {
+    for (int pidx : gpuGroups[gi].pe_indices) {
+      peToGroupIdx[procs[pidx].getProcId()] = gi;
+    }
+  }
+
+  CkPrintf("[%d] GreedyCentralLB: %d GPU group(s), %d available PEs, %d migratable objs\n",
+           CkMyPe(), (int)gpuGroups.size(), (int)procs.size(), (int)objs.size());  // %d matches the (int) casts
+  for (auto &g : gpuGroups) {
+    CkPrintf("[%d]   GPU %lld: %d PEs, aggregate load=%.6f\n",
+             CkMyPe(), (long long)g.gpu_id, (int)g.pe_indices.size(), g.totalLoad);  // each argument is cast to the type its conversion expects
+  }
+
+  // Greedy with locality preference:
+  // For each object (heaviest first), find the lightest GPU group.
+  // If the object's current GPU group has comparable load, keep it there.
+  // Within the chosen group, prefer the object's current PE if it belongs
+  // to that group; otherwise pick the lightest PE.
+  for (obj = 0; obj < (int)objs.size(); obj++) {
+    const int from_pe = objs[obj].getCurrentPe();
+    const int id = objs[obj].getVertexId();
+    double obj_load = objs[obj].getCompLoad();
+    if (obj_load <= 0.0) obj_load = 1e-6;
+
+    // Find lightest GPU group (linear scan — few groups)
+    int lightest_gi = 0;
+    for (int gi = 1; gi < (int)gpuGroups.size(); gi++) {
+      if (gpuGroups[gi].totalLoad < gpuGroups[lightest_gi].totalLoad)
+        lightest_gi = gi;
+    }
+
+    // Check if object's current group is close enough to the lightest
+    int chosen_gi = lightest_gi;
+    auto curIt = peToGroupIdx.find(from_pe);
+    if (curIt != peToGroupIdx.end()) {
+      int cur_gi = curIt->second;
+      if (gpuGroups[cur_gi].totalLoad <= gpuGroups[lightest_gi].totalLoad + 0.01) {
+        chosen_gi = cur_gi;  // stay on current GPU
+      }
+    }
+    GPUGroup &g = gpuGroups[chosen_gi];
+
+    // Within the chosen group, prefer the current PE if it belongs here
+    int best_idx = -1;
+    if (chosen_gi == (curIt != peToGroupIdx.end() ? curIt->second : -1)) {
+      // Object's current PE is in this group — use it
+      for (int k = 0; k < (int)g.pe_indices.size(); k++) {
+        if (procs[g.pe_indices[k]].getProcId() == from_pe) {
+          best_idx = g.pe_indices[k];
+          break;
+        }
+      }
+    }
+    if (best_idx < 0) {
+      // Pick the lightest PE in the group
+      best_idx = g.pe_indices[0];
+      double best_load = procs[best_idx].getTotalLoad();
+      for (int k = 1; k < (int)g.pe_indices.size(); k++) {
+        double pl = procs[g.pe_indices[k]].getTotalLoad();
+        if (pl < best_load) {
+          best_load = pl;
+          best_idx = g.pe_indices[k];
+        }
+      }
+    }
+
+    ProcInfo &p = procs[best_idx];
+    double scaled_load = obj_load / p.getPeSpeed();
+    p.setTotalLoad(p.getTotalLoad() + scaled_load);
+    g.totalLoad += scaled_load;
+
+    // Record migration only if PE actually changed
+    const int dest = p.getProcId();
+    if (dest != from_pe) {
+      stats->to_proc[id] = dest;
+      nmoves++;
+      if (_lb_args.debug() > 2)
+        CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(), id, from_pe, dest);
+    }
+  }
+
+    for (int gi = 0; gi < (int)gpuGroups.size(); gi++) {
+      CkPrintf("gpu group %d load: %f\n", gi, gpuGroups[gi].totalLoad);
+      // if ( < gpuGroups[lightest_gi].totalLoad)
+      //   lightest_gi = gi;
+    }
+
+#else
+  // ---- Original PE-level greedy (non-GPU path) ----
+
+  // min heap of processors (lightest first)
+  make_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater());
+
+  // greedy algorithm: assign heaviest object to lightest processor
+  // Use getCompLoad() to avoid the 0.1 floor in getVertexLoad() which
+  // destroys load differentiation for fine-grained GPU workloads
+  for (obj=0; obj < objs.size(); obj++) {
+    ProcInfo p = procs.front();
+    pop_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater());
+    procs.pop_back();
+
+    double obj_load = objs[obj].getCompLoad();
+    if (obj_load <= 0.0) obj_load = 1e-6;
+    p.setTotalLoad(p.getTotalLoad() + obj_load / p.getPeSpeed());
+
+    //Insert object into migration queue if necessary
+    const int dest = p.getProcId();
+    const int from_pe = objs[obj].getCurrentPe();
+    const int id   = objs[obj].getVertexId();
+    if (dest != from_pe) {
+      stats->to_proc[id] = dest;
+      nmoves ++;
+      if (_lb_args.debug()>2)
+        CkPrintf("[%d] Obj %d migrating from %d to %d\n", CkMyPe(),id,from_pe,dest);
+    }
+
+    //Insert the least loaded processor with load updated back into the heap
+    procs.push_back(p);
+    push_heap(procs.begin(), procs.end(), GreedyCentralLB::ProcLoadGreater());
+  }
+#endif
+
+  CkPrintf("[%d] GreedyCentralLB: %d objects migrating.\n", CkMyPe(), nmoves);
+
+  if (_lb_args.debug()>1)  {
+    CkPrintf("CharmLB> Min obj: %f  Max obj: %f\n", objs[objs.size()-1].getCompLoad(), objs[0].getCompLoad());
+    CkPrintf("CharmLB> PE speed:\n");
+    for (pe = 0; pe<procs.size(); pe++)
+      CkPrintf("%f ", procs[pe].getPeSpeed());
+    CkPrintf("\n");
+    CkPrintf("CharmLB> PE Load:\n");
+    for (pe = 0; pe<procs.size(); pe++)
+      CkPrintf("%f (%f)  ", procs[pe].getTotalLoad(), procs[pe].getOverhead());
+    CkPrintf("\n");
+  }
+
+  if (_lb_args.metaLbOn()) {
+    double max_load = 0;
+    double avg_load = 0;
+    for (pe = 0; pe<procs.size(); pe++) {
+      if (procs[pe].getTotalLoad() > max_load) {
+        max_load = procs[pe].getTotalLoad();
+      }
+      avg_load += procs[pe].getTotalLoad();
+    }
+
+    stats->after_lb_max = max_load;
+    stats->after_lb_avg = avg_load/procs.size();
+    stats->is_prev_lb_refine = 0;
+    if (_lb_args.debug() > 0)
+      CkPrintf("GreedyCentralLB> After lb max load: %lf avg load: %lf\n", max_load, avg_load/procs.size());
+  }
+}
+
+#include "GreedyCentralLB.def.h"
diff --git a/src/ck-ldb/GreedyCentralLB.ci b/src/ck-ldb/GreedyCentralLB.ci
new file mode 100644
index 0000000000..2883616b93
--- /dev/null
+++ b/src/ck-ldb/GreedyCentralLB.ci
@@ -0,0 +1,9 @@
+module GreedyCentralLB {
+    // Charm++ interface for the GreedyCentralLB load-balancing strategy.
+    extern module CentralLB;
+    initnode void lbinit(void);
+    // Migratable group derived from CentralLB; the only entry method declared is its constructor.
+    group [migratable] GreedyCentralLB : CentralLB {
+        entry void GreedyCentralLB(const CkLBOptions &);  
+    };
+};
\ No newline at end of file
diff --git a/src/ck-ldb/GreedyCentralLB.h b/src/ck-ldb/GreedyCentralLB.h
new file mode 100644
index 0000000000..cb0c6aef4f
--- /dev/null
+++ b/src/ck-ldb/GreedyCentralLB.h
@@ -0,0 +1,45 @@
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+#ifndef _GreedyCentralLB_H_
+#define _GreedyCentralLB_H_
+
+#define __DEBUG_GREEDY_REFINE_ 1
+
+#include "CentralLB.h"
+#include "GreedyCentralLB.decl.h"
+
+void CreateGreedyCentralLB();
+BaseLB * AllocateGreedyCentralLB();
+
+// Centralized greedy load balancer; work() receives the collected LB statistics.
+class GreedyCentralLB : public CBase_GreedyCentralLB {
+public:
+  // (load, pe, id) record taken by the heap helper declarations below.
+  struct HeapData {
+    double load;
+    int pe;
+    int id;
+  };
+  GreedyCentralLB(const CkLBOptions &);
+  GreedyCentralLB(CkMigrateMessage *m) : CBase_GreedyCentralLB(m) { lbname = "GreedyCentralLB"; }
+  void work(LDStats *stats);
+
+private:
+  class ProcLoadGreater;  // comparator types, only forward-declared here
+  class ObjLoadGreater;
+  enum HeapCmp { GT = '>', LT = '<' };
+  void Heapify(HeapData *, int, int, HeapCmp);
+  void HeapSort(HeapData *, int, HeapCmp);
+  void BuildHeap(HeapData *, int, HeapCmp);
+  bool Compare(double, double, HeapCmp);
+  HeapData *BuildCpuArray(BaseLB::LDStats *, int, int *);
+  HeapData *BuildObjectArray(BaseLB::LDStats *, int, int *);
+  bool QueryBalanceNow(int step);
+};
+
+#endif /* _GreedyCentralLB_H_ */
+
+/*@}*/
\ No newline at end of file
diff --git a/src/ck-ldb/GreedyRefineCentralLB.C b/src/ck-ldb/GreedyRefineCentralLB.C
new file mode 100644
index 0000000000..8aac6cdf21
--- /dev/null
+++ b/src/ck-ldb/GreedyRefineCentralLB.C
@@ -0,0 +1,803 @@
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+/**
+ * Author: jjgalvez@illinois.edu (Juan Galvez)
+ * Greedy algorithm to minimize cpu max_load and object migrations.
+ * Can find solution equal or close to regular Greedy with less (sometimes much less) migrations.
+ * The amount of migrations that the user can tolerate is passed via the command-line
+ * option +LBPercentMoves (as percentage of chares that can be moved).
+ *
+ * If LBPercentMoves is not passed, strategy assumes it can move all objects.
+ * In this case, the algorithm will give preference to minimizing cpu max_load.
+ * It will still move fewer objects than greedy, but the number of migrations
+ * will depend very much on the particular case (object load distribution and processor background loads).
+ *
+ * supports processor avail bitvector
+ * supports nonmigratable attrib
+ *
+*/
+
+#include "charm++.h"
+#include "ckgraph.h"
+#include "GreedyRefineCentralLB.h"
+
+#include <float.h>
+#include <limits.h>
+#include <algorithm>
+#include <math.h>
+#if CMK_CUDA || CMK_HIP
+CkpvExtern(int, _lb_obj_index);
+#include <unordered_map>
+#endif
+
+extern int quietModeRequested;
+
+// a solution is feasible if num migrations <= user-specified limit
+// LOAD_MIG_BAL is used to control tradeoff between maxload and migrations
+// when selecting solutions from the feasible set
+#define LOAD_MIG_BAL 1.003
+
+using namespace std;
+
+// Objective values of one candidate solution, as exchanged between PEs.
+class GreedyRefineCentralLB::Solution {
+public:
+  int pe;          // pe who produced this solution
+  float max_load;  // maximum load reported for this solution
+  int migrations;  // number of migrations reported for this solution
+
+  Solution() {}  // leaves all three members uninitialized
+  Solution(int srcPe, double maxLoad, int nmoves)
+      : pe(srcPe), max_load(maxLoad), migrations(nmoves) {}
+
+  // Packs/unpacks the three fields in declaration order.
+  void pup(PUP::er &p) { p|pe; p|max_load; p|migrations; }
+};
+
+// Min-heap of GProc pointers keyed on load. Q[0] is a NULL placeholder so the root sits at index 1; each GProc records its slot in `pos`, which allows removal from any position.
+class GreedyRefineCentralLB::PHeap {
+public:
+  PHeap(int numpes) {
+    Q.reserve(numpes+1);
+    Q.push_back(NULL);  // first element of the array is NULL
+  }
+  // Sets load = bgload on every available processor and, when `insert` is true, appends it to Q; re-heapifies unless bgLoadZero.
+  void addProcessors(std::vector<GreedyRefineCentralLB::GProc> &procs, bool bgLoadZero, bool insert=true) {
+    for (int i=0; i < procs.size(); i++) {
+      GreedyRefineCentralLB::GProc &p = procs[i];
+      if (p.available) {
+        p.load = p.bgload;
+        if (insert) {
+          Q.push_back(&p);
+          p.pos = Q.size()-1;
+        }
+      }
+    }
+    if (!bgLoadZero) buildMinHeap();
+  }
+  // Returns the root (Q[1]) without removing it; asserts that the heap is not empty.
+  inline GreedyRefineCentralLB::GProc *top() const {
+    CkAssert(Q.size() > 1);
+    return Q[1];
+  }
+  // Appends p, records its slot in p->pos and sifts it up.
+  inline void push(GreedyRefineCentralLB::GProc *p) {
+    Q.push_back(p);
+    p->pos = Q.size()-1;
+    siftUp(p->pos);
+  }
+  // Removes and returns the root, or NULL when only the placeholder is left.
+  inline GreedyRefineCentralLB::GProc *pop() {
+    if (Q.size() == 1) return NULL;
+    GreedyRefineCentralLB::GProc *retval;
+    if (Q.size() == 2) {
+      retval = Q[1];
+      Q.pop_back();
+      return retval;
+    }
+    retval = Q[1];
+    Q[1] = Q.back();
+    Q.pop_back();
+    Q[1]->pos = 1;
+    siftDown(1);
+    return retval;
+  }
+  // Remove processor from any position in the heap (taken from p->pos): the last element is
+  // moved into the freed slot and then sifted up or down to restore heap order.
+  void remove(GreedyRefineCentralLB::GProc *p) {
+    int pos = p->pos;
+    if ((Q.size() == 2) || (pos == Q.size()-1)) return Q.pop_back();
+    if (pos == 1) { pop(); return; }
+    Q[pos] = Q.back();
+    Q.pop_back();
+    Q[pos]->pos = pos;
+    if (Q[pos/2]->load > Q[pos]->load) siftUp(pos);
+    else siftDown(pos);
+  }
+  // Drops every entry and restores the NULL placeholder at index 0.
+  inline void clear() {
+    Q.clear();
+    Q.push_back(NULL);
+  }
+
+private:
+  // Recursively sinks element i until neither child has a smaller load, updating pos on each swap.
+  void min_heapify(int i) {
+    const int left = 2*i;
+    const int right = 2*i + 1;
+    int smallest = i;
+    if ((left < Q.size()) && (Q[left]->load < Q[smallest]->load)) smallest = left;
+    if ((right < Q.size()) && (Q[right]->load < Q[smallest]->load)) smallest = right;
+    if (smallest != i) {
+      swap(i,smallest);
+      Q[i]->pos = i;
+      Q[smallest]->pos = smallest;
+      min_heapify(smallest);
+    }
+  }
+  // Heapifies Q in place, from index Q.size()/2 down to the root.
+  void inline buildMinHeap() {
+    for (int i=Q.size()/2; i > 0; i--) min_heapify(i);
+  }
+  // Exchanges the pointers stored in two slots; the callers update the pos fields afterwards.
+  inline void swap(int pos1, int pos2) {
+    GreedyRefineCentralLB::GProc *t = Q[pos1];
+    Q[pos1] = Q[pos2];
+    Q[pos2] = t;
+  }
+  // Moves the element at pos toward the root while its parent has a larger load.
+  void siftUp(int pos) {
+    if (pos == 1) return;   // reached root
+    int ppos = pos/2;
+    if (Q[ppos]->load > Q[pos]->load) {
+      swap(ppos,pos);
+      Q[ppos]->pos = ppos;
+      Q[pos]->pos = pos;
+      siftUp(ppos);
+    }
+  }
+  // Index of the child of pos with the smaller load (the second child on ties), or -1 if pos has no child.
+  inline int minChild(int pos) const {
+    int c1 = pos*2;
+    int c2 = pos*2 + 1;
+    if (c1 >= Q.size()) return -1;
+    if (c2 >= Q.size()) return c1;
+    if (Q[c1]->load < Q[c2]->load) return c1;
+    else return c2;
+  }
+  // Moves the element at pos toward the leaves while its smaller child has a smaller load.
+  void siftDown(int pos) {
+    int cpos = minChild(pos);
+    if (cpos == -1) return;
+    if (Q[pos]->load > Q[cpos]->load) {
+      swap(pos,cpos);
+      Q[cpos]->pos = cpos;
+      Q[pos]->pos = pos;
+      siftDown(cpos);
+    }
+  }
+  // Heap storage, 1-based: Q[0] always holds the NULL placeholder.
+  std::vector<GreedyRefineCentralLB::GProc*> Q;
+};
+
+CreateLBFunc_Def(GreedyRefineCentralLB, "Greedy refinement-based algorithm")  // NOTE(review): presumably defines the Create/Allocate functions declared in the header — confirm against the macro
+
+// Regular constructor: names the strategy, announces it once (PE 0, unless quiet mode was
+// requested) and derives the migration tolerance from the allowed percentage of moves when below 100.
+GreedyRefineCentralLB::GreedyRefineCentralLB(const CkLBOptions &opt)
+    : CBase_GreedyRefineCentralLB(opt), migrationTolerance(1.0) {
+  lbname = "GreedyRefineCentralLB";
+  concurrent = true;
+  const bool announce = !quietModeRequested && (CkMyPe() == 0);
+  if (announce) CkPrintf("CharmLB> GreedyRefineCentralLB created.\n");
+  if (_lb_args.percentMovesAllowed() < 100) migrationTolerance = float(_lb_args.percentMovesAllowed())/100.0;
+}
+
+GreedyRefineCentralLB::GreedyRefineCentralLB(CkMigrateMessage *m)
+    : CBase_GreedyRefineCentralLB(m), migrationTolerance(1.0) {  // migration constructor
+  concurrent = true;
+  lbname = "GreedyRefineCentralLB";
+  if (_lb_args.percentMovesAllowed() < 100) { migrationTolerance = float(_lb_args.percentMovesAllowed())/100.0; }
+}
+
+// ------------------------------------------------
+
+// Regular greedy pass: hands each object, in the given order, to the least-loaded processor.
+// Returns the largest processor load reached; `stats` is not used.
+double GreedyRefineCentralLB::greedyLB(const std::vector<GreedyRefineCentralLB::GObj*> &pobjs,
+              GreedyRefineCentralLB::PHeap &procHeap,
+              const BaseLB::LDStats *stats) const
+{
+  int moved = 0;
+  double peak = 0;
+  for (const GreedyRefineCentralLB::GObj *o : pobjs) {
+    GreedyRefineCentralLB::GProc *target = procHeap.pop();  // least loaded processor
+    target->load += (o->load / target->speed);
+    procHeap.push(target);  // back into the heap with its updated load
+
+    if (target->id != o->oldPE) ++moved;
+    if (target->load > peak) peak = target->load;
+  }
+
+  const bool report = (CkMyPe() == cur_ld_balancer+1) && (_lb_args.debug() > 1);
+  if (report) {
+    CkPrintf("[%d] %f : Greedy strategy nmoves=%d, max_load=%f\n", CkMyPe(),
+             CkWallTimer() - strategyStartTime, moved, peak);
+  }
+  return peak;
+}
+
+// -----------------------------------------------
+#if __DEBUG_GREEDY_REFINE_
+#include <fstream>
+// Debug helper: writes the object count, then every object load, to "objloads.txt".
+void GreedyRefineCentralLB::dumpObjLoads(std::vector<GreedyRefineCentralLB::GObj> &objs) {
+  std::ofstream out("objloads.txt");
+  out << objs.size() << std::endl;
+  for (size_t idx = 0; idx < objs.size(); ++idx) {
+    out << objs[idx].load;
+    if ((idx > 0) && (idx % 100 == 0)) out << std::endl;  // line break after every 100th index
+    else out << " ";
+  }
+}  // the stream is closed when `out` goes out of scope
+// Debug helper: writes the processor count, then every processor load, to "proc_bg_loads.txt".
+void GreedyRefineCentralLB::dumpProcLoads(std::vector<GreedyRefineCentralLB::GProc> &procs) {
+  std::ofstream out("proc_bg_loads.txt");
+  out << procs.size() << std::endl;
+  for (size_t idx = 0; idx < procs.size(); ++idx) {
+    out << procs[idx].load;
+    if ((idx > 0) && (idx % 100 == 0)) out << std::endl;  // line break after every 100th index
+    else out << " ";
+  }
+}  // the stream is closed when `out` goes out of scope
+#endif
+
+// Populates the per-processor (procs) and per-object (objs) working arrays from the LB
+// statistics, appends a pointer to every migratable object to pobjs and inserts every
+// available processor into procHeap.  Also resets availablePes and totalObjLoad.
+//  - Non-migratable objects are folded into the background load of the PE they are on.
+//  - CUDA/HIP builds use gpuTime as object load and start each PE's background load at
+//    zero; other builds use wallTime and bg_walltime.
+// Aborts when no processor is available or a non-migratable object sits on an
+// unavailable one.  Returns maxBGLoad, the largest background load recorded.
+double GreedyRefineCentralLB::fillData(LDStats *stats,
+                            std::vector<GreedyRefineCentralLB::GObj> &objs,
+                            std::vector<GreedyRefineCentralLB::GObj*> &pobjs,
+                            std::vector<GreedyRefineCentralLB::GProc> &procs,
+                            PHeap &procHeap)
+{
+  const int n_pes = stats->nprocs();
+  const int n_objs = stats->n_migrateobjs;
+  // most of these variables are just for printing stats when _lb_args.debug()
+  int unmigratableObjs = 0;
+  availablePes = 0; totalObjLoad = 0;
+  double minBGLoad = DBL_MAX; double avgBGLoad = 0; double maxBGLoad = 0;
+  double minSpeed  = DBL_MAX; double maxSpeed  = 0; double avgSpeed  = 0;
+  double minOload  = DBL_MAX; double maxOload  = 0;
+
+  for (int pe=0; pe < n_pes; pe++) {
+    GreedyRefineCentralLB::GProc &p = procs[pe];
+    p.id = pe;
+    p.available = stats->procs[pe].available;
+    p.speed = stats->procs[pe].pe_speed;
+    if (p.available) {
+      availablePes++;
+#if CMK_CUDA || CMK_HIP
+      // GPU builds: background load starts at zero; the measured value is kept separately.
+      p.bgload = 0.0;
+      p.bg_walltime = stats->procs[pe].bg_walltime;
+#else
+      p.bgload = stats->procs[pe].bg_walltime;
+      if (p.bgload > maxBGLoad) maxBGLoad = p.bgload;
+#endif
+      if (_lb_args.debug() > 1) {
+        double &speed = stats->procs[pe].pe_speed;
+        if (speed < minSpeed) minSpeed = speed;
+        if (speed > maxSpeed) maxSpeed = speed;
+        avgSpeed += speed;
+      }
+    }
+  }
+  if (!availablePes) CkAbort("GreedyRefineCentralLB: No available processors\n");
+
+  for (int i=0; i < n_objs; i++) {
+    LDObjData &oData = stats->objData[i];
+    GreedyRefineCentralLB::GObj &obj = objs[i];
+    int pe = stats->from_proc[i];
+    obj.id = i;
+    obj.oldPE = pe;
+#if CMK_CUDA || CMK_HIP
+    // GPU-only bookkeeping: _lb_obj_index is declared in this file only for CUDA/HIP
+    // builds, so reading it unconditionally breaks every other build.
+    obj.gpuPupSize = oData.gpuPupSize;
+    obj.gpuAllocSize = *(size_t *)oData.getUserData(CkpvAccess(_lb_obj_index));
+#endif
+    CkAssert(pe >= 0 && pe <= n_pes);
+    if (pe == n_pes) obj.oldPE = -1; // this can happen in HybridLB if object comes from outside group. mark oldPE as -1 in this situation
+    if (!oData.migratable) {
+      CkAssert(pe < n_pes);
+      unmigratableObjs++;
+      GreedyRefineCentralLB::GProc &p = procs[pe];
+      if (!p.available)
+        CkAbort("GreedyRefineCentralLB: nonmigratable object on unavailable processor\n");
+#if CMK_CUDA || CMK_HIP
+      double nmObjLoad = oData.gpuTime;
+#else
+      double nmObjLoad = oData.wallTime;
+#endif
+      p.bgload += nmObjLoad; // take non-migratable object load as background load
+      if (_lb_args.debug() > 1)  // per-object trace is debug-only: the strategy runs on many PEs at once
+        CkPrintf("[%d] Obj %d on PE %d is non-migratable, load=%.6f\n", CkMyPe(), i, pe, nmObjLoad);
+      if (p.bgload > maxBGLoad) maxBGLoad = p.bgload;
+    } else {
+      // NOTE(review): pe may equal n_pes here (see the assert above) — confirm stats->procs has that entry.
+#if CMK_CUDA || CMK_HIP
+      obj.load = oData.gpuTime * stats->procs[pe].pe_speed;
+#else
+      obj.load = oData.wallTime * stats->procs[pe].pe_speed;
+#endif
+      pobjs.push_back(&obj);
+      totalObjLoad += obj.load;
+      if (_lb_args.debug() > 1) {
+        if (obj.load < minOload) minOload = obj.load;
+        if (obj.load > maxOload) maxOload = obj.load;
+      }
+    }
+  }
+
+  // Insert the available processors; heap construction is skipped when all bg loads are ~0.
+  procHeap.addProcessors(procs, (maxBGLoad <= 0.001), true);
+
+  // ---- print some stats ----
+  if ((_lb_args.debug() > 1) && (!concurrent || (CkMyPe() == cur_ld_balancer))) {
+    for (int pe=0; pe < n_pes; pe++) {
+      GreedyRefineCentralLB::GProc &p = procs[pe];
+      if (!p.available) continue;
+      if (p.bgload < minBGLoad) minBGLoad = p.bgload;
+      avgBGLoad += p.bgload;
+    }
+    // Guard the mean against a step with no migratable objects.
+    const int migratableObjs = n_objs - unmigratableObjs;
+    const double meanOload = (migratableObjs > 0) ? (totalObjLoad / migratableObjs) : 0.0;
+    CkPrintf("[%d] GreedyRefineCentralLB: num pes=%d, num objs=%d\n", CkMyPe(), n_pes, n_objs);
+    CkPrintf("[%d] Unavailable processors=%d, Unmigratable objs=%d\n", CkMyPe(), n_pes - availablePes, unmigratableObjs);
+    CkPrintf("[%d] min_bgload=%f mean_bgload=%f max_bgload=%f\n", CkMyPe(), minBGLoad, (avgBGLoad / availablePes), maxBGLoad);
+    CkPrintf("[%d] min_oload=%f mean_oload=%f max_oload=%f\n", CkMyPe(), minOload, meanOload, maxOload);
+    CkPrintf("[%d] min_speed=%f mean_speed=%f max_speed=%f\n", CkMyPe(), minSpeed, (avgSpeed / availablePes), maxSpeed);
+
+    // Replay the current mapping to report the max/min PE load before balancing.
+    double maxLoad = 0;
+    double minLoad = FLT_MAX;
+    std::vector<double> ploads(n_pes, -1);
+    for (int i=0; i < n_objs; i++) {
+      GreedyRefineCentralLB::GObj &o = objs[i];
+      int pe = o.oldPE;
+      if (pe < 0) continue;
+      if (ploads[pe] < 0) ploads[pe] = procs[pe].bgload;
+      if (stats->objData[i].migratable)  // load for this object is already counted if !migratable
+        ploads[pe] += o.load;
+      if (ploads[pe] > maxLoad) maxLoad = ploads[pe];
+      if (ploads[pe] < minLoad) minLoad = ploads[pe];
+    }
+    CkPrintf("[%d] maxload with current map=%f\n", CkMyPe(), maxLoad);
+    CkPrintf("[%d] minload with current map=%f\n", CkMyPe(), minLoad);
+  }
+
+  return maxBGLoad;
+}
+
+// Parameter grid searched concurrently: PE rank r > 0 uses A = Avals[(r-1) / Bvals_len] and
+// B = Bvals[(r-1) % Bvals_len]; the +1 in NUM_SOLUTIONS accounts for rank 0 (regular greedy).
+static const float Avals[] = {1.0, 1.005, 1.01, 1.015, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.16, 1.20, 1.30};
+static const float Bvals[] = {FLT_MAX, 1.0, 1.05, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3};
+#define Avals_len 14
+#define Bvals_len 16
+#define NUM_SOLUTIONS (Avals_len*Bvals_len+1)  // parenthesized so the macro is safe inside larger expressions
+static_assert(sizeof(Avals)/sizeof(Avals[0]) == Avals_len && sizeof(Bvals)/sizeof(Bvals[0]) == Bvals_len, "Avals_len/Bvals_len must match the arrays");
+// Maps a PE rank to its (A, B) pair; ranks beyond the grid get A = B = -1.
+static void getGreedyRefineParams(int rank, float &A, float &B) {
+  if (rank == 0) { A = 0; B = -1; return; } // causes PE0 to run regular greedy
+  rank--;
+  const int x = rank / Bvals_len;  // row in the grid, i.e. index into Avals
+  if (x >= Avals_len) { A = B = -1; return; }
+  A = Avals[x]; B = Bvals[rank % Bvals_len];
+}
+
+// Contributes this PE's result (max load and number of migrations) to a set reduction whose
+// callback is receiveSolutions() on the central PE (cur_ld_balancer).
+// Only the objective values of the solution are sent, not the whole solution.
+void GreedyRefineCentralLB::sendSolution(double maxLoad, int migrations)
+{
+  GreedyRefineCentralLB::Solution sol(CkMyPe(), maxLoad, migrations);
+
+  // Pack into a stack buffer instead of an unchecked malloc.  The buffer only has to live
+  // until contribute() returns: the previous code already freed its heap buffer right
+  // after that call.
+  char buffer[sizeof(GreedyRefineCentralLB::Solution)];
+  PUP::toMem pd(buffer);
+  pd|sol;
+
+  CkCallback cb(CkIndex_GreedyRefineCentralLB::receiveSolutions((CkReductionMsg*)NULL), thisProxy[cur_ld_balancer]);
+  contribute(sizeof(buffer), buffer, CkReduction::set, cb);
+
+  if ((_lb_args.debug() > 1) && (CkMyPe() == cur_ld_balancer))
+    CkPrintf("[%d] %f : Called gather/reduction\n", CkMyPe(), CkWallTimer() - strategyStartTime);
+}
+
+// Strategy entry point: computes a new object-to-PE assignment into stats->to_proc.
+//
+// In concurrent mode every PE derives its own (A, B) pair from its rank (see
+// getGreedyRefineParams), runs the algorithm with it and reports the resulting maximum
+// load and migration count through sendSolution(); PEs whose rank is outside the
+// parameter grid only send an empty (-1, -1) response.
+//  - A scales the regular-greedy maximum load into the ceiling M: an object is only kept
+//    where it is if that location's load stays within M.
+//  - B bounds how much heavier than the lightest candidate the current location may be.
+//    (With A = 0 and B = -1 neither test can pass, so objects go to the lightest candidate.)
+//
+// CUDA/HIP builds balance per GPU group (the PEs that share a gpu_device_id) and honour
+// GPU memory limits when moving an object across GPUs; other builds balance per PE.
+void GreedyRefineCentralLB::work(LDStats *stats)
+{
+  strategyStartTime = CkWallTimer();
+  float A = 1.001, B = FLT_MAX; // Use A=0, B=-1 to imitate regular Greedy (ignore migrations)
+  if (concurrent) {
+    getGreedyRefineParams(CkMyPe(), A, B);
+    if (A < 0) {
+      sendSolution(-1,-1);  // send empty response to PE0
+      return;
+    }
+  }
+
+  const int n_pes = stats->nprocs();
+  totalObjs = stats->n_migrateobjs;
+
+  std::vector<GreedyRefineCentralLB::GObj> objs(totalObjs);
+  // will sort pobjs instead of objs (faster swapping). will only contain pointers
+  // to migratable objects
+  std::vector<GreedyRefineCentralLB::GObj*> pobjs;
+  pobjs.reserve(totalObjs);
+
+  std::vector<GreedyRefineCentralLB::GProc> procs(n_pes);
+  PHeap procHeap(n_pes);
+
+  // fill data structures used by algorithm
+  double maxLoad = fillData(stats, objs, pobjs, procs, procHeap);
+
+  // ------------ apply greedy refine algorithm --------------
+
+  std::sort(pobjs.begin(), pobjs.end(), GreedyRefineCentralLB::ObjLoadGreater());
+
+  int nmoves = 0;
+  double greedyMaxLoad = 0;
+
+#if CMK_CUDA || CMK_HIP
+  // ---- GPU-aware path: balance at GPU-group level ----
+  //
+  // Group PEs by gpu_device_id.  M tracks the max *GPU-group* aggregate load.
+  // A greedy pre-pass computes M at GPU-group level.
+  // Main loop: choose the lightest GPU group (or keep the object's current group when it
+  // is close enough and memory allows), then place the object on a PE of that group.
+
+  // One entry per distinct gpu_device_id among the available PEs.
+  struct GPUGrp {
+    uint64_t gpu_id;
+    double load;                        // aggregate load across PEs in this group
+    std::vector<int> peIds;             // indices into procs[]
+    size_t gpu_mem_remaining;           // taken from the first PE seen for this device
+    size_t pool_buff_mem_remaining;     // taken from the first PE seen for this device
+  };
+
+  // --- Build GPU groups from the per-PE procs vector ---
+  std::vector<GPUGrp> gpuGroups;
+  std::unordered_map<uint64_t, int> gpuIdToIdx;
+
+  for (int pe = 0; pe < n_pes; pe++) {
+    GreedyRefineCentralLB::GProc &p = procs[pe];
+    if (!p.available) continue;
+    uint64_t devId = stats->procs[pe].gpu_device_id;
+    size_t gpu_mem_remaining = stats->procs[pe].gpu_mem_remaining;
+    size_t pool_buff_mem_remaining = stats->procs[pe].pool_buff_mem_remaining;
+
+    auto it = gpuIdToIdx.find(devId);
+    if (it == gpuIdToIdx.end()) {
+      gpuIdToIdx[devId] = gpuGroups.size();
+      GPUGrp g;
+      g.gpu_id = devId;
+      g.load = p.bgload;
+      g.peIds.push_back(pe);
+      g.gpu_mem_remaining = gpu_mem_remaining;
+      g.pool_buff_mem_remaining = pool_buff_mem_remaining;
+      gpuGroups.push_back(std::move(g));
+    } else {
+      gpuGroups[it->second].load += p.bgload;
+      gpuGroups[it->second].peIds.push_back(pe);
+    }
+  }
+  int nGroups = gpuGroups.size();
+
+  // --- Greedy preprocessing at GPU-group level to establish target M ---
+  // Runs on a copy of the group loads: assign every object to the currently lightest
+  // group and remember the largest group load reached.
+  double M = 0;
+  {
+    std::vector<double> grpLoad(nGroups);
+    for (int gi = 0; gi < nGroups; gi++) grpLoad[gi] = gpuGroups[gi].load;
+
+    for (int i = 0; i < (int)pobjs.size(); i++) {
+      // Find lightest GPU group
+      int lightest = 0;
+      for (int gi = 1; gi < nGroups; gi++) {
+        if (grpLoad[gi] < grpLoad[lightest]) lightest = gi;
+      }
+      grpLoad[lightest] += pobjs[i]->load;
+      if (grpLoad[lightest] > M) M = grpLoad[lightest];
+    }
+    greedyMaxLoad = M;
+  }
+  M *= A;
+
+  // Reset GPU group loads back to bg-only for the real assignment pass
+  for (int gi = 0; gi < nGroups; gi++) {
+    gpuGroups[gi].load = 0;
+    for (int pe : gpuGroups[gi].peIds)
+      gpuGroups[gi].load += procs[pe].bgload;
+  }
+  // Also reset per-PE loads in procHeap to bgload
+  procHeap.addProcessors(procs, (maxLoad <= 0.001), false);
+
+  // Reverse map: PE index -> GPU group index (only available PEs are present)
+  std::unordered_map<int, int> peToGrpIdx;
+  for (int gi = 0; gi < nGroups; gi++)
+    for (int pe : gpuGroups[gi].peIds)
+      peToGrpIdx[pe] = gi;
+
+  // --- Main assignment pass, visiting objects in the order produced by the sort above ---
+  for (int i = 0; i < (int)pobjs.size(); i++) {
+    const GreedyRefineCentralLB::GObj *obj = pobjs[i];
+    double obj_load = obj->load;
+
+    int lightest_gi = 0;
+    for (int gi = 1; gi < nGroups; gi++) {
+      if (gpuGroups[gi].load < gpuGroups[lightest_gi].load)
+        lightest_gi = gi;
+    }
+
+    // Group the object currently lives in; stays -1 when the object has no old PE or
+    // that PE is not available.
+    int src_gi = -1;
+    if (obj->oldPE >= 0) {
+      auto srcIt = peToGrpIdx.find(obj->oldPE);
+      if (srcIt != peToGrpIdx.end()) src_gi = srcIt->second;
+    }
+
+    // Refinement: if object's current GPU group is close enough, keep it there
+    int chosen_gi = lightest_gi;
+    if (src_gi >= 0) {
+      GPUGrp &curGrp = gpuGroups[src_gi];
+      if ((curGrp.load <= (gpuGroups[lightest_gi].load + 0.01) * B) && (curGrp.load + obj_load <= M))
+        chosen_gi = src_gi;
+    }
+
+    // Memory constraints for a cross-GPU move: both pool buffers must be able to hold the
+    // packed object, and the allocation must fit into 95% of the destination's remaining
+    // GPU memory.  Otherwise the object stays on its current GPU.
+    if (chosen_gi != src_gi && src_gi >= 0 && obj->gpuPupSize > 0) {
+      const bool poolTooSmall =
+          gpuGroups[src_gi].pool_buff_mem_remaining < obj->gpuPupSize ||
+          gpuGroups[chosen_gi].pool_buff_mem_remaining < obj->gpuPupSize;
+      const bool memTooSmall =
+          (size_t)(0.95 * gpuGroups[chosen_gi].gpu_mem_remaining) < obj->gpuAllocSize;
+      if (poolTooSmall || memTooSmall) chosen_gi = src_gi;
+    }
+
+    GPUGrp &g = gpuGroups[chosen_gi];
+
+    // Default: the least-loaded PE of the chosen group.
+    int bestPe = g.peIds[0];
+    for (int pe : g.peIds) {
+      if (procs[pe].load < procs[bestPe].load) {
+        bestPe = pe;
+      }
+    }
+
+    // Keep the object on its current PE when that PE belongs to the chosen group.
+    // src_gi is compared instead of peToGrpIdx[obj->oldPE]: operator[] inserts a 0 entry
+    // for a PE that is missing from the map (an unavailable one), which would make the
+    // object stay on that unavailable PE whenever group 0 was chosen.
+    if (src_gi == chosen_gi)
+      bestPe = obj->oldPE;
+
+    GreedyRefineCentralLB::GProc *p = &procs[bestPe];
+    double scaled = obj->load / p->speed;
+
+    // Update PE load
+    procHeap.remove(p);
+    p->load += scaled;
+    procHeap.push(p);
+
+    // Update GPU group aggregate
+    g.load += scaled;
+
+    // Charge the memory used by a cross-GPU move (same condition as the check above).
+    if (chosen_gi != src_gi && src_gi >= 0 && obj->gpuPupSize > 0)
+    {
+      gpuGroups[src_gi].pool_buff_mem_remaining -= obj->gpuPupSize;
+      gpuGroups[chosen_gi].pool_buff_mem_remaining -= obj->gpuPupSize;
+      gpuGroups[chosen_gi].gpu_mem_remaining-= obj->gpuAllocSize;
+    }
+
+    // Track max GPU-group load; expand M if exceeded
+    if (g.load > maxLoad) {
+      maxLoad = g.load;
+      if (maxLoad > M) M = maxLoad;
+    }
+
+    // Record migration if PE changed
+    if (bestPe != obj->oldPE) {
+      nmoves++;
+      stats->to_proc[obj->id] = bestPe;
+    }
+  }
+
+  // Per-GPU-group loads after LB.  Debug-only, because every PE that runs the strategy
+  // reaches this point.
+  if (_lb_args.debug() > 1) {
+    CkPrintf("[%d] --- Per-GPU-group loads after LB ---\n", CkMyPe());
+    for (int gi = 0; gi < nGroups; gi++)
+      CkPrintf("[%d]   GPU %llu: aggregate load=%.6f\n",
+               CkMyPe(), (unsigned long long)gpuGroups[gi].gpu_id, gpuGroups[gi].load);
+  }
+
+#else
+  // ---- Original PE-level greedy refine (non-GPU path) ----
+
+  // Regular greedy first, to obtain the reference max load M; skipped when B <= 0.
+  double M = 0;
+  if (B > 0) {
+    M = greedyLB(pobjs, procHeap, stats);
+    greedyMaxLoad = M;
+    procHeap.addProcessors(procs, (maxLoad <= 0.001), false);
+  }
+
+  M *= A;
+  for (int i=0; i < pobjs.size(); i++) {
+    const GreedyRefineCentralLB::GObj *obj = pobjs[i];
+    GreedyRefineCentralLB::GProc *llp = procHeap.top();
+    GreedyRefineCentralLB::GProc *prevPe = NULL;
+    if (obj->oldPE >= 0) prevPe = &(procs[obj->oldPE]);
+
+    // Default to the least-loaded PE; stay on the previous PE when it is available,
+    // close enough to the lightest one (factor B) and would not exceed M.
+    GreedyRefineCentralLB::GProc *p = llp;
+    if (prevPe && (prevPe->load <= (llp->load+0.01)*B) && (prevPe->load + obj->load <= M) && (prevPe->available))
+      p = prevPe;
+
+    procHeap.remove(p);
+    p->load += (obj->load / p->speed);
+    procHeap.push(p);
+
+    // Record the migration.  Without this the decision never reaches stats->to_proc and
+    // the migration count passed to sendSolution() stays at zero.
+    if (p->id != obj->oldPE) {
+      nmoves++;
+      stats->to_proc[obj->id] = p->id;
+    }
+    if (p->load > maxLoad) {
+      maxLoad = p->load;
+      if (maxLoad > M) M = maxLoad;
+    }
+  }
+#endif
+
+  // Report the result (concurrent mode) or print a summary (non-concurrent debug runs).
+  if (concurrent) {
+
+    sendSolution(maxLoad, nmoves);
+
+#if __DEBUG_GREEDY_REFINE_
+    CkCallback cb(CkReductionTarget(GreedyRefineCentralLB, receiveTotalTime), thisProxy[cur_ld_balancer]);
+    contribute(sizeof(double), &strategyStartTime, CkReduction::sum_double, cb);
+#endif
+  } else if (_lb_args.debug() > 0) {
+    // Compare the result with regular greedy; the ratio guards against an empty object list.
+    double greedyRatio = 1.0;
+    if (greedyMaxLoad > 0) greedyRatio = maxLoad / greedyMaxLoad;
+    const double migrationRatio = pobjs.empty() ? 0.0 : nmoves/double(pobjs.size());
+    if ((greedyRatio > 1.03) && (migrationRatio < migrationTolerance)) {
+      CkPrintf("[%d] GreedyRefine: WARNING - migration ratio is %.3f (within user-specified tolerance).\n"
+               "but maxload after lb is %f higher than greedy. Consider testing with A=0, B=-1\n",
+               CkMyPe(), migrationRatio, greedyRatio);
+    }
+    CkPrintf("[%d] GreedyRefineCentralLB: after lb, max_load=%.3f, migrations=%d(%.2f%%), ratioToGreedy=%.3f\n",
+             CkMyPe(), maxLoad, nmoves, 100.0*migrationRatio, greedyRatio);
+  }
+}
+
+void GreedyRefineCentralLB::receiveTotalTime(double time) {  // receives the summed strategy start times
+  const double avgStart = time / CkNumPes();  // sum divided by the number of PEs
+  CkPrintf("Avg start time of GreedyRefineCentralLB strategy is %f\n", avgStart);
+}
+
+// Reduction callback on the central PE: unpacks every PE's solution, decides which one is
+// best and notifies the PE that produced it.  The message is deleted once it is unpacked.
+void GreedyRefineCentralLB::receiveSolutions(CkReductionMsg *msg)
+{
+  std::vector<GreedyRefineCentralLB::Solution> results(NUM_SOLUTIONS);
+  int migrationsAllowed = totalObjs * migrationTolerance;
+  ckout<<"migrations allowed "<<migrationsAllowed<<" out of "<<totalObjs<<" total objs"<<endl;
+  // feasible solutions are those satisfying user's migration constraint
+  bool feasibleSolutions = false;
+  float lowest_max_load = FLT_MAX;    // lowest max load of all solutions
+  float lowest_max_load_f = FLT_MAX;  // lowest max load of feasible solution set
+  float highest_max_load = 0;         // highest max load of all solutions
+  int lowestMigrations = INT_MAX;     // lowest num migrations of all solutions
+  const GreedyRefineCentralLB::Solution *bestSol = NULL; // best solution
+
+  // first pass. Will record solution with lowest migrations as the best, in case
+  // there is no feasible solution
+  CkReduction::setElement *current = (CkReduction::setElement*)msg->getData();  // Get the first element in the set
+  int numSolutions = 0;
+  for ( ; current && (numSolutions < NUM_SOLUTIONS); current = current->next()) {
+    PUP::fromMem pd(&current->data);
+    pd|results[numSolutions]; // store result
+    if (results[numSolutions].migrations >= 0) {  // valid result
+      const GreedyRefineCentralLB::Solution &r = results[numSolutions++];
+      if ((r.migrations <= migrationsAllowed) && (r.max_load < lowest_max_load_f)) {
+        lowest_max_load_f = r.max_load;
+        feasibleSolutions = true;
+      }
+
+      if ((r.migrations < lowestMigrations) ||
+        ((r.migrations == lowestMigrations) && (r.max_load < bestSol->max_load))) {
+        lowestMigrations = r.migrations;
+        bestSol = &r;
+      }
+
+      if (r.max_load < lowest_max_load) lowest_max_load = r.max_load;
+      if (r.max_load > highest_max_load) highest_max_load = r.max_load;
+    }
+  }
+  delete msg;  // all solutions are copied into results; this entry method owns the message and must free it
+  results.resize(numSolutions); // for cases where CkNumPes() < NUM_SOLUTIONS
+  CkAssert(numSolutions > 0);
+
+  if (feasibleSolutions) {
+    // second pass, get solution with low max load and migrations from feasible set
+    int bestMigrations = INT_MAX;  // num migrations of best solution
+    for (int i=0; i < results.size(); i++) {
+      const GreedyRefineCentralLB::Solution &r = results[i];
+      // Select if we find (fewer migrations and load within tolerance) or
+      // (same as lowest migration and better load).  Since we know a feasible
+      // solution exists and we only minimize here, we guarantee that we'll end
+      // with a feasible solution.
+      if ((r.migrations < bestMigrations && r.max_load <= lowest_max_load_f*LOAD_MIG_BAL) ||
+          (r.migrations == bestMigrations && r.max_load < bestSol->max_load)) {
+        bestMigrations = r.migrations;
+        bestSol = &r;
+      }
+    }
+  }
+  // else: can't satisfy user migration constraint (for this lb step), so just use solution with lowest num migrations
+
+  if (_lb_args.debug() > 1) {
+    CkPrintf("GreedyRefineCentralLB: Lowest max_load is %f, worst max_load is %f, lowest migrations=%d\n",
+             lowest_max_load, highest_max_load, lowestMigrations);
+
+    CkPrintf("GreedyRefineCentralLB: Got %d solutions at %f\nBest one is from PE %d with max_load=%f, migrations=%d\n",
+             numSolutions, CkWallTimer(), bestSol->pe, bestSol->max_load, bestSol->migrations);
+    float A, B;
+    getGreedyRefineParams(bestSol->pe, A, B);
+    CkPrintf("Best PE used params A=%f B=%f\n", A, B);
+  }
+
+  // notify PE that produced the best solution
+  thisProxy[bestSol->pe].ApplyDecision();
+}
+
+#include "GreedyRefineCentralLB.def.h"
+
+/*@}*/
\ No newline at end of file
diff --git a/src/ck-ldb/GreedyRefineCentralLB.ci b/src/ck-ldb/GreedyRefineCentralLB.ci
new file mode 100644
index 0000000000..999de1b0f8
--- /dev/null
+++ b/src/ck-ldb/GreedyRefineCentralLB.ci
@@ -0,0 +1,10 @@
+module GreedyRefineCentralLB {
+
+  extern module CentralLB;
+  initnode void lbinit(void);
+  group [migratable] GreedyRefineCentralLB : CentralLB {
+    entry void GreedyRefineCentralLB(const CkLBOptions &);
+    entry void receiveSolutions(CkReductionMsg *msg);
+    entry [reductiontarget] void receiveTotalTime(double time);
+  };
+};
\ No newline at end of file
diff --git a/src/ck-ldb/GreedyRefineCentralLB.h b/src/ck-ldb/GreedyRefineCentralLB.h
new file mode 100644
index 0000000000..a37b9dea7e
--- /dev/null
+++ b/src/ck-ldb/GreedyRefineCentralLB.h
@@ -0,0 +1,102 @@
+/**
+ * \addtogroup CkLdb
+*/
+/*@{*/
+
+/**
+ * Author: jjgalvez@illinois.edu (Juan Galvez)
+ * Greedy algorithm to minimize cpu max_load and object migrations.
+ * Can find solution equal or close to regular Greedy with fewer (sometimes far fewer) migrations.
+ * The amount of migrations that the user can tolerate is passed via the command-line
+ * option +LBPercentMovesAllowed (as percentage of chares that can be moved).
+ *
+ * If +LBPercentMovesAllowed is not passed, strategy assumes it can move all objects.
+ * In this case, the algorithm will give preference to minimizing cpu max_load.
+ * It will still move fewer objects than greedy, but the amount of migrations
+ * will depend very much on the particular case (object load distribution and processor background loads).
+ *
+ * supports processor avail bitvector
+ * supports nonmigratable attrib
+ *
+*/
+
+#ifndef GREEDY_REFINE_CENTRAL_LB_H
+#define GREEDY_REFINE_CENTRAL_LB_H
+
+#include "CentralLB.h"
+#include "GreedyRefineCentralLB.decl.h"
+
+void CreateGreedyRefineCentralLB();
+BaseLB *AllocateGreedyRefineCentralLB();
+
+#define __DEBUG_GREEDY_REFINE_ 0
+
+class GreedyRefineCentralLB : public CBase_GreedyRefineCentralLB {
+public:
+  GreedyRefineCentralLB(const CkLBOptions &);
+  GreedyRefineCentralLB(CkMigrateMessage *m);
+  void work(LDStats* stats);
+  void receiveSolutions(CkReductionMsg *msg);  // declared as an entry method in GreedyRefineCentralLB.ci
+  void receiveTotalTime(double time);  // declared as a [reductiontarget] entry in GreedyRefineCentralLB.ci
+  void setMigrationTolerance(float tol) { migrationTolerance = tol; }  // plain setter, no validation
+
+private:
+  bool QueryBalanceNow(int step) { return true; }  // always true; step is ignored
+
+  class GProc {  // per-processor record
+  public:
+    GProc() : available(true), load(0) {}  // id, pos, bgload and speed are NOT initialized here
+    int id;
+    bool available;
+    int pos;    // position in min heap
+    double load;
+    double bgload; // background load
+    #if (CMK_CUDA || CMK_HIP)
+    double bg_walltime;  // member exists only in CUDA/HIP builds; not initialized by the constructor
+    #endif
+    float speed;
+  };
+
+  class GObj {  // per-object record; plain data with no constructor, so fields start uninitialized
+  public:
+    int id;
+    double load;
+    int oldPE;
+    size_t gpuPupSize;
+    size_t gpuAllocSize;
+  };
+
+  class ObjLoadGreater {  // comparator over GObj pointers
+  public:
+    inline bool operator() (const GObj *o1, const GObj *o2) const {
+      return (o1->load > o2->load);  // true when o1 carries strictly more load than o2
+    }
+  };
+
+  class PHeap;  // forward declaration; not defined in this header
+  class Solution;  // forward declaration; not defined in this header
+
+  double fillData(LDStats *stats,
+                  std::vector<GObj> &objs,
+                  std::vector<GObj*> &pobjs,
+                  std::vector<GProc> &procs,
+                  PHeap &procHeap);
+
+  double greedyLB(const std::vector<GObj*> &pobjs, PHeap &procHeap, const BaseLB::LDStats *stats) const;
+  void sendSolution(double maxLoad, int migrations);
+
+  double strategyStartTime;
+  double totalObjLoad;
+  int availablePes;
+  float migrationTolerance;  // written by setMigrationTolerance()
+  int totalObjs;
+
+#if __DEBUG_GREEDY_REFINE_
+  void dumpObjLoads(std::vector<GObj> &objs);  // declared only when __DEBUG_GREEDY_REFINE_ is non-zero
+  void dumpProcLoads(std::vector<GProc> &procs);  // declared only when __DEBUG_GREEDY_REFINE_ is non-zero
+#endif
+};
+
+#endif
+
+/*@}*/
\ No newline at end of file
diff --git a/src/ck-ldb/LBDatabase.C b/src/ck-ldb/LBDatabase.C
index 8c6c1b1d4c..67acda9be1 100644
--- a/src/ck-ldb/LBDatabase.C
+++ b/src/ck-ldb/LBDatabase.C
@@ -6,6 +6,7 @@
 LBDatabase::LBDatabase() {
   omCount = omsRegistering = 0;
   obj_walltime = 0;
+  obj_gputime = 0;
   statsAreOn = false;
   objsEmptyHead = -1;
   commTable = new LBCommTable;
@@ -243,6 +244,12 @@ void LBDatabase::GetTime(LBRealType *total_walltime, LBRealType *total_cputime,
   //CkPrintf("HERE [%d] total: %f %f obj: %f %f idle: %f bg: %f\n", CkMyPe(), *total_walltime, *total_cputime, obj_walltime, obj_cputime, *idletime, *bg_walltime);
 }
 
+void LBDatabase::GetGPUBGTime(LBRealType *bg_gputime)
+{
+  // TODO: implement this properly
+  *bg_gputime = 0;
+}
+
 void LBDatabase::ClearLoads(void)
 {
   int i;
@@ -256,6 +263,9 @@ void LBDatabase::ClearLoads(void)
         obj->lastCpuTime = obj->data.cpuTime;
 #endif
       }
+#if CMK_CUDA || CMK_HIP
+      obj->data.gpuTime = 0.0;
+#endif
       obj->data.wallTime = 0.0;
 #if CMK_LB_CPUTIMER
       obj->data.cpuTime = 0.0;
@@ -266,6 +276,7 @@ void LBDatabase::ClearLoads(void)
   commTable = new LBCommTable;
   machineUtil.Clear();
   obj_walltime = 0;
+  obj_gputime = 0;
 #if CMK_LB_CPUTIMER
   obj_cputime = 0;
 #endif
@@ -328,3 +339,17 @@ void LBDatabase::EstObjLoad(const LDObjHandle &_h, double cputime)
   obj->setTiming(cputime);
 #endif
 }
+
+void LBDatabase::EstObjGPULoad(const LDObjHandle &_h, double gputime)
+{
+#if CMK_CUDA || CMK_HIP
+#if CMK_LBDB_ON
+  LBObj *const obj = LbObj(_h);
+
+  CmiAssert(obj != NULL);
+  obj->data.gpuTime = gputime;
+#endif
+#else
+    CmiAbort("LBDatabase::EstObjGPULoad called but CMK_CUDA is not set");
+#endif
+}
diff --git a/src/ck-ldb/LBDatabase.h b/src/ck-ldb/LBDatabase.h
index b344d7c29f..ce7c37a469 100644
--- a/src/ck-ldb/LBDatabase.h
+++ b/src/ck-ldb/LBDatabase.h
@@ -3,12 +3,14 @@
 
 #include "lbdb.h"
 
+#include "objid.h"
 #include "LBObj.h"
 #include "LBOM.h"
 #include "LBComm.h"
 #include "LBMachineUtil.h"
 
 #include <vector>
+#include <unordered_map>
 
 class CkSyncBarrier;
 
@@ -32,6 +34,7 @@ friend class LBManager;
   LBCommTable* commTable;
   bool statsAreOn;
   double obj_walltime;
+  double obj_gputime;
   LBMachineUtil machineUtil;
   CkSyncBarrier* syncBarrier;
 
@@ -48,6 +51,13 @@ friend class LBManager;
 #endif
     }
   }
+
+  inline void MeasuredObjGPUTime(double gputime) {
+    if (statsAreOn) {
+      obj_gputime += gputime;
+    }
+  }
+
   inline LBOM* LbOM(LDOMHandle h) {
     return oms[h.handle];
   }
@@ -67,6 +77,37 @@ friend class LBManager;
     LbObj(h)->getTime(&walltime, &cputime);
   };
 
+  inline void GetObjGPULoad(LDObjHandle &h, LBRealType &gputime) {
+    LbObj(h)->getGPUTime(&gputime);
+  };
+
+  // Store measured GPU time on every live object that has an entry in
+  // id_gputimeMap; objects without an entry are left untouched.
+  //
+  // The CUPTI map is keyed by raw element IDs (from CkMigratable::ckGetID()),
+  // while the LB database stores IDs with collection bits prepended (when
+  // CMK_GLOBAL_LOCATION_UPDATE is set), so the collection bits are stripped
+  // before the lookup. Map values are divided by 1.0e9 before being stored
+  // (presumably nanoseconds -> seconds; confirm against the producer).
+  inline void SetObjGPULoad(std::unordered_map<uint64_t, uint64_t> &id_gputimeMap)
+  {
+    for (size_t i = 0; i < objs.size(); i++) {
+      // Slots whose object pointer is null hold no live object.
+      if (objs[i].obj == nullptr)
+        continue;
+
+      const CmiUInt8 lb_id = objs[i].obj->ObjData().objID();
+      const CmiUInt8 raw_id = ck::ObjID(lb_id).getElementID();
+
+      const auto it = id_gputimeMap.find(raw_id);
+      if (it == id_gputimeMap.end())
+        continue;
+
+      // NOTE: setGPUTiming() aborts in builds without CMK_CUDA/CMK_HIP.
+      objs[i].obj->setGPUTiming(it->second / 1.0e9);
+    }
+  }
+
   inline void* GetObjUserData(LDObjHandle &h) {
     return LbObj(h)->getLocalUserData();
   }
@@ -89,6 +130,7 @@ friend class LBManager;
   inline void NonMigratable(LDObjHandle h) { LbObj(h)->SetMigratable(false); };
   inline void Migratable(LDObjHandle h) { LbObj(h)->SetMigratable(true); };
   inline void setPupSize(LDObjHandle h, size_t pup_size) { LbObj(h)->setPupSize(pup_size);};
+  inline void setGPUPupSize(LDObjHandle h, size_t gpu_pup_size) { LbObj(h)->setGPUPupSize(gpu_pup_size);};
   inline void UseAsyncMigrate(LDObjHandle h, bool flag) { LbObj(h)->UseAsyncMigrate(flag); };
   inline int GetCommDataSz(void) {
     if (commTable)
@@ -121,12 +163,14 @@ friend class LBManager;
                           int migratable);
   void UnregisterObj(LDObjHandle h);
   void EstObjLoad(const LDObjHandle &h, double cpuload);
+  void EstObjGPULoad(const LDObjHandle &h, double gputime);
   void BackgroundLoad(LBRealType *walltime, LBRealType *cputime);
   void Send(const LDOMHandle &destOM, const CmiUInt8 &destID, unsigned int bytes, int destObjProc, int force = 0);
   void MulticastSend(const LDOMHandle &_om, CmiUInt8 *_ids, int _n, unsigned int _b, int _nMsgs=1);
   void GetTime(LBRealType *total_walltime, LBRealType *total_cputime,
                LBRealType *idletime, LBRealType *bg_walltime,
                LBRealType *bg_cputime);
+  void GetGPUBGTime(LBRealType *bg_gputime);
   const std::vector<LBObjEntry>& getObjs() {return objs;}
 
   inline void ObjectStart(const LDObjHandle &h) {
@@ -143,6 +187,10 @@ friend class LBManager;
       obj->StopTimer(&walltime, &cputime);
       obj->IncrementTime(walltime, cputime);
       MeasuredObjTime(walltime, cputime);
+
+      #if CMK_CUDA || CMK_HIP
+      MeasuredObjGPUTime(obj->data.gpuTime);
+      #endif
     }
   };
   inline const LDObjHandle &GetObjHandle(int idx) {
diff --git a/src/ck-ldb/LBManager.C b/src/ck-ldb/LBManager.C
index 2c6798873f..fbfc38e03b 100644
--- a/src/ck-ldb/LBManager.C
+++ b/src/ck-ldb/LBManager.C
@@ -8,6 +8,8 @@
 #include <ck.h>
 #include "cksyncbarrier.h"
 
+#include "hapi_portable.h"
+
 #include "DistributedLB.h"
 #include "LBManager.h"
 #include "LBSimulation.h"
@@ -83,6 +85,7 @@ class LBDBRegistry
   {
     lbtables.emplace_back(name, fn, afn, help, shown);
   }
+  bool hasBalancers() const { return !runtime_lbs.empty() || !compile_lbs.empty(); }
   void addCompiletimeBalancer(const char* name) { compile_lbs.push_back(name); }
   void addRuntimeBalancer(const char* name, const char* legacyLBName = nullptr)
   {
@@ -125,6 +128,11 @@ void LBRegisterBalancer(std::string name, LBCreateFn fn, LBAllocFn afn, std::str
 
 LBAllocFn getLBAllocFn(const char* lbname) { return lbRegistry.getLBAllocFn(lbname); }
 
+bool LBHasBalancersRegistered()
+{
+  return lbRegistry.hasBalancers();
+}
+
 // create a load balancer group using the strategy name
 static void createLoadBalancer(const std::string& lbname, const char* legacybalancer = nullptr)
 {
@@ -212,6 +220,8 @@ void _loadbalancerInit()
       lbNames.push_back("Refine");
       lbNames.push_back("Hybrid");
       lbNames.push_back("MetisLB");
+      lbNames.push_back("GreedyCentralLB");
+      lbNames.push_back("GreedyRefineCentralLB");
       if (CkMyPe() == 0)
       {
         if (CmiGetArgStringDesc(argv, "+balancer", &balancer, "Use this load balancer"))
@@ -316,6 +326,8 @@ void _loadbalancerInit()
   CmiGetArgIntDesc(argv, "+LBVersion", &_lb_args.lbversion(),
                    "LB database file version number");
   CmiGetArgIntDesc(argv, "+LBCentPE", &_lb_args.central_pe(), "CentralLB processor");
+  CmiGetArgIntDesc(argv, "+LBPercentMovesAllowed", &_lb_args.percentMovesAllowed(),
+                   "For GreedyRefineCentralLB, the percentage of chares that can be moved");
   bool _lb_dump_activated = false;
   if (CmiGetArgIntDesc(argv, "+LBDump", &LBSimulation::dumpStep,
                        "Dump the LB state from this step"))
@@ -533,10 +545,12 @@ void LBManager::init(void)
 {
   mystep = 0;
   new_ld_balancer = 0;
+  lb_in_progress = false;
   chare_count = 0;
   metabalancer = nullptr;
   lbdb_obj = new LBDatabase();
   currentLBIndex = 0;
+  reallocBuffer = nullptr;
 #if CMK_LB_CPUTIMER
   obj_cputime = 0;
 #endif
@@ -568,6 +582,7 @@ int LBManager::AddStartLBFn(std::function<void()> fn)
 
   callbk->fn = fn;
   callbk->on = true;
+  CkPrintf("Registering StartLB function %p\n", (void*)callbk);
   startLBFnList.push_back(callbk);
   startLBFn_count++;
   return startLBFnList.size() - 1;
@@ -586,6 +601,7 @@ void LBManager::RemoveStartLBFn(int handle)
 
 void LBManager::StartLB()
 {
+  CkPrintf("Start LB called, count %d\n", startLBFn_count);
   if (startLBFn_count == 0)
   {
     CmiAbort("StartLB is not supported in this LB");
@@ -593,7 +609,12 @@ void LBManager::StartLB()
   for (int i = 0; i < startLBFnList.size(); i++)
   {
     StartLBCB* startLBFn = startLBFnList[i];
-    if (startLBFn && startLBFn->on) startLBFn->fn();
+    CkPrintf("StartLB checking function %d: %p, %d\n", i, (void*)startLBFn, (startLBFn && startLBFn->on) ? 1 : 0);  // slot may be null
+    if (startLBFn && startLBFn->on)
+    {
+      CkPrintf("Invoking StartLB function %p\n", (void*)&startLBFn->fn);
+      startLBFn->fn();
+    }
   }
 }
 
@@ -751,7 +772,10 @@ void LBManager::nextLoadbalancer(int seq)
 // switch strategy
 void LBManager::switchLoadbalancer(int switchFrom, int switchTo)
 {
-  if (lbNames[switchTo] != "DistributedLB" && lbNames[switchTo] != "MetisLB")
+  if (lbNames[switchTo] != "DistributedLB" &&
+    lbNames[switchTo] != "MetisLB" && 
+    lbNames[switchTo] != "GreedyCentralLB" && 
+    lbNames[switchTo] != "GreedyRefineCentralLB")
   {
     json config;
     if (lbNames[switchTo] == "Hybrid")
@@ -806,8 +830,9 @@ void LBManager::pup(PUP::er& p)
       avail_vector_set = true;
       p | avail_vector;
       // If we're restarting with more PEs, make the new ones available
-      if (avail_vector.size() < CkNumPes())
-        avail_vector.resize(CkNumPes(), 1);
+      // NOTE(review): whatever `p | avail_vector` produced above is overwritten here and
+      // every PE is marked available (not only newly added PEs) -- confirm this is intended.
+      avail_vector = std::vector<char>(CkNumPes(), 1);
     }
     else
     {
@@ -823,6 +848,7 @@ void LBManager::pup(PUP::er& p)
   p | mystep;
   if (p.isUnpacking())
   {
+    reallocBuffer = nullptr;
     if (_lb_args.metaLbOn())
     {
       // if unpacking set metabalancer using the id
@@ -1035,6 +1061,9 @@ int LDProcessorSpeed()
     wps = (int)((double)wps * correction + 0.5);
   }
 
+  if (_lb_args.debug() > 1)
+    CmiPrintf("LB> PE %d speed is %d\n", CkMyPe(), wps);
+
   return wps;
 }
 
@@ -1048,6 +1077,66 @@ int LBManager::ProcessorSpeed()
   return peSpeed;
 }
 
+// Returns an integer speed estimate for the GPU picked for this PE, derived from
+// SM count, max threads per SM and clock rate; aborts if no GPU can be queried.
+// NOTE(review): the result is cached in a function-local static and hapiSetDevice()
+// switches the active device -- confirm both are fine when PEs share a process.
+int LBManager::ProcessorGPUSpeed()
+{
+#if CMK_CUDA || CMK_HIP
+  static int gpuSpeed = -1; // Cache the result
+  if (gpuSpeed != -1) return gpuSpeed;
+
+  // Check if GPU is available
+  int deviceCount = 0;
+  if (hapiGetDeviceCount(&deviceCount) != hapiSuccess || deviceCount == 0) {
+    CmiAbort("LB> PE %d: No GPU available, GPU speed = 0\n", CkMyPe());
+  }
+
+  // Get device for this PE (round-robin assignment)
+  int deviceId = CkMyPe() % deviceCount;
+  if (hapiSetDevice(deviceId) != hapiSuccess) {
+    CmiAbort("LB> PE %d: Failed to set GPU device %d, GPU speed = 0\n", CkMyPe(), deviceId);
+  }
+
+  // Get device properties
+  hapiDeviceProp prop;
+  if (hapiGetDeviceProperties(&prop, deviceId) != hapiSuccess) {
+    CmiAbort("LB> PE %d: Failed to get GPU device properties, GPU speed = 0\n", CkMyPe());
+  }
+
+  int clockRate = 0;
+  if (hapiDeviceGetAttribute(&clockRate, hapiDevAttrClockRate, deviceId) != hapiSuccess) {
+    CmiAbort("LB> PE %d: Failed to get GPU clock rate, GPU speed = 0\n", CkMyPe());
+  }
+
+  // Calculate theoretical peak single-precision FLOPS
+  // Formula: multiProcessorCount * maxThreadsPerMultiProcessor * clockRate(KHz) * 2(FMA)
+  // Convert to GFLOPS and then scale to integer for comparison with CPU speed
+  long long peakFLOPS = (long long)prop.multiProcessorCount *
+                        prop.maxThreadsPerMultiProcessor *
+                        clockRate * 2LL; // 2 for FMA (multiply-add)
+
+  // Convert from KHz*ops to GFLOPS, then scale to reasonable integer range
+  double gflops = peakFLOPS / 1e6; // KHz to GHz conversion for GFLOPS
+
+  // Scale to integer range similar to CPU ProcessorSpeed (typically 1-10000)
+  // Use a scaling factor to make GPU speeds comparable to CPU speeds
+  gpuSpeed = (int)(gflops / 100.0); // Scale down GFLOPS to reasonable range
+  if (gpuSpeed < 1) gpuSpeed = 1; // Minimum speed
+
+  if (_lb_args.debug() > 1) {
+    CmiPrintf("LB> PE %d GPU %s: %d SMs, %d threads/SM, %d MHz, %.1f GFLOPS -> speed %d\n", 
+              CkMyPe(), prop.name, prop.multiProcessorCount,
+              prop.maxThreadsPerMultiProcessor, clockRate/1000, gflops, gpuSpeed);
+  }
+
+  return gpuSpeed;
+#else
+  CmiAbort("LB> PE %d: ProcessorGPUSpeed() GPU support not enabled in this build\n", CkMyPe());
+#endif
+}
+
 /*
   callable from user's code
 */
diff --git a/src/ck-ldb/LBManager.ci b/src/ck-ldb/LBManager.ci
index e424b4cd67..5556777ad8 100644
--- a/src/ck-ldb/LBManager.ci
+++ b/src/ck-ldb/LBManager.ci
@@ -10,6 +10,7 @@ module LBManager {
 
   group [migratable] LBManager {
     entry void LBManager(void);
+    entry void StartLB();
     entry void ResumeClients();
     initnode void initnodeFn();
   };
diff --git a/src/ck-ldb/LBManager.h b/src/ck-ldb/LBManager.h
index 314513422e..af2999088e 100644
--- a/src/ck-ldb/LBManager.h
+++ b/src/ck-ldb/LBManager.h
@@ -7,6 +7,7 @@
 #define LBMANAGER_H
 
 #include <cassert>
+#include <unordered_map>
 
 #include "LBDatabase.h"
 #include "json_fwd.hpp"
@@ -44,6 +45,7 @@ class CkLBArgs
   bool _lb_metaLbOn;
   char* _lb_metaLbModelDir;
   char* _lb_treeLBFile = (char*)"treelb.json";
+  int _lb_percentMovesAllowed;  // for GreedyRefineCentralLB, as percentage of chares that can be moved
 
  public:
   CkLBArgs()
@@ -60,6 +62,7 @@ class CkLBArgs
     _lb_targetRatio = 1.05;
     _lb_metaLbOn = false;
     _lb_metaLbModelDir = nullptr;
+    _lb_percentMovesAllowed = 100;
   }
   inline char*& treeLBFile() { return _lb_treeLBFile; }
   inline double& lbperiod() { return _autoLbPeriod; }
@@ -82,6 +85,7 @@ class CkLBArgs
   inline double& targetRatio() { return _lb_targetRatio; }
   inline bool& metaLbOn() { return _lb_metaLbOn; }
   inline char*& metaLbModelDir() { return _lb_metaLbModelDir; }
+  inline int& percentMovesAllowed() { return _lb_percentMovesAllowed; }
 };
 
 extern CkLBArgs _lb_args;
@@ -96,6 +100,8 @@ extern bool _lb_psizer_on;
 #define PredictorPrintf \
   if (PREDICT_DEBUG) CmiPrintf
 
+extern void realloc(char*);  // NOTE(review): overloads the C library name realloc(void*, size_t); defined outside this header
+
 // used in constructor of all load balancers
 class CkLBOptions
 {
@@ -134,6 +140,7 @@ typedef BaseLB* (*LBAllocFn)();
 void LBDefaultCreate(LBCreateFn f);
 
 void LBRegisterBalancer(std::string, LBCreateFn, LBAllocFn, std::string, bool shown = true);
+bool LBHasBalancersRegistered();
 
 template <typename T>
 void LBRegisterBalancer(std::string name, std::string description, bool shown = true)
@@ -239,8 +246,12 @@ class LBManager : public CBase_LBManager
 
   int startLBFn_count;
 
+  char* reallocBuffer;
+
  public:
   int chare_count;
+  bool lb_in_progress;
+
 
   LBManager(void) { init(); }
   LBManager(CkMigrateMessage* m) : CBase_LBManager(m) { init(); }
@@ -264,6 +275,22 @@ class LBManager : public CBase_LBManager
   void configureTreeLB(const char* json_str);
   void configureTreeLB(json& config);
 
+  void bufferRealloc(char* bitmap)  // stash a private copy of bitmap for a later callRealloc()
+  {
+    int size = CkNumPes() + 2 * sizeof(int);  // bytes copied from bitmap; the layout is defined by the caller
+    reallocBuffer = (char*)malloc(size);  // NOTE(review): a previously buffered copy is overwritten without free
+    memcpy(reallocBuffer, bitmap, size);
+  }
+
+  void callRealloc()  // forward the buffered bitmap, if any, to realloc(char*) once
+  {
+    if (reallocBuffer != nullptr)
+    {
+      realloc(reallocBuffer);  // NOTE(review): buffer is not freed here; presumably realloc() takes ownership -- confirm
+      reallocBuffer = nullptr;
+    }
+  }
+
   /*
    * Calls from object managers to load database
    */
@@ -288,6 +315,7 @@ class LBManager : public CBase_LBManager
   void NonMigratable(LDObjHandle h) { lbdb_obj->NonMigratable(h); }
   void Migratable(LDObjHandle h) { lbdb_obj->Migratable(h); }
   void setPupSize(LDObjHandle h, size_t pup_size) { lbdb_obj->setPupSize(h, pup_size); }
+  void setGPUPupSize(LDObjHandle h, size_t gpu_pup_size) { lbdb_obj->setGPUPupSize(h, gpu_pup_size); }
   void UseAsyncMigrate(LDObjHandle h, bool flag) { lbdb_obj->UseAsyncMigrate(h, flag); };
   int GetObjDataSz(void) { return lbdb_obj->GetObjDataSz(); }
   int GetCommDataSz(void) { return lbdb_obj->GetCommDataSz(); }
@@ -310,6 +338,14 @@ class LBManager : public CBase_LBManager
   {
     lbdb_obj->GetObjLoad(h, walltime, cputime);
   };
+   void GetObjGPULoad(LDObjHandle& h, LBRealType& gputime)
+  {
+    lbdb_obj->GetObjGPULoad(h, gputime);
+  };
+  void SetObjGPULoad(std::unordered_map<uint64_t, uint64_t> &id_gputimeMap)
+  {
+    lbdb_obj->SetObjGPULoad(id_gputimeMap);
+  }
   void* GetObjUserData(LDObjHandle& h) { return lbdb_obj->GetObjUserData(h); }
   void MetaLBCallLBOnChares() { lbdb_obj->MetaLBCallLBOnChares(); }
   void MetaLBResumeWaitingChares(int lb_period)
@@ -329,6 +365,10 @@ class LBManager : public CBase_LBManager
   {
     lbdb_obj->GetTime(total_walltime, total_cputime, idletime, bg_walltime, bg_cputime);
   }
+  void GetGPUBGTime(LBRealType* bg_gputime)
+  {
+    lbdb_obj->GetGPUBGTime(bg_gputime);
+  }
   LDObjHandle RegisterObj(LDOMHandle omh, CmiUInt8 id, void* userPtr, int migratable)
   {
     return lbdb_obj->RegisterObj(omh, id, userPtr, migratable);
@@ -338,6 +378,10 @@ class LBManager : public CBase_LBManager
   {
     lbdb_obj->EstObjLoad(h, cpuload);
   }
+  void EstObjGPULoad(const LDObjHandle& h, double gputime)
+  {
+    lbdb_obj->EstObjGPULoad(h, gputime);
+  }
   void BackgroundLoad(LBRealType* walltime, LBRealType* cputime)
   {
     lbdb_obj->BackgroundLoad(walltime, cputime);
@@ -476,6 +520,7 @@ class LBManager : public CBase_LBManager
   void LocalBarrierOff(void);
   void ResumeClients();
   static int ProcessorSpeed();
+  static int ProcessorGPUSpeed();
   static void SetLBPeriod(double period)
   {
     _lb_args.lbperiod() = period;
diff --git a/src/ck-ldb/LBObj.C b/src/ck-ldb/LBObj.C
index 36ca87d85f..df04e87d62 100644
--- a/src/ck-ldb/LBObj.C
+++ b/src/ck-ldb/LBObj.C
@@ -28,6 +28,17 @@ void LBObj::Clear(void)
   data.minWall = 1e6;
   data.maxWall = 0.;
 #endif
+
+#if CMK_CUDA || CMK_HIP
+  data.gpuTime = 0.;
+#endif
+
+  startWTime = -1.0;
+  lastWallTime = .0;
+#if CMK_LB_CPUTIMER
+  startCTime = -1.0;
+  lastCpuTime = .0;
+#endif
 }
 
 void LBObj::IncrementTime(LBRealType walltime, LBRealType cputime)
@@ -42,6 +53,13 @@ void LBObj::IncrementTime(LBRealType walltime, LBRealType cputime)
 #endif
 }
 
+void LBObj::IncrementGPUTime(LBRealType walltime)
+{
+#if CMK_CUDA || CMK_HIP
+  data.gpuTime += walltime;
+#else
+  CmiAbort("LBObj::IncrementGPUTime called but CMK_CUDA is not set");
+#endif
+}
 #endif
-
 /*@}*/
diff --git a/src/ck-ldb/LBObj.h b/src/ck-ldb/LBObj.h
index be61c68b02..f944b3a19d 100644
--- a/src/ck-ldb/LBObj.h
+++ b/src/ck-ldb/LBObj.h
@@ -20,19 +20,7 @@ friend class LBDatabase;
     data.migratable = _migratable;
     data.asyncArrival = _asyncArrival;
     Clear();
-//    data.cpuTime = 0.;
-//    data.wallTime = 0.;
-//    data.minWall = 1e6;
-//    data.maxWall = 0.;
     localUserData = usr_ptr;
-//    migratable = _migratable;
-//    registered = true;
-    startWTime = -1.0;
-    lastWallTime = .0;
-#if CMK_LB_CPUTIMER
-    startCTime = -1.0;
-    lastCpuTime = .0;
-#endif
   }
 
   ~LBObj() { };
@@ -40,26 +28,29 @@ friend class LBDatabase;
   void Clear(void);
 
   void IncrementTime(LBRealType walltime, LBRealType cputime);
+  void IncrementGPUTime(LBRealType walltime);
+
   inline void StartTimer(void) {
-	startWTime = CkWallTimer();
+    startWTime = CkWallTimer();
 #if CMK_LB_CPUTIMER
-	startCTime = CkCpuTimer();
+    startCTime = CkCpuTimer();
 #endif
   }
+
   inline void StopTimer(LBRealType* walltime, LBRealType* cputime) {
-	if (startWTime >= 0.0) {	// in case startOn in middle of entry
-          const double endWTime = CkWallTimer();
-	  *walltime = endWTime - startWTime;
+    if (startWTime >= 0.0) {	// in case startOn in middle of entry
+      const double endWTime = CkWallTimer();
+      *walltime = endWTime - startWTime;
 #if CMK_LB_CPUTIMER
-          const double endCTime = CkCpuTimer();
-	  *cputime = endCTime - startCTime;
+      const double endCTime = CkCpuTimer();
+      *cputime = endCTime - startCTime;
 #else
-	  *cputime = *walltime;
+      *cputime = *walltime;
 #endif
-	}
-        else {
-          *walltime = *cputime = 0.0;
-        }
+    }
+    else {
+      *walltime = *cputime = 0.0;
+    }
   }
 
   inline void getTime(LBRealType *w, LBRealType *c) {
@@ -71,6 +62,14 @@ friend class LBDatabase;
 #endif
   }
 
+  inline void getGPUTime(LBRealType *w) {
+  #if CMK_CUDA || CMK_HIP
+    *w = data.gpuTime;
+  #else
+    CmiAbort("LBObj::getGPUTime called but CMK_CUDA is not set");
+  #endif
+  }
+
   inline void setTiming(LBRealType cputime)
   {
     data.wallTime = cputime;
@@ -79,12 +78,25 @@ friend class LBDatabase;
 #endif
   }
 
+  inline void setGPUTiming(LBRealType gputime)
+  {
+  #if CMK_CUDA || CMK_HIP
+    data.gpuTime = gputime;
+  #else
+    CmiAbort("LBObj::setGPUTiming called but CMK_CUDA is not set");
+  #endif
+  }
+
   inline LDOMHandle &parentOM() { return data.handle.omhandle; }
   inline const LDObjHandle &GetLDObjHandle() const { return data.handle; }
   inline void SetMigratable(bool mig) { data.migratable = mig; }
   inline void setPupSize(size_t obj_pup_size) {
     data.pupSize = pup_encodeSize(obj_pup_size);
   }
+  inline void setGPUPupSize(size_t obj_gpu_pup_size){
+    data.gpuPupSize = obj_gpu_pup_size;
+  }
+  
   inline void UseAsyncMigrate(bool async) { data.asyncArrival = async; }
   inline LDObjData &ObjData() { return data; };
   inline void lastKnownLoad(LBRealType *w, LBRealType *c) {
diff --git a/src/ck-ldb/Make.lb b/src/ck-ldb/Make.lb
index 4b53c60d48..a34616b2bb 100644
--- a/src/ck-ldb/Make.lb
+++ b/src/ck-ldb/Make.lb
@@ -4,6 +4,8 @@ COMMON_LDBS=\
    DistributedLB \
    MetisLB \
    RecBipartLB \
+   GreedyCentralLB \
+   GreedyRefineCentralLB \
    manager.o
 
 ALL_LDBS=\
@@ -11,6 +13,8 @@ ALL_LDBS=\
    DistributedLB \
    MetisLB \
    RecBipartLB \
+   GreedyCentralLB \
+   GreedyRefineCentralLB \
    manager.o
 
 $(L)/libmoduleTreeLB.a:
@@ -19,6 +23,12 @@ LBHEADERS += TreeLB.h TreeLB.decl.h
 $(L)/libmoduleDistributedLB.a:
 LBHEADERS += DistributedLB.h DistributedLB.decl.h
 
+$(L)/libmoduleGreedyCentralLB.a:
+LBHEADERS += GreedyCentralLB.h GreedyCentralLB.decl.h
+
+$(L)/libmoduleGreedyRefineCentralLB.a:
+LBHEADERS += GreedyRefineCentralLB.h GreedyRefineCentralLB.decl.h
+
 $(L)/libmoduleMetisLB.a:
 LBHEADERS += MetisLB.h MetisLB.decl.h
 
@@ -37,6 +47,8 @@ ALL_LB_OBJS=EveryLB.o \
     TreeLB.o \
     DistributedLB.o \
     MetisLB.o \
+    GreedyCentralLB.o \
+    GreedyRefineCentralLB.o \
     RecBipartLB.o \
     ScotchLB.o \
     TempAwareRefineLB.o \
@@ -46,12 +58,16 @@ EVERYLB_DEPS=EveryLB.o \
     DistributedLB.o \
     MetisLB.o \
     RecBipartLB.o \
+    GreedyCentralLB.o \
+    GreedyRefineCentralLB.o
 # CommonLBs dependencies
 COMMONLBS_DEPS=CommonLBs.o \
     TreeLB.o \
     DistributedLB.o \
     MetisLB.o \
     RecBipartLB.o \
+    GreedyCentralLB.o \
+    GreedyRefineCentralLB.o \
     manager.o \
 
 $(L)/libmoduleEveryLB.a: $(EVERYLB_DEPS)
diff --git a/src/ck-ldb/Makefile_lb.sh b/src/ck-ldb/Makefile_lb.sh
index 662052596d..96ee55bb30 100755
--- a/src/ck-ldb/Makefile_lb.sh
+++ b/src/ck-ldb/Makefile_lb.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #Typical load balancers
-COMMON_LDBS="TreeLB DistributedLB MetisLB RecBipartLB"
+COMMON_LDBS="TreeLB DistributedLB MetisLB GreedyCentralLB GreedyRefineCentralLB RecBipartLB"
 #Load balancers for more specialized circumstances
 SPECIALIZED_LDBS=""
 #Load balanders which have an external dependency, or require some other kind of intervention
diff --git a/src/ck-ldb/MetisLB.C b/src/ck-ldb/MetisLB.C
index e1fc0fabf1..8d1a6a0ee1 100644
--- a/src/ck-ldb/MetisLB.C
+++ b/src/ck-ldb/MetisLB.C
@@ -85,12 +85,19 @@ void MetisLB::work(LDStats* stats)
   std::vector<idx_t> adjwgt(numEdges);
 
   int edgeNum = 0;
-  const double ratio = 256.0 / maxLoad;
+  double ratio;
+  if (maxLoad == 0)
+    ratio = 0;
+  else
+    ratio = 256.0 / maxLoad;
 
   for (int i = 0; i < numVertices; i++)
   {
     xadj[i] = edgeNum;
-    vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio);
+    if (ogr->vertices[i].getVertexLoad() == 0 && ratio == 0)
+      vwgt[i] = 1;
+    else
+      vwgt[i] = (int)ceil(ogr->vertices[i].getVertexLoad() * ratio);
     for (const auto& outEdge : ogr->vertices[i].sendToList)
     {
       adjncy[edgeNum] = outEdge.getNeighborId();
@@ -151,9 +158,16 @@ void MetisLB::work(LDStats* stats)
   // tpwghts: target partition weight, can pass NULL to equally divide
   // ubvec: of size ncon to indicate allowed load imbalance tolerance (> 1.0)
   // options: array of options; edgecut: stores the edgecut; pemap: mapping
-  METIS_PartGraphRecursive(&numVertices, &ncon, xadj.data(), adjncy.data(), vwgt.data(),
-                           vsize, adjwgt.data(), &numPes, tpwgts, ubvec.data(),
-                           options.data(), &edgecut, pemap.data());
+  CkPrintf("Metis partitioning in %i partitions\n", parr->availProcSize);
+  
+  if (parr->availProcSize > 1)
+    METIS_PartGraphRecursive(&numVertices, &ncon, xadj.data(), adjncy.data(), vwgt.data(),
+                            vsize, adjwgt.data(), &parr->availProcSize, tpwgts, ubvec.data(),
+                            options.data(), &edgecut, pemap.data());
+  else
+    pemap.resize(numVertices, 0);
+  
+  parr->reassignPeMapToAvailable(pemap);
 
   if (_lb_args.debug() >= 1)
   {
diff --git a/src/ck-ldb/TreeBuilder.h b/src/ck-ldb/TreeBuilder.h
index ecdcb25333..0780e9c32b 100644
--- a/src/ck-ldb/TreeBuilder.h
+++ b/src/ck-ldb/TreeBuilder.h
@@ -133,24 +133,6 @@ class PE_Root_Tree : public LBTreeBuilder
       logic[1] = level;
     }
 
-    if (CkMyPe() == 0 && !quietModeRequested)
-    {
-      CkPrintf("[%d] TreeLB: Using PE_Root tree with: ", CkMyPe());
-      for (const auto& strategy : strategies)
-      {
-        CkPrintf("%s ", strategy.c_str());
-      }
-      CkPrintf("\n");
-
-      if (_lb_args.debug() > 0)
-      {
-        CkPrintf(
-            "\tUsing %d as root\n"
-            "\tTest PE Speed: %s\n",
-            rootPE, _lb_args.testPeSpeed() ? "true" : "false");
-      }
-    }
-
     return L;
   }
 };
diff --git a/src/ck-ldb/TreeLB.C b/src/ck-ldb/TreeLB.C
index 9ca4ba8f5d..564ff9fea6 100644
--- a/src/ck-ldb/TreeLB.C
+++ b/src/ck-ldb/TreeLB.C
@@ -4,11 +4,21 @@
 #include "TreeLB.h"
 #include "TreeStrategyFactory.h"
 #include "spanningTree.h"
+#include "ck.h"
 #include <fstream>  // TODO delete if json file is read from LBManager
 #include <sstream>
 #include "json.hpp"
 
 extern int quietModeRequested;
+#if CMK_SHRINK_EXPAND
+extern "C" void charmrun_realloc(char *s);
+extern char willContinue;
+extern realloc_state pending_realloc_state;
+extern char * se_avail_vector;
+extern char *_shrinkexpand_basedir;
+extern int numProcessAfterRestart;
+extern bool load_balancer_created;
+#endif
 
 static void lbinit()
 {
@@ -20,6 +30,9 @@ static void lbinit()
   }
   LBRegisterBalancer<TreeLB>(
       "TreeLB", "Pluggable hierarchical LB with available strategies:" + o.str());
+#if CMK_SHRINK_EXPAND
+  load_balancer_created = true;
+#endif
 }
 
 void TreeLB::Migrated(int waitBarrier)
@@ -27,6 +40,25 @@ void TreeLB::Migrated(int waitBarrier)
   objMovedIn(waitBarrier);
 }
 
+// Starts a load-balancing round via restartFromSE(); requires the root-level
+// logic (logic[1]) to already hold at least one stats message.
+void TreeLB::StartLB(){
+  CkPrintf("TreeLB::StartLB called on PE %d\n", CkMyPe());
+  // Fail loudly instead of dereferencing a missing logic object or message.
+  if (!logic[1] || logic[1]->stats_msgs.size() == 0)
+    CkAbort("TreeLB::StartLB: no root-level stats message on PE %d\n", CkMyPe());
+  CkPrintf("size of stats_msgs = %d\n", (int)logic[1]->stats_msgs.size());
+
+  // Rate-aware when the message's speeds array does not alias obj_start.
+  LBStatsMsg_1* mm = (LBStatsMsg_1*)logic[1]->stats_msgs[0];
+  const bool rateAware = ((void*)mm->speeds != (void*)mm->obj_start);
+
+  // NOTE(review): a direct loadBalanceSubtree(numLevels - 1) path for the
+  // "no new PEs or not rate-aware" case used to sit here but was disabled;
+  // every start now goes through restartFromSE.
+  thisProxy.restartFromSE(rateAware);
+}
+
 void TreeLB::loadConfigFile(const CkLBOptions& opts)
 {
   config.clear();
@@ -121,6 +153,55 @@ void TreeLB::init(const CkLBOptions& opts)
 #endif
 }
 
+void TreeLB::collectSpeeds(int pe_id, float speed) {  // forwards one PE's reported speed to logic[1]
+  if (_lb_args.debug() > 2) CkPrintf("[PE %d] TreeLB::collectSpeeds from PE %d speed=%f\n", CkMyPe(), pe_id, speed);
+  if (logic[1]->collectSpeeds(pe_id, speed))  // a true result starts balancing at the top level
+    loadBalanceSubtree(numLevels - 1);
+  else
+    CkPrintf("[PE %d] TreeLB::collectSpeeds: still waiting for more speeds\n", CkMyPe());  // printed at every debug level
+}
+
+void TreeLB::restartFromSE(bool rateAware) {
+  // TODO: need to collect and recompute bg load as well for the new pes
+
+  if (CkMyPe() == 0 && rateAware) {
+    // if there was just 1 pe initially, the speed isn't set, so recompute it here
+    // TODO: ideally this should be rearranged so that the stats msgs are always set up correctly
+    LBStatsMsg_1* msg = (LBStatsMsg_1*)logic[1]->stats_msgs[0];
+    for (int i = 0; i < msg->nPes; i++) {
+        if (msg->pe_ids[i] == 0 && msg->speeds[i] == 1.0  ) {
+          msg->speeds[i] = lbmgr->ProcessorSpeed();
+        }
+      }
+  }
+  if (thisPeNew && rateAware) {
+    if (CkMyPe() == 0) CkAbort("[PE %d] Should never be new\n", CkMyPe());
+    float speed = float(lbmgr->ProcessorSpeed());
+    thisProxy[0].collectSpeeds(CkMyPe(), speed);
+    thisPeNew = false;
+  }
+
+  logic[0]->resetObjs();
+
+  if (CkMyPe() == 0 && !rateAware) {
+    loadBalanceSubtree(numLevels - 1);
+  }
+}
+
+// Resets the awaitingLB flags and numLevels for a two-level tree.
+void TreeLB::expand_init()
+{
+  const bool singlePe = (CkNumPes() == 1);
+  const bool isRoot = (CkMyPe() == 0);
+
+  // With a single PE neither level waits; otherwise every PE waits at level 0
+  // and only PE 0 additionally waits at level 1.
+  awaitingLB[0] = !singlePe;
+  awaitingLB[1] = !singlePe && isRoot;
+
+  numLevels = 2;
+}
+
 TreeLB::~TreeLB()
 {
 #if CMK_LBDB_ON
@@ -139,7 +220,7 @@ TreeLB::~TreeLB()
 void TreeLB::configure(LBTreeBuilder& builder, json& config)
 {
 #if CMK_LBDB_ON
-
+  if (_lb_args.debug() > 0)
   if (numLevels > 0 && CkMyPe() == 0 && !quietModeRequested)
   {
     CkPrintf("[%d] Reconfiguring TreeLB\n", CkMyPe());
@@ -206,23 +287,57 @@ void TreeLB::configure(json& config)
 
 void TreeLB::pup(PUP::er& p)
 {
-  std::string configString;
-  if (p.isPacking())
-  {
-    configString = config.dump();
-  }
-  p | configString;
-  if (p.isUnpacking())
-  {
-    config = json::parse(configString);
+  if (_lb_args.debug() > 2)
+    CkPrintf("[%d] TreeLB::pup numLevels=%d\n", CkMyPe(), numLevels);
+
+  p|seqno;
+  
+  if(p.isUnpacking()){
+    loadConfigFile(CkLBOptions(seqno));
     init(CkLBOptions(seqno));
+    manager_init();
   }
+
+  assert(numLevels == 2); // rn this only supports the two level tree
+
+  if (logic[1] == nullptr) { // TODO: delete this memory
+    logic[1] = new RootLevel(); // this is needed because logic[1] is null on PE1, but PE1 still needs to participate in this... confusing?
+  } 
+
+  if (_lb_args.debug() > 2)
+    CkPrintf("[%d] TreeLB::pupping logic things\n", CkMyPe());
+
+  int oldPE;
+  if (p.isPacking()) oldPE = CkMyPe();
+  p|oldPE;
+  if (p.isUnpacking()) {
+    if (CkMyPe() != oldPE) {
+      thisPeNew = true;
+    }
+  }
+
+  p|*logic[0];
+  p|*logic[1];  
+
+  if (p.isUnpacking())
+    expand_init();
 }
 
-void TreeLB::InvokeLB()
+void TreeLB::CallLB()
 {
-#if CMK_LBDB_ON
-  // NOTE: I'm assuming new LBManager will know when (and when not to) call AtSync
+  #if CMK_LBDB_ON
+  #if CMK_SHRINK_EXPAND
+  
+  // if (pending_realloc_state != NO_REALLOC) {
+  //   // if (_lb_args.debug() > 0)
+  //   //   CkPrintf("TreeLB::CallLB pending_realloc_state=%d (EXPAND_MSG_RECEIVED %d, NO_REALLOC %d)\n", pending_realloc_state, EXPAND_MSG_RECEIVED, NO_REALLOC);
+  //   configure(config); // reconfigure tree in case number of PEs changed
+  //   CkPrintf("Done reconfiguring tree\n");
+  // }
+
+
+  #endif    
+
   if (barrier_before_lb)
   {
     contribute(CkCallback(CkReductionTarget(TreeLB, ProcessAtSync), thisProxy));
@@ -231,6 +346,15 @@ void TreeLB::InvokeLB()
   {
     thisProxy[CkMyPe()].ProcessAtSync();
   }
+  #endif
+}
+
+void TreeLB::InvokeLB()
+{
+#if CMK_LBDB_ON
+  // NOTE: I'm assuming new LBManager will know when (and when not to) call AtSync
+  lbmgr->lb_in_progress = true;
+  CallLB();
 #endif
 }
 
@@ -242,22 +366,46 @@ void TreeLB::ProcessAtSync()
   {
     CkPrintf("--------- Started LB step %d ---------\n", lbmgr->step());
   }
-  // CmiAssert(CmiNodeAlive(CkMyPe()));   // TODO move this logic to LBManager
-  int level = 0;  // load balancing starts at the lowest level
-  CkAssert(numLevels > 0 && !awaitingLB[level]);
-  TreeLBMessage* stats = logic[level]->getStats();
-  stats->level = level;
-  awaitingLB[level] = true;
+  CkAssert(numLevels > 0 && !awaitingLB[0]);
+  TreeLBMessage* stats = logic[0]->getStats();
+  stats->level = 0;
+  awaitingLB[0] = true;
+
   sendStatsUp((CkMessage*)stats);
 #endif
 }
 
+void TreeLB::CheckForLB() {
+#if CMK_SHRINK_EXPAND
+//   // if (_lb_args.debug() > 0)
+//   //   CkPrintf("TreeLB::CheckForLB pending_realloc_state=%d (EXPAND_MSG_RECEIVED %d, NO_REALLOC %d)\n", pending_realloc_state, EXPAND_MSG_RECEIVED, NO_REALLOC);
+
+  if (pending_realloc_state == EXPAND_MSG_RECEIVED)
+    checkForRealloc();
+  //else if (pending_realloc_state == NO_REALLOC)
+  //  thisProxy.resumeClients(0);
+  else
+    loadBalanceSubtree(numLevels - 1);
+    //thisProxy.CallLB();
+#else
+  //thisProxy.CallLB();
+  loadBalanceSubtree(numLevels - 1);
+#endif
+}
+
 // send stats up using the comm-tree for this level
 void TreeLB::sendStatsUp(CkMessage* msg)
 {
   TreeLBMessage* stats = (TreeLBMessage*)msg;
   int level = stats->level;
+  if (comm_parent.size() <= level || comm_children.size() <= level ||
+      comm_logic.size() <= level)
+  {
+    CkAbort("TreeLB: sendStatsUp invalid level %d, or comm_parent not initialized\n", level);
+  }
+
   int comm_parent_pe = comm_parent[level];
+
   // fprintf(stderr, "[%d] TreeLB::sendStatsUp - received msg level=%d comm_parent=%d\n",
   // CkMyPe(), level, comm_parent_pe);
   if (comm_parent_pe == -1)
@@ -298,7 +446,16 @@ void TreeLB::receiveStats(TreeLBMessage* stats, int level)
     {
       // cutoff can be adjusted dynamically, to prevent lb between upper-level domains.
       // can be used, for example, to only do within-node lb on some steps
-      loadBalanceSubtree(level);
+      TreeLBMessage* newMsg = l->mergeStats();  // this is IN PLACE 
+      
+      #if CMK_SHRINK_EXPAND
+        //contribute(CkCallback(CkReductionTarget(TreeLB, CheckForLB), thisProxy[0]));
+        CheckForLB();
+      #else
+        //CallLB();
+        loadBalanceSubtree(level);
+      #endif
+      //loadBalanceSubtree(level);
     }
     else
     {
@@ -311,6 +468,7 @@ void TreeLB::receiveStats(TreeLBMessage* stats, int level)
 
 void TreeLB::loadBalanceSubtree(int level)
 {
+  if (_lb_args.debug()) CkPrintf("[PE %d] TreeLB::loadBalanceSubtree called for level %d, awaiting %s\n", CkMyPe(), level, awaitingLB[level] ? "true" : "false");
   if (!awaitingLB[level]) return;
   awaitingLB[level] = false;
   if (level == 0) return lb_done();
@@ -319,7 +477,8 @@ void TreeLB::loadBalanceSubtree(int level)
 
   /// CkMessage *inter_subtree_migrations = nullptr;
   IDM idm;
-  TreeLBMessage* decision = logic[level]->loadBalance(idm);
+  if (_lb_args.debug()) CkPrintf("[PE %d] Calling loadBalance at level %d\n", CkMyPe(), level);
+  TreeLBMessage* decision = logic[level]->loadBalance(idm); // this result is the MigMsg
   if (idm.size() > 0)
   {
     // this can happen when final destinations of chares has been decided,
@@ -342,6 +501,8 @@ void TreeLB::loadBalanceSubtree(int level)
   // send decision to next level
   decision->level = level - 1;
   sendDecisionDown((CkMessage*)decision);
+
+
 }
 
 void TreeLB::multicastIDM(const IDM& mig_order, int num_pes, int* _pes)
@@ -357,6 +518,8 @@ void TreeLB::multicastIDM(const IDM& mig_order, int num_pes, int* _pes)
       thisProxy[*tb.begin(i)].multicastIDM(mig_order, tb.subtreeSize(i), tb.begin(i));
   }
   migrateObjects(mig_order);
+
+  
 }
 
 void TreeLB::sendDecisionDown(CkMessage* msg)
@@ -397,11 +560,11 @@ void TreeLB::sendDecisionDown(CkMessage* msg)
 void TreeLB::receiveDecision(TreeLBMessage* decision, int level)
 {
   // fprintf(stderr, "[%d] TreeLB::receiveDecision, level=%d\n", CkMyPe(), level);
-
   // incoming and outgoing are integers. logic objects determine and interpret these
   // values
   int& incoming = expected_incoming[level];
   int& outgoing = expected_outgoing[level];
+  //CkPrintf("[PE %d] TreeLB::receiveDecision at level %d, incoming=%d outgoing=%d\n", CkMyPe(), level, incoming, outgoing);
   logic[level]->processDecision(decision, incoming, outgoing);
   // fprintf(stderr, "[%d] level=%d incoming=%d outgoing=%d\n", CkMyPe(), level, incoming,
   // outgoing);
@@ -471,10 +634,12 @@ void TreeLB::recvLoadTokens(CkMessage* tokens)
 #endif
   int load = logic[level]->tokensReceived(token_set);
   load_received[level] += load;
+  CkPrintf("[PE %d] TreeLB::recvLoadTokens, load_received = %d\n", CkMyPe(), load_received[level]);
+
   checkLoadExchanged(level);
 }
 
-void TreeLB::objMovedIn(bool waitBarrier)
+void TreeLB::objMovedIn(bool waitBarrier) // this should be called, but is not
 {
   if (!waitBarrier) CkAbort("TreeLB future migrates not supported\n");
 
@@ -483,6 +648,7 @@ void TreeLB::objMovedIn(bool waitBarrier)
   int level = 0;
   CkAssert(numLevels > 0 && awaitingLB[level]);
   load_received[level] += 1;
+
   checkLoadExchanged(level);
 }
 
@@ -497,7 +663,79 @@ void TreeLB::migrateObjects(const IDM& mig_order)
   checkLoadExchanged(level);
 }
 
+// Starts a rescale checkpoint when a shrink/expand request is pending; otherwise
+// invokes lb_done_impl() through thisProxy. Empty without CMK_SHRINK_EXPAND.
+void TreeLB::checkForRealloc() {
+#if CMK_SHRINK_EXPAND
+if (_lb_args.debug() > 0) {
+      CkPrintf(
+        "Check for Realloc. Number of stats messages: %d\n",
+        (int)logic[1]->stats_msgs.size()  // size() is a size_t; cast to match %d
+      );
+}
+
+  if(pending_realloc_state != NO_REALLOC) {
+    pending_realloc_state = (pending_realloc_state == SHRINK_MSG_RECEIVED) ? SHRINK_IN_PROGRESS : EXPAND_IN_PROGRESS; //in progress
+    CkPrintf("Load balancer invoking charmrun to handle reallocation on pe %d\n", CkMyPe());
+
+    // callback handed to the checkpoint call; targets resumeFromReallocCheckpoint on PE 0
+    CkCallback cb(CkIndex_TreeLB::resumeFromReallocCheckpoint(), thisProxy[0]);
+
+    // print avail vector
+    if (_lb_args.debug() > 0) {
+      CkPrintf("Shrink/Expand se_avail_vector on pe %d: ", CkMyPe());
+      for(int i=0;i<CkNumPes();i++) CkPrintf("%d ", se_avail_vector[i]);
+      CkPrintf("\n");
+    }
+
+    // checkpoint with a copy of the first CkNumPes() entries of se_avail_vector
+    CkStartRescaleCheckpoint(_shrinkexpand_basedir, cb, 
+      std::vector<char>(se_avail_vector, se_avail_vector + CkNumPes()));
+  }
+  else
+  {
+    thisProxy.lb_done_impl();
+  }
+#endif
+}
+
+void TreeLB::resumeFromReallocCheckpoint() {  // copies the availability vector, releases it, then sends it on
+#if CMK_SHRINK_EXPAND
+  std::vector<char> avail(se_avail_vector, se_avail_vector + CkNumPes());
+  free(se_avail_vector);
+  se_avail_vector = nullptr;  // do not leave the global dangling; other code tests it against NULL
+  thisProxy.willIbekilled(avail, numProcessAfterRestart);
+#endif
+}
+
+void TreeLB::willIbekilled(std::vector<char> avail, int newnumProcessAfterRestart){  // avail is not read in this body
+#if CMK_SHRINK_EXPAND
+  numProcessAfterRestart = newnumProcessAfterRestart;  // adopt the count carried by this invocation
+  CkCallback cb(CkIndex_TreeLB::startCleanup(), thisProxy[0]);  // callback targets startCleanup() on PE 0
+  contribute(cb);
+#endif
+}
+
+void TreeLB::startCleanup()
+{
+#if CMK_SHRINK_EXPAND
+  CkCleanup();
+#endif
+}
+
 void TreeLB::lb_done()
+{
+#if CMK_SHRINK_EXPAND
+  // barrier to check for reallocation
+  CkCallback cb(CkIndex_TreeLB::checkForRealloc(), thisProxy[0]);
+  contribute(cb);
+  return;
+#else
+    lb_done_impl();
+#endif
+}
+
+void TreeLB::lb_done_impl()
 {
   // fprintf(stderr, "[%d] lb_done step %d lb_time=%f\n", CkMyPe(), lbmgr->step(),
   // CkWallTimer() - startTime);
@@ -505,8 +743,15 @@ void TreeLB::lb_done()
   // TODO LBManager should do all of this, including global syncResume ******
   // Currently, TreeLB does syncResume by setting barrier_after_lb=true
 
-  // clear load stats
+
+#if CMK_SHRINK_EXPAND
+  // Only clear loads if not in the middle of a reallocation (EXPAND/SHRINK)
+  if (pending_realloc_state == NO_REALLOC){
+    lbmgr->ClearLoads();
+  }
+#else
   lbmgr->ClearLoads();
+#endif
 
   if (CkMyPe() == 0 && _lb_args.debug() > 0)
   {
@@ -555,6 +800,11 @@ void TreeLB::resumeClients()
     }
   }
   lbmgr->ResumeClients();
+
+  lbmgr->lb_in_progress = false;
+
+  if (CkMyPe() == 0)
+    lbmgr->callRealloc();
 }
 
 void TreeLB::reportLbTime(double* times, int n)
diff --git a/src/ck-ldb/TreeLB.ci b/src/ck-ldb/TreeLB.ci
index 312f9d38ca..4227fbe369 100644
--- a/src/ck-ldb/TreeLB.ci
+++ b/src/ck-ldb/TreeLB.ci
@@ -1,5 +1,7 @@
 module TreeLB {
-
+  PUPable LevelLogic;
+  PUPable RootLevel;
+  PUPable PELevel;
   include "idm.h";
 
   extern module BaseLB;
@@ -17,6 +19,17 @@ module TreeLB {
     entry void multicastIDM(IDM &mig_order, int num_pes, int _pes[num_pes]);
     entry [reductiontarget] void resumeClients(void);
     entry [reductiontarget] void reportLbTime(double times[n], int n);
+  
+    entry void resumeFromReallocCheckpoint();
+    entry void lb_done_impl();
+    entry void startCleanup();
+    entry [reductiontarget] void CheckForLB();
+    entry void CallLB();
+    entry void checkForRealloc();
+    entry void willIbekilled(std::vector<char> avail, int newnumProcessAfterRestart);
+
+    entry void restartFromSE(bool rateAware);
+    entry void collectSpeeds(int pe_id, float speed);
   };
 
 };
diff --git a/src/ck-ldb/TreeLB.h b/src/ck-ldb/TreeLB.h
index 31445f5dc3..aeadaf95ad 100644
--- a/src/ck-ldb/TreeLB.h
+++ b/src/ck-ldb/TreeLB.h
@@ -6,6 +6,7 @@
 #include "BaseLB.h"
 #include "TreeLB.decl.h"
 #include "json.hpp"
+#include "manager.h"
 #include <vector>
 using json = nlohmann::json;
 
@@ -25,18 +26,29 @@ class TreeLBMessage
  public:
   uint8_t level;
   // WARNING: don't add any virtual methods here
+  // NOTE(review): this virtual method contradicts the warning above; confirm it is safe for every message type deriving from this class
+  virtual void pup(PUP::er& p) { CkAbort("TreeLBMessage::pup not implemented\n"); }
 };
 
-class LevelLogic
+class LevelLogic : public PUP::able
 {
  public:
+   std::vector<TreeLBMessage*> stats_msgs;
+
+  LevelLogic() : PUP::able() {
+    num_stats_msgs = 0;
+    num_strategies = 0;
+  }
   virtual ~LevelLogic() {}
 
   /// return msg with lb stats for this PE. only needed at leaves
   virtual TreeLBMessage* getStats() { CkAbort("LevelLogic::getStats not implemented\n"); }
+  virtual bool collectSpeeds(int pe_id, float speed) { CkAbort("LevelLogic::collectSpeeds not implemented\n"); return false; }
+  virtual int getNumNewPes() { CkAbort("LevelLogic::getNumNewPes not implemented\n"); return 0; }
   // Note: These are not "=0" methods, because then the subclass would have to
   // implement (and abort inside) empty methods if it doesn't need them
 
+  virtual void resetObjs() { CkAbort("LevelLogic::resetObjs not implemented\n"); }
   /// deposit stats msg received from a child
   virtual void depositStats(TreeLBMessage* stats) { stats_msgs.push_back(stats); }
 
@@ -102,6 +114,18 @@ class LevelLogic
     CkAbort("LevelLogic::processDecision not implemented\n");
   }
 
+  PUPable_decl(LevelLogic);
+  // Migration constructor; zeroes the counters like the default constructor does.
+  LevelLogic(CkMigrateMessage *m) : PUP::able(m) { num_stats_msgs = 0; num_strategies = 0; }
+  // Pups the PUP::able base and num_stats_msgs (refreshed from stats_msgs when packing).
+  virtual void pup(PUP::er& p) {
+    PUP::able::pup(p);
+    if (p.isPacking()) {
+      CkPrintf("[PE %d] PUPPING LevelLogic with %d stats and %d strategies\n", CkMyPe(), (int)stats_msgs.size(), num_strategies);
+      num_stats_msgs = stats_msgs.size();
+    }
+    p|num_stats_msgs;
+  }
   virtual bool makesTokens() { return false; }
 
   /// return nominal load that is being transferred in the tokens
@@ -124,7 +148,8 @@ class LevelLogic
   }
 
  protected:
-  std::vector<TreeLBMessage*> stats_msgs;
+  unsigned int num_stats_msgs;
+  int num_strategies;
 };
 
 class LBTreeBuilder;
@@ -139,10 +164,21 @@ class TreeLB : public CBase_TreeLB
   {
     loadConfigFile(opts);
     init(opts);
+#if CMK_SHRINK_EXPAND
+	  manager_init();
+#endif
+  }
+
+  TreeLB(CkMigrateMessage* m) : CBase_TreeLB(m) 
+  {
+#if CMK_SHRINK_EXPAND
+    CkPrintf("TREELB MIGRATION constructor ON PE %d\n", CkMyPe());
+#endif
   }
-  TreeLB(CkMigrateMessage* m) : CBase_TreeLB(m) {}
+
   virtual ~TreeLB();
 
+  void expand_init();
   void pup(PUP::er& p);
 
   void loadConfigFile(const CkLBOptions& opts);
@@ -153,7 +189,7 @@ class TreeLB : public CBase_TreeLB
 
   // start load balancing (non-AtSync mode)  NOTE: This seems to do a broadcast
   // (is this the behavior we want?)
-  inline void StartLB() { thisProxy.ProcessAtSync(); }
+  void StartLB();
 
   // TODO: I would rename this group of functions (to maybe something like startLBLocal)
   // since they are also used in non-AtSync mode
@@ -163,6 +199,8 @@ class TreeLB : public CBase_TreeLB
                          // output look funny
                          // TODO: do we still need this?
 
+
+
   // send stats up using the comm-tree for this level
   void sendStatsUp(CkMessage* stats);
 
@@ -180,6 +218,20 @@ class TreeLB : public CBase_TreeLB
 
   void reportLbTime(double* times, int n);
 
+  void resumeFromReallocCheckpoint();
+
+  void lb_done_impl();
+
+  void startCleanup();
+  void CallLB();
+  void CheckForLB();
+
+  void checkForRealloc();
+
+  void willIbekilled(std::vector<char> avail, int newnumProcessAfterRestart);
+  void restartFromSE(bool rateAware);
+  void collectSpeeds(int pe_id, float speed);
+
  private:
   void init(const CkLBOptions& opts);
 
@@ -187,6 +239,7 @@ class TreeLB : public CBase_TreeLB
   void receiveStats(TreeLBMessage* stats, int level);
 
   void loadBalanceSubtree(int level);
+  void setupForProcessing(int level);
 
   // receive lb decision from parent (decision could be empty -do nothing-)
   // a non-empty decision implies load is moved from one subtree to another subtree
@@ -211,6 +264,7 @@ class TreeLB : public CBase_TreeLB
   // load can be actual objects or tokens
   inline bool checkLoadReceived(int level)
   {
+    //if (_lb_args.debug() > 2) CkPrintf("[PE %d] TreeLB::checkLoadReceived at level %d: received=%d expected=%d\n", CkMyPe(), level, load_received[level], expected_incoming[level]);
     if (load_received[level] == expected_incoming[level])
     {
       load_received[level] = expected_incoming[level] = 0;
@@ -229,6 +283,8 @@ class TreeLB : public CBase_TreeLB
 
   uint8_t numLevels = 0;  // total number of tree levels (this chare won't necessarily
                           // participate in all levels)
+
+  bool thisPeNew = false; // true if this PE is new after a shrink/expand operation
   std::vector<LevelLogic*> logic;  // level -> my logic object at this level
   std::vector<int>
       comm_parent;  // level -> my parent PE in comm-tree connecting level to level+1
diff --git a/src/ck-ldb/TreeLevel.h b/src/ck-ldb/TreeLevel.h
index db41872f74..1e1be9396f 100644
--- a/src/ck-ldb/TreeLevel.h
+++ b/src/ck-ldb/TreeLevel.h
@@ -9,6 +9,7 @@
 #include "TreeStrategyFactory.h"
 #include <cmath>
 #include <limits>  // std::numeric_limits
+#include <algorithm>
 
 #define FLOAT_TO_INT_MULT 10000
 
@@ -43,6 +44,32 @@ class LBStatsMsg_1 : public TreeLBMessage, public CMessage_LBStatsMsg_1
                   // considered to have ID i
   unsigned int*
       order;  // list of obj ids sorted by load (ids are determined by position in oloads)
+      
+  void pup(PUP::er& p)
+  {
+    
+    p|nObjs;
+    p|nPes;
+
+
+    for (int i = 0; i < nPes; i++)
+      p|pe_ids[i];
+    for (int i = 0; i < nPes; i++)
+      p|bgloads[i];
+    for (int i = 0; i < nPes; i++)
+      p|speeds[i];
+    for (int i = 0; i < nPes + 1; i++)
+      p|obj_start[i];
+    for (int i = 0; i < nObjs; i++)
+      p|oloads[i];
+    for (int i = 0; i < nObjs; i++)
+      p|order[i];
+
+
+    CkPrintf("[PE %d] Done PUPPING LBStatsMsg_1 with %d objs and %d pes\n", CkMyPe(), nObjs, nPes);
+
+  }
+
 
   static TreeLBMessage* merge(std::vector<TreeLBMessage*>& msgs)
   {
@@ -101,28 +128,33 @@ class LBStatsMsg_1 : public TreeLBMessage, public CMessage_LBStatsMsg_1
     int pe_cnt = 0;
     int obj_cnt = 0;
     float total_load = 0;
-    for (int i = 0; i < msgs.size(); i++)
+    
+    if (msgs.size() != 1) {  // the code below reads only msgs[0]
+      CkAbort("[PE %d] LBStatsMsg_1::fill should only have one msg, has %d\n", CkMyPe(), (int)msgs.size());
+    }
+    LBStatsMsg_1* msg = (LBStatsMsg_1*)msgs[0];
+    //if (_lb_args.debug() > 1)CkPrintf("[PE %d]   msg %d with %d pes and %d objs\n", CkMyPe(), 0, msg->nPes, msg->nObjs);
+    for (int j = 0; j < msg->nPes; j++)
     {
-      LBStatsMsg_1* msg = (LBStatsMsg_1*)msgs[i];
-      for (int j = 0; j < msg->nPes; j++)
+      int pe = msg->pe_ids[j];
+      CkAssert(pe >= 0 && pe < CkNumPes());
+      //if (_lb_args.debug() > 2) CkPrintf("[PE %d]   filling pe %d with %d objs\n", CkMyPe(), pe,
+      //          msg->obj_start[j + 1] - msg->obj_start[j]);
+      procs[pe_cnt].populate(pe, msg->bgloads + j, msg->speeds + j);
+      procs[pe_cnt++].resetLoad();
+      migMsg->obj_start[pe] = obj_cnt;
+      int local_id = 0;
+      for (int k = msg->obj_start[j]; k < msg->obj_start[j + 1];
+            k++, obj_cnt++, local_id++)
       {
-        int pe = msg->pe_ids[j];
-        CkAssert(pe >= 0 && pe < CkNumPes());
-        procs[pe_cnt].populate(pe, msg->bgloads + j, msg->speeds + j);
-        procs[pe_cnt++].resetLoad();
-        migMsg->obj_start[pe] = obj_cnt;
-        int local_id = 0;
-        for (int k = msg->obj_start[j]; k < msg->obj_start[j + 1];
-             k++, obj_cnt++, local_id++)
-        {
-          objs[obj_cnt].populate(obj_cnt, msg->oloads + k, pe);
-          total_load += objs[obj_cnt].getLoad();
-          migMsg->to_pes[obj_cnt] = pe;
-          // if obj_local_ids.size() > 0:
-          obj_local_ids[obj_cnt] = local_id;
-        }
+        objs[obj_cnt].populate(obj_cnt, msg->oloads + k, pe);
+        total_load += objs[obj_cnt].getLoad();
+        migMsg->to_pes[obj_cnt] = pe;
+        // if obj_local_ids.size() > 0:
+        obj_local_ids[obj_cnt] = local_id;
       }
     }
+    
     CkAssert(obj_cnt == objs.size());
     CkAssert(pe_cnt == procs.size());
     return total_load;
@@ -175,6 +207,9 @@ class IStrategyWrapper
   virtual void removeObj(int& local_id, int& oldPe, float& load) = 0;
 
   virtual void addForeignObject(int local_id, int oldPe, float load) = 0;
+
+  virtual void pup(PUP::er& p) { CkAbort("IStrategyWrapper::pup not implemented\n");
+    } // TODO: pup correctly
 };
 
 // This wrapper allocates mem for objects and processors. to the lb algorithm,
@@ -374,6 +409,35 @@ class StrategyWrapper : public IStrategyWrapper
     sol->setErrorChecking(objs, procs);
 #endif
 
+#if CMK_SHRINK_EXPAND
+    if (se_avail_vector != NULL) {
+      if (_lb_args.debug() > 0) CkPrintf("se_avail_vector is not null on pe %d, removing procs that will be removed\n", CkMyPe());
+      // if shrink/expand is enabled, remove processors that will be removed (this happens at shrink, before the checkpoint)
+      std::vector<P> procs2;
+      for (const auto& p : procs) {
+        if (se_avail_vector[p.id] != 0) procs2.push_back(p);
+      }
+      procs = procs2; 
+    } 
+#endif
+    if (_lb_args.debug() > 0){
+      CkPrintf("[PE %d] Procs are : ", CkMyPe());
+      for (const auto& p : procs) {
+        CkPrintf("%d ", p.id);
+      }
+      CkPrintf("\n");
+      CkPrintf("[PE %d] Objs per PE: ", CkMyPe());
+      std::map<int, int> counts;
+      for (const auto& o : objs) {
+        counts[o.oldPe]++;
+      }
+      for (const auto& p : procs) {
+        CkPrintf("%d:%d ", p.id, counts[p.id]);
+      }
+      CkPrintf("\n");
+    }
+
+
     double t0 = CkWallTimer();
     strategy->solve(objs, procs, *sol, false);
 
@@ -455,13 +519,132 @@ class StrategyWrapper : public IStrategyWrapper
 class RootLevel : public LevelLogic
 {
  public:
-  RootLevel(int _num_groups = -1) : num_groups(_num_groups) {}
-
+  RootLevel(int _num_groups = -1) : num_groups(_num_groups) {
+    nPes = CkNumPes();
+  }
   virtual ~RootLevel()
   {
     for (auto w : wrappers) delete w;
   }
+  virtual int getNumNewPes() { return num_new_pes; }
+
+  // Stores the speed reported by proc_id in the held stats message and counts
+  // down num_new_pes. Returns true when that counter reaches zero; returns false
+  // while it is non-zero or whenever the level is not rate-aware.
+  virtual bool collectSpeeds(int proc_id, float speed) {
+    if (!rateAware) return false;
+
+    CkPrintf("[PE %d] RootLevel::collectSpeeds proc_id=%d speed=%f\n", CkMyPe(), proc_id, speed);
+    LBStatsMsg_1* stats = (LBStatsMsg_1*)stats_msgs[0];
+    for (int slot = 0; slot < stats->nPes; slot++) {
+      if (stats->pe_ids[slot] == proc_id) stats->speeds[slot] = speed;
+    }
+
+    // One fewer report outstanding; keep waiting unless this was the last one.
+    --num_new_pes;
+    if (num_new_pes != 0) return false;
+
+    // Dump the message contents when debugging on PE 0.
+    if (_lb_args.debug() > 0 && CkMyPe() == 0) {
+      CkPrintf("After speeds collected: My stats message on PE 0: %d\n", stats->nObjs);
+      for (int slot = 0; slot < stats->nPes; slot++) {
+        CkPrintf("  pe %d: id=%d bgload=%f speed=%f obj_start=%d\n", slot,
+                 stats->pe_ids[slot], stats->bgloads[slot], stats->speeds[slot],
+                 stats->obj_start[slot]);
+      }
+    }
+
+    // all new pes have reported their speed, can run lb now
+    return true;
+  }
+  
+  virtual TreeLBMessage* mergeStats()
+  {
+    // send obj loads up
+    TreeLBMessage* newMsg = LBStatsMsg_1::merge(stats_msgs);
+    // need to cast pointer to ensure delete of CMessage_LBStatsMsg_1 is called
+    for (auto m : stats_msgs) delete (LBStatsMsg_1*)m;
+    stats_msgs.clear();
+    stats_msgs.push_back(newMsg);
+    return newMsg;
+  }
+
+  PUPable_decl(RootLevel);
+  RootLevel(CkMigrateMessage *m) : LevelLogic(m) {}
+
+  virtual void pup(PUP::er& p)
+  {
+    if (_lb_args.debug() > 2) CkPrintf("[PE %d] PUPPING RootLevel\n", CkMyPe());
+    LevelLogic::pup(p); // this packs num_stats_msgs
+
+    if (num_stats_msgs > 1) CkAbort("RootLevel should have just one stats message! Has %d\n", num_stats_msgs);
+    p|nObjs;
+    p|nPes;
+    int nNewPes = CkNumPes();
+
+    if (_lb_args.debug() > 2) CkPrintf("[PE %d] Done with basics\n", CkMyPe());
+
+
+    // stats_msgs stuff is only relevant for expand
+    if (p.isUnpacking()) {
+      stats_msgs.resize(1);
+      LBStatsMsg_1* msg;
+      if (rateAware)
+        msg= new (nNewPes, nNewPes, nNewPes, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1;
+      else
+        msg= new (nNewPes, nNewPes, 0, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1;
+      stats_msgs[0] = msg;
+    }
 
+    if (stats_msgs.size() == 0) stats_msgs.push_back(new (nNewPes, nNewPes, nNewPes, nNewPes + 1, nObjs, nObjs, 0) LBStatsMsg_1);
+    p|*stats_msgs[0]; // everyone needs to pup this cause pup is dumb
+
+    if (p.isUnpacking() && CkMyPe() == 0 && num_stats_msgs > 0) {
+      // if num_stats_msgs = 0, then we aren't expanding 
+      LBStatsMsg_1* msg = (LBStatsMsg_1*)stats_msgs[0];
+      if (nObjs != msg->nObjs) {
+        CkAbort("In RootLevel::pup, nObjs (%d) != msg->nObjs (%d)\n", nObjs, msg->nObjs);
+      }
+
+      msg->nPes = CkNumPes();
+
+      // TODO: this will not work if we do simultaneous shrink/expand
+      num_new_pes = 0;
+      for (int i = nPes; i < CkNumPes(); i++)
+      {
+        // on expand: need to reset the new PEs info
+        if (msg->pe_ids[i] > CkNumPes() - 1 || msg->pe_ids[i] <= 0) {
+          // you are a new pe!
+          if (_lb_args.debug() > 0) CkPrintf("[PE %d] RootLevel::pup PE %d is new, resetting its info\n", CkMyPe(), i);
+          num_new_pes++;
+          msg->pe_ids[i] = i;
+          msg->bgloads[i] = 0;
+         // msg->speeds[i] = 1.0; // speeds need to be recomputed for the new procs and sent back to the root
+          msg->obj_start[i+1] = msg->obj_start[i]; // no objects
+        }
+      }
+
+      if (_lb_args.debug() > 0){
+      if (CkMyPe() == 0) {
+        CkPrintf("My stats message on PE 0: %d\n", msg->nObjs);
+        for (int i = 0; i < msg->nPes; i++) {
+          CkPrintf("  pe %d: id=%d bgload=%f speed=%f obj_start=%d\n", i, msg->pe_ids[i], msg->bgloads[i], msg->speeds[i], msg->obj_start[i]);
+        }
+
+      }
+    }
+  }
+    if (num_stats_msgs == 0) {
+      stats_msgs.clear();
+    }
+    
+
+    nPes = nNewPes;
+    num_stats_msgs = stats_msgs.size();
+
+  }
+
+  
   /**
    * mode 0: receive obj stats
    * mode 1: receive aggregated group load
@@ -470,7 +653,10 @@ class RootLevel : public LevelLogic
                          json& config, bool repeat_strategies = false,
                          bool token_passing = true)
   {
+
     using namespace TreeStrategy;
+    this->rateAware = rateAware;
+    this->strategies = strategies;
     for (auto w : wrappers) delete w;
     wrappers.clear();
     if (num_groups == -1)
@@ -506,7 +692,6 @@ class RootLevel : public LevelLogic
     }
     else
     {
-      nPes += ((LBStatsMsg_1*)stats)->nPes;
       nObjs += ((LBStatsMsg_1*)stats)->nObjs;
     }
   }
@@ -518,6 +703,9 @@ class RootLevel : public LevelLogic
 #endif
 
     const int num_children = stats_msgs.size();
+    if (num_children != 1) {  // exactly one stats message is expected here
+      CkAbort("RootLevel::loadBalance: expected just one stats message (merged) but received from %d\n", num_children);
+    }
     CkAssert(num_children > 0);
 #if DEBUG__TREE_LB_L1
     CkPrintf("[%d] RootLevel::loadBalance, num_children=%d nPes=%d nObjs=%d\n", CkMyPe(),
@@ -527,7 +715,8 @@ class RootLevel : public LevelLogic
     if (num_groups == -1)
     {
       // msg has object loads
-      CkAssert(wrappers.size() > current_strategy);
+      if (wrappers.size() == 0)
+        CkAbort("No strategies configured for TreeLB with obj-based strategies\n");
       IStrategyWrapper* wrapper = wrappers[current_strategy];
       CkAssert(wrapper != nullptr);
       CkAssert(nPes == CkNumPes());
@@ -538,6 +727,9 @@ class RootLevel : public LevelLogic
 #if DEBUG__TREE_LB_L1
       double t0 = CkWallTimer();
 #endif
+      if (nPes != CkNumPes()) {
+        CkAbort("nPes (%d) != CkNumPes() (%d) in RootLevel::loadBalance\n", nPes, CkNumPes());
+      }
       wrapper->prepStrategy(nObjs, nPes, stats_msgs, migMsg);
       wrapper->runStrategy(migMsg);
       if (current_strategy == wrappers.size() - 1)
@@ -555,7 +747,7 @@ class RootLevel : public LevelLogic
       // need to cast pointer to ensure delete of CMessage_LBStatsMsg_1 is called
       for (auto msg : stats_msgs) delete (LBStatsMsg_1*)msg;
       stats_msgs.clear();
-      nPes = nObjs = 0;
+      nObjs = 0;
       return migMsg;
     }
     else
@@ -624,6 +816,7 @@ class RootLevel : public LevelLogic
       }
 
       total_load = 0.0;
+      nObjs = 0; // cleanup for next round
 
       int nmoves = int(solution.size());
       SubtreeMigrateDecisionMsg* migMsg =
@@ -652,6 +845,7 @@ class RootLevel : public LevelLogic
     int load;
   };
 
+  int num_new_pes = 0; // number of new pes on expand
   int num_groups;
   bool repeat_strategies;
   size_t current_strategy = 0;
@@ -660,6 +854,8 @@ class RootLevel : public LevelLogic
   unsigned int nObjs = 0;  // total number of objects in msgs I am processing
   float total_load = 0;
   std::vector<IStrategyWrapper*> wrappers;
+  bool rateAware = false;  // defaulted so pup() never branches on an indeterminate value
+  std::vector<std::string> strategies;
 };
 
 // ---------------- NodeSetLevel ----------------
@@ -1055,7 +1251,11 @@ class PELevel : public LevelLogic
   {
     inline bool operator()(const LDObjData& o1, const LDObjData& o2) const
     {
+#if CMK_CUDA || CMK_HIP
+      return (o1.gpuTime > o2.gpuTime);
+#else
       return (o1.wallTime > o2.wallTime);
+#endif
     }
   };
 
@@ -1063,10 +1263,56 @@ class PELevel : public LevelLogic
 
   virtual ~PELevel() {}
 
+  PUPable_decl(PELevel);
+  PELevel(CkMigrateMessage *m) : LevelLogic(m) {}
+
+  // Pups nObjs and the current PE count. On unpacking, a PE whose rank is not
+  // below the pupped PE count clears its object list.
+  virtual void pup(PUP::er& p)
+  {
+    LevelLogic::pup(p); // this packs num_stats_msgs
+    p|nObjs;
+
+    // Always initialized so a pass that is neither packing nor unpacking never
+    // pups an indeterminate int; unpacking overwrites it with the stored count.
+    int nPes = CkNumPes();
+    p|nPes;
+
+    if (p.isUnpacking() && CkMyPe() >= nPes) {
+      // rank lies outside the PE range recorded in the pupped state
+      myObjs.clear();
+      nObjs = 0;
+    }
+
+    num_stats_msgs = 0;
+  }
+
+  virtual void resetObjs() {
+      int nobjs = lbmgr->GetObjDataSz();
+
+    std::vector<LDObjData> allLocalObjs(nobjs);
+    if (nobjs > 0) lbmgr->GetObjData(allLocalObjs.data());  // populate allLocalObjs
+
+    myObjs.clear();
+    nObjs = 0;
+
+    for (int i = 0; i < nobjs; i++)
+    {
+      if (allLocalObjs[i].migratable)
+      {
+     
+        myObjs.emplace_back(allLocalObjs[i]);
+       nObjs++;
+     
+      }
+    }
+  }
+
   virtual TreeLBMessage* getStats()
   {
     const int mype = CkMyPe();
     int nobjs = lbmgr->GetObjDataSz();
+
     std::vector<LDObjData> allLocalObjs(nobjs);
     if (nobjs > 0) lbmgr->GetObjData(allLocalObjs.data());  // populate allLocalObjs
     myObjs.clear();
@@ -1079,17 +1325,25 @@ class PELevel : public LevelLogic
       }
       else
       {
+        #if CMK_CUDA || CMK_HIP
+        nonMigratableLoad += allLocalObjs[i].gpuTime;
+        #else
         nonMigratableLoad += allLocalObjs[i].wallTime;
+        #endif
       }
     }
     nobjs = myObjs.size();
-
+    nObjs = nobjs;
     // TODO verify that non-migratable objects are not added to msg and are only counted
     // as background load
 
 #if DEBUG__TREE_LB_L3
     float total_obj_load = 0;
+    #if CMK_CUDA || CMK_HIP
+    for (int i = 0; i < nobjs; i++) total_obj_load += myObjs[i].gpuTime;
+    #else
     for (int i = 0; i < nobjs; i++) total_obj_load += myObjs[i].wallTime;
+    #endif
     CkPrintf("[%d] PELevel::getStats, myObjs=%d, aggregate_obj_load=%f\n", mype,
              int(myObjs.size()), total_obj_load);
 #endif
@@ -1106,7 +1360,14 @@ class PELevel : public LevelLogic
     if (rateAware)
     {
       msg = new (1, 1, 1, 2, nobjs, nobjs, 0) LBStatsMsg_1;
+#if CMK_CUDA || CMK_HIP
+      msg->speeds[0] = float(lbmgr->ProcessorGPUSpeed());
+#else
       msg->speeds[0] = float(lbmgr->ProcessorSpeed());
+#endif
+
+    if (_lb_args.debug() > 1)
+        CkPrintf("[%d] PELevel: processor speed is %f\n", mype, msg->speeds[0]);
     }
     else
       msg = new (1, 1, 0, 2, nobjs, nobjs, 0) LBStatsMsg_1;
@@ -1119,15 +1380,23 @@ class PELevel : public LevelLogic
     {
       // If rateAware, convert object loads by multiplying by processor speed
       // Note this conversion isn't done for bgloads because they never leave the PE
+      
+#if CMK_CUDA || CMK_HIP
+      float oload = float(myObjs[i].gpuTime);
+#else
+      float oload = float(myObjs[i].wallTime);
+#endif
       if (rateAware)
-        msg->oloads[i] = float(myObjs[i].wallTime) * msg->speeds[0];
+        msg->oloads[i] = oload * msg->speeds[0];
       else
-        msg->oloads[i] = float(myObjs[i].wallTime);
+        msg->oloads[i] = oload;
       msg->order[i] = i;
     }
 
     LBRealType t1, t2, t3, bg_walltime;
-#if CMK_LB_CPUTIMER
+#if CMK_CUDA || CMK_HIP
+    lbmgr->GetGPUBGTime(&bg_walltime);
+#elif CMK_LB_CPUTIMER
     LBRealType t4;
     lbmgr->GetTime(&t1, &t2, &t3, &bg_walltime, &t4);
 #else
@@ -1153,18 +1422,22 @@ class PELevel : public LevelLogic
     outgoing = 0;
     int obj_start = decision->obj_start[mype];
     int obj_end = obj_start + int(myObjs.size());
+    assert(myObjs.size() == static_cast<size_t>(nObjs));  // cached count must match the object list
     int j = 0;
+   
     for (int i = obj_start; i < obj_end; i++, j++)
     {
+      //if (_lb_args.debug() > 2) CkPrintf("[%d] PELevel: obj %d (abs=%d, handle=%d) to dest %d\n", CkMyPe(), j, i,
+      //        myObjs[j].handle.handle, decision->to_pes[i]);
       int dest = decision->to_pes[i];
+      if (dest > CkNumPes() - 1)
+        CkAbort("PELevel: processDecision found dest PE >= CkNumPes(): %d >= %d\n", dest, CkNumPes());
       if (dest != mype)
       {
         if (dest >= 0)
         {
-#if DEBUG__TREE_LB_L3
-          CkPrintf("[%d] (processDecision) My obj %d (abs=%d) moving to %d\n", CkMyPe(),
-                   j, i, dest);
-#endif
+          //if (_lb_args.debug() > 1) CkPrintf("[%d] (processDecision) My obj %d (abs=%d, handle=%d) moving to %d\n", CkMyPe(),
+          //         j, i, myObjs[j].handle.handle, dest);
           if (lbmgr->Migrate(myObjs[j].handle, dest) == 0)
           {
             CkAbort("PELevel: Migrate call returned 0\n");
@@ -1210,6 +1483,7 @@ class PELevel : public LevelLogic
   LBManager* lbmgr;
   bool rateAware;
   std::vector<LDObjData> myObjs;
+  int nObjs = 0;
 };
 
 // ---------------- MsgAggregator ----------------
diff --git a/src/ck-ldb/TreeStrategyBase.h b/src/ck-ldb/TreeStrategyBase.h
index 57926535d6..be91b37b93 100644
--- a/src/ck-ldb/TreeStrategyBase.h
+++ b/src/ck-ldb/TreeStrategyBase.h
@@ -172,7 +172,7 @@ class Proc<N, false, multi>
 {
  public:
   int id = -1;
-
+  float speed[N] = {1.0};
   inline void populate(int _id, float* _bgload, float* _speed)
   {
     id = _id;
diff --git a/src/ck-ldb/ckgraph.C b/src/ck-ldb/ckgraph.C
index 326e6eb39c..bb9eea089e 100644
--- a/src/ck-ldb/ckgraph.C
+++ b/src/ck-ldb/ckgraph.C
@@ -15,23 +15,40 @@
 
 ProcArray::ProcArray(BaseLB::LDStats *stats) {
   const int numPes = stats->procs.size();
+
   // fill the processor array
   procs.resize(numPes);
+  availPeMap.resize(numPes);
+  std::fill(availPeMap.data(), availPeMap.data() + numPes, -1);
 
   // Loop through the LDStats structure, copying data into this array and calculating
   //   the average 'totalLoad' of all the PEs
+  availProcSize = 0;
   avgLoad = 0.0;
+  int currAvailPe = 0;
   for(int pe = 0; pe < numPes; pe++) {
     procs[pe].id        = stats->procs[pe].pe;
     procs[pe].setOverhead(stats->procs[pe].bg_walltime);
     procs[pe].setTotalLoad(stats->procs[pe].total_walltime - stats->procs[pe].idletime);
     procs[pe].available = stats->procs[pe].available;
+    //CkPrintf("%i avail = %d\n", pe, procs[pe].available);
+    availProcSize += (procs[pe].available ? 1 : 0);
     avgLoad += procs[pe].getTotalLoad();
+    // availPeMap[k] is the index of the k-th available PE: record this PE only
+    // if it is available, so the map stays a compact list of usable PEs.
+    if (procs[pe].available)
+      availPeMap[currAvailPe++] = pe;
 //		CkPrintf("PE%d overhead:%f totalLoad:%f \n",pe,procs[pe].overhead(),procs[pe].totalLoad());
   }
+  availPeMap.resize(availProcSize);
   avgLoad /= numPes;
 }
 
+void ProcArray::reassignPeMapToAvailable(std::vector<int32_t> &pemap) {
+  for (int i = 0; i < pemap.size(); i++)
+    pemap[i] = availPeMap[pemap[i]];
+}
+
 void ProcArray::resetTotalLoad() {
   for(int pe = 0; pe < procs.size(); pe++)
     procs[pe].setTotalLoad(procs[pe].getOverhead());
diff --git a/src/ck-ldb/ckgraph.h b/src/ck-ldb/ckgraph.h
index 8b02e478f1..8ac427b39c 100644
--- a/src/ck-ldb/ckgraph.h
+++ b/src/ck-ldb/ckgraph.h
@@ -17,6 +17,9 @@
 #include "BaseLB.h"
 #include <vector>
 
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
 class ProcInfo
 {
   friend class ProcArray;
@@ -56,9 +59,12 @@ class ProcArray
   ProcArray(BaseLB::LDStats* stats);
   double getAverageLoad() const { return avgLoad; }
   void resetTotalLoad();
+  void reassignPeMapToAvailable(std::vector<int32_t> &pemap);
 
   // vector containing the list of processors
   std::vector<ProcInfo> procs;
+  std::vector<int> availPeMap;
+  int availProcSize;
 
 protected:
   double avgLoad;
@@ -143,7 +149,7 @@ class CkVertex
   }
 
   int getVertexId() const { return id; }
-  double getVertexLoad() const { return compLoad; }
+  double getVertexLoad() const { return MAX(compLoad, 0.1); }
   int getCurrentPe() const { return currPe; }
   int getNewPe() const { return newPe; }
   void setNewPe(int _newpe) { newPe = _newpe; }
diff --git a/src/ck-ldb/greedy.h b/src/ck-ldb/greedy.h
index c87599859c..995becb1e2 100644
--- a/src/ck-ldb/greedy.h
+++ b/src/ck-ldb/greedy.h
@@ -83,7 +83,7 @@ class GreedyRefine : public Strategy<O, P, S>
       // TODO improve the case where the proc is not in my list of processors (because
       // it belongs to a foreing domain). procHeap API should return an error?
       P& oldPe = procHeap.getProc(ptr(o)->oldPe);
-      if ((oldPe.id >= 0) && (oldPe.getLoad() + ptr(o)->getLoad() <= M))
+      if ((oldPe.id >= 0) && (oldPe.getLoad() + (ptr(o)->getLoad() / oldPe.speed[0]) <= M))
         p = oldPe;
       else
         p = procHeap.top();
diff --git a/src/ck-ldb/lbdb.h b/src/ck-ldb/lbdb.h
index 12f330eddd..22ef1dfb73 100644
--- a/src/ck-ldb/lbdb.h
+++ b/src/ck-ldb/lbdb.h
@@ -157,6 +157,9 @@ class LBObjUserData {
 struct LDObjData {
   LDObjHandle handle;
   LBRealType wallTime;
+#if CMK_CUDA || CMK_HIP
+  LBRealType gpuTime;
+#endif
 #if CMK_LB_CPUTIMER
   LBRealType cpuTime;
 #endif
@@ -171,6 +174,9 @@ struct LDObjData {
   // An encoded approximation of the amount of data the object would pack;
   // call pup_decodeSize(pupSize) to get the actual approximate value
   CmiUInt2 pupSize;
+#if CMK_CUDA || CMK_HIP
+  size_t gpuPupSize;
+#endif
   inline const LDOMHandle &omHandle() const { return handle.omhandle; }
   inline const LDOMid &omID() const { return handle.omhandle.id; }
   inline const CmiUInt8 &objID() const { return handle.id; }
@@ -333,6 +339,9 @@ inline void LBObjUserData::pup(PUP::er &p) {
 inline void LDObjData::pup(PUP::er &p) {
   p|handle;
   p|wallTime;
+#if CMK_CUDA || CMK_HIP
+  p|gpuTime;
+#endif
 #if CMK_LB_CPUTIMER
   p|cpuTime;
 #endif
@@ -348,6 +357,9 @@ inline void LDObjData::pup(PUP::er &p) {
   }
 #endif
   p|pupSize;
+#if CMK_CUDA || CMK_HIP
+  p|gpuPupSize;
+#endif
 }
 
 inline bool LDCommDesc::operator==(const LDCommDesc &obj) const {
diff --git a/src/ck-ldb/manager.C b/src/ck-ldb/manager.C
index 9ef1111bee..345a2e0abe 100644
--- a/src/ck-ldb/manager.C
+++ b/src/ck-ldb/manager.C
@@ -13,38 +13,152 @@
 #include "converse.h"
 #include "conv-ccs.h"
 
+
 #if CMK_SHRINK_EXPAND
 realloc_state pending_realloc_state;
 char * se_avail_vector;
-extern "C" int numProcessAfterRestart;
+int numProcessAfterRestart;
 extern "C" CcsDelayedReply shrinkExpandreplyToken;
 extern "C" char willContinue;
 char willContinue;
 #endif
-extern int load_balancer_created;
+bool load_balancer_created;
+
+void realloc(char* reallocMsg)
+{
+#if CMK_SHRINK_EXPAND
+    numProcessAfterRestart = *((int *)reallocMsg);
+    reallocMsg += sizeof(int);
+    int numBits = *((int *)reallocMsg);
+    reallocMsg += sizeof(int);
+
+    CkPrintf("Charm> numProcessAfterRestart = %d, numBits = %d\n", numProcessAfterRestart, numBits);
+
+    if (LBManagerObj()->lb_in_progress)
+    {
+        CkPrintf("Charm> Rescaling called while load balancing is in progress!\n");
+        LBManagerObj()->bufferRealloc(reallocMsg - 2 * sizeof(int));
+    }
+    else
+    {
+        //if (numProcessAfterRestart > CkNumPes())
+        //    pending_realloc_state = EXPAND_MSG_RECEIVED;
+        //else
+        //    pending_realloc_state = SHRINK_MSG_RECEIVED;
+
+        char* old_bitmap = (char *)malloc(sizeof(char) * CkNumPes());
+        LBManagerObj()->get_avail_vector(old_bitmap);
+
+        char* new_bitmap = (char *)malloc(sizeof(char) * CkNumPes());
+        memcpy(new_bitmap, old_bitmap, sizeof(char) * CkNumPes());
+
+        int last_pe = -1;
+        int j = 0;
+        for (int i = 0; i < numBits; i++)
+        {
+            if (reallocMsg[i] == 0)
+            {
+                while (last_pe < i && j < CkNumPes())
+                    last_pe += old_bitmap[j++];
+                
+                if (last_pe == i)
+                    new_bitmap[j-1] = 0;
+            }
+        }
+
+        for (int i = 0; i < CkNumPes(); i++)
+        {
+            CkPrintf("Charm> before old_bitmap[%d] = %d\n", i, old_bitmap[i]);
+            CkPrintf("Charm> reallocMsg[%d] = %d\n", i, reallocMsg[i]);
+            //new_bitmap[i] = reallocMsg[i] & new_bitmap[i];
+            CkPrintf("Charm> after new_bitmap[%d] = %d\n", i, new_bitmap[i]);
+        }
+
+        if((CkMyPe() == 0) && (load_balancer_created))
+        LBManagerObj()->set_avail_vector(new_bitmap, 0);
+
+        se_avail_vector = (char *)malloc(sizeof(char) * CkNumPes());
+        LBManagerObj()->get_avail_vector(se_avail_vector);
+
+        // now find whether this is shrink/expand
+        pending_realloc_state = NO_REALLOC;
+        for (int i = 0; i < CkNumPes(); i++)
+            if (se_avail_vector[i] == 0)
+            {
+                pending_realloc_state = SHRINK_MSG_RECEIVED;
+                break;
+            }
+
+        if (numProcessAfterRestart > CkNumPes() || (numProcessAfterRestart == CkNumPes() && 
+                pending_realloc_state == SHRINK_MSG_RECEIVED))
+            pending_realloc_state = static_cast<realloc_state>(static_cast<uint8_t>(pending_realloc_state) | 
+                static_cast<uint8_t>(EXPAND_MSG_RECEIVED));
+
+        //free(reallocMsg);
+        free(new_bitmap);
+        free(old_bitmap);
+    }
+#endif
+}
+
 static void handler(char *bit_map)
 {
 #if CMK_SHRINK_EXPAND
+    printf("Charm> Rescaling called!\n");
     shrinkExpandreplyToken = CcsDelayReply();
     bit_map += CmiMsgHeaderSizeBytes;
-    pending_realloc_state = REALLOC_MSG_RECEIVED;
-
-    if((CkMyPe() == 0) && (load_balancer_created))
-    LBManagerObj()->set_avail_vector(bit_map);
-
-    se_avail_vector = (char *)malloc(sizeof(char) * CkNumPes());
-    LBManagerObj()->get_avail_vector(se_avail_vector);
+    realloc(bit_map);
+#endif
+}
 
-    numProcessAfterRestart = *((int *)(bit_map + CkNumPes()));
+static void realloc_handler(char *msg)
+{
+#if CMK_SHRINK_EXPAND
+    printf("Charm> Rescaling called!\n");
+    int myPes = CkNumPes();
+    shrinkExpandreplyToken = CcsDelayReply();
+    msg += CmiMsgHeaderSizeBytes;
+    bool isExpand = (msg[0] != 0);  // normalize the flag byte instead of type-punning it to bool
+    int numPes; memcpy(&numPes, msg + sizeof(bool), sizeof(int));  // int follows the flag byte and may be unaligned: copy it out
+    printf("Charm> realloc_handler: isExpand=%d numPes=%d CkNumPes()=%d\n", isExpand, numPes, CkNumPes());
+    
+    char* bit_map = (char *)malloc(CkNumPes() + 2 * sizeof(int));
+    memcpy(bit_map, &numPes, sizeof(int));
+    memcpy(&bit_map[sizeof(int)], &myPes, sizeof(int));
+    char* start_bitmap = bit_map + 2 * sizeof(int);
+    
+    if (isExpand)
+    {
+        for (int i = 0; i < CkNumPes(); i++) {
+            start_bitmap[i] = 1;
+        }
+    }
+    else
+    {
+        for (int i = 0; i < CkNumPes(); i++) {
+            if (i < numPes)
+                start_bitmap[i] = 1;
+            else
+                start_bitmap[i] = 0;
+        }
+    }
+    
+    realloc(bit_map);
 #endif
 }
 
+void rescale(char* bit_map)
+{
+    realloc(bit_map);
+}
+
 void manager_init(){
 #if CMK_SHRINK_EXPAND
     static int inited = 0;
     willContinue = 0;
     if (inited) return;
     CcsRegisterHandler("set_bitmap", (CmiHandler) handler);
+    CcsRegisterHandler("realloc", (CmiHandler) realloc_handler);
     inited = 1;
     pending_realloc_state = NO_REALLOC;
 #endif
diff --git a/src/ck-ldb/manager.h b/src/ck-ldb/manager.h
index d2ffb69fba..473686ede4 100644
--- a/src/ck-ldb/manager.h
+++ b/src/ck-ldb/manager.h
@@ -13,6 +13,8 @@
 
 void manager_init(void);
 
+void rescale(char* bit_map);
+
 #endif
 
 /*@}*/
diff --git a/src/ck-ldb/refine.h b/src/ck-ldb/refine.h
index b0d24e68ec..b58628ffaa 100644
--- a/src/ck-ldb/refine.h
+++ b/src/ck-ldb/refine.h
@@ -38,6 +38,7 @@ class RefineA : public Strategy<O, P, S>
 
   void solve(std::vector<O>& objs, std::vector<P>& procs, S& solution, bool objsSorted)
   {
+    CkPrintf("Solving with RefineA strategy\n");
     float M = calcGreedyMaxload(objs, procs, objsSorted);
     if (CkMyPe() == 0 && _lb_args.debug() > 0)
       CkPrintf("[%d] RefineA: greedy maxload is %f\n", CkMyPe(), M);
@@ -73,7 +74,6 @@ class RefineA : public Strategy<O, P, S>
     while (reldiff(lower, upper) > 1.01)
     {
       M = (lower + upper) / 2;
-
       solutions.emplace_back(initialAssignment);
       std::unordered_map<int, std::vector<O>> proc_objs(
           proc_objs0);  // real pe -> list of its objects
@@ -104,13 +104,13 @@ class RefineA : public Strategy<O, P, S>
         for (auto it = heavy_objs.begin(); it != heavy_objs.end(); it++)
         {
           O& o = *it;
-          if (lightest.getLoad() + o.getLoad() <= M)
+          if (lightest.getLoad() + (o.getLoad() / lightest.speed[0]) <= M)
           {
             heavy_processors.pop();
-            heavy.load -= o.getLoad();
+            heavy.load -= o.getLoad() / heavy.speed[0];
             for (auto& light : light_processors)
             {
-              if (light.getLoad() + o.getLoad() <= M)
+              if (light.getLoad() + (o.getLoad() / light.speed[0]) <= M)
               {
                 solutions.back().assign(o, light);
                 lightH.remove(light);
@@ -178,6 +178,7 @@ class RefineB : public Strategy<O, P, S>
 
   void solve(std::vector<O>& objs, std::vector<P>& procs, S& solution, bool objsSorted)
   {
+    CkPrintf("Solving with RefineB strategy\n");
     float M = calcGreedyMaxload(objs, procs, objsSorted);
     if (CkMyPe() == 0 && _lb_args.debug() > 0)
       CkPrintf("[%d] RefineB: greedy maxload is %f\n", CkMyPe(), M);
diff --git a/src/ck-perf/trace-projections.h b/src/ck-perf/trace-projections.h
index add502f10c..015078f243 100644
--- a/src/ck-perf/trace-projections.h
+++ b/src/ck-perf/trace-projections.h
@@ -628,6 +628,7 @@ class toProjectionsGZFile : public PUP::er {
   gzFile f;
  protected:
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) {}
   virtual void pup_buffer(void *&p,size_t n,size_t itemSize,dataType t);
   virtual void pup_buffer(void *&p,size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
  public:
diff --git a/src/conv-ccs/ccs-builtins.C b/src/conv-ccs/ccs-builtins.C
index a2059a565b..b519a4ff3e 100644
--- a/src/conv-ccs/ccs-builtins.C
+++ b/src/conv-ccs/ccs-builtins.C
@@ -63,6 +63,14 @@ void CcsImpl_kill(void)
     SOCKET fd=skt_connect(killList->ip,killList->port,20);
     if (fd!=INVALID_SOCKET) {
       skt_sendN(fd,"die\n",strlen("die\n")+1);
+
+      // Set SO_LINGER to ensure the "die" message is sent before we exit.
+      // This forces close() to block until the kernel has transmitted the data.
+      struct linger linger_opt;
+      linger_opt.l_onoff = 1;  // Enable linger
+      linger_opt.l_linger = 5; // Timeout in seconds
+      setsockopt(fd, SOL_SOCKET, SO_LINGER, (const char *)&linger_opt, sizeof(linger_opt));  // cast: Winsock takes const char*
+
       skt_close(fd);
     }
     killList=killList->next;
diff --git a/src/conv-ccs/ccs-builtins.h b/src/conv-ccs/ccs-builtins.h
index f768dda7f3..ee3fee9729 100644
--- a/src/conv-ccs/ccs-builtins.h
+++ b/src/conv-ccs/ccs-builtins.h
@@ -35,6 +35,8 @@ class PUP_fmt : public PUP::wrap_er {
     virtual void comment(const char *message);
     virtual void synchronize(unsigned int m);
     virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t);
+    virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {}
+    virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {}
     virtual void pup_buffer(void *&p,size_t n,size_t itemSize,PUP::dataType t);
     virtual void pup_buffer(void *&p,size_t n, size_t itemSize, PUP::dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 };
diff --git a/src/conv-ccs/conv-ccs.C b/src/conv-ccs/conv-ccs.C
index d191fe35a1..24f4022055 100644
--- a/src/conv-ccs/conv-ccs.C
+++ b/src/conv-ccs/conv-ccs.C
@@ -8,6 +8,7 @@
 #include "ccs-server.h"
 #include "sockRoutines.h"
 #include "queueing.h"
+#include <sys/socket.h>
 
 #ifdef _WIN32
 # include <io.h>
@@ -232,6 +233,35 @@ void CcsSendDelayedReply(CcsDelayedReply d,int replyLen, const void *replyData)
   free(h);
 }
 
+void CcsSendDelayedReplyAndTerm(CcsDelayedReply d, int replyLen, const void *replyData)
+{
+    CcsImplHeader *h = d.hdr;
+    int fd = ChMessageInt(h->replyFd);
+
+    // 1. Send the reply (length in network byte order, then data), same as CcsReply.
+    h->len = ChMessageInt_new(replyLen);
+    skt_sendN(fd, &h->len, sizeof(h->len));
+    if (replyLen > 0) {
+        skt_sendN(fd, replyData, replyLen);
+    }
+
+    // 2. Perform a synchronous close to ensure data is sent before returning.
+    //    shutdown() tells the kernel to send all buffered data, then a FIN packet.
+    shutdown(fd, SHUT_WR);
+
+    // 3. Wait for the peer (charmrun) to close its side. The recv() will block
+    //    until charmrun reads the data and closes its socket, which gives us an
+    //    EOF (recv returns 0). This is our acknowledgment.
+    char dummy_buffer[32];
+    recv(fd, dummy_buffer, sizeof(dummy_buffer), 0);
+
+    // 4. Now that the handshake is complete, we can safely close our end.
+    skt_close(fd);
+
+    // 5. Free the handle resource.
+    free(h);
+}
+
 void CcsNoReply(void)
 {
   if (CpvAccess(ccsReq)==NULL) return;
diff --git a/src/conv-ccs/conv-ccs.h b/src/conv-ccs/conv-ccs.h
index 451c42b31d..1ffd449bd4 100644
--- a/src/conv-ccs/conv-ccs.h
+++ b/src/conv-ccs/conv-ccs.h
@@ -94,6 +94,18 @@ void CcsSendReply(int replyLen, const void *replyData);
 void CcsSendReplyNoError(int replyLen, const void *replyData);
 CcsDelayedReply CcsDelayReply(void);
 void CcsSendDelayedReply(CcsDelayedReply d,int replyLen, const void *replyData);
+
+/**
+ * Send a delayed reply and then perform a synchronous close on the socket.
+ * This function blocks until the peer has acknowledged receipt of the data
+ * by closing its end of the connection. This is intended for final replies
+ * before program exit to prevent race conditions.
+ */
+void CcsSendDelayedReplyAndTerm(CcsDelayedReply d, int replyLen, const void *replyData);
+
+/**
+ Send an empty reply for a request that was previously delayed.
+*/
 void CcsNoReply();
 void CcsNoDelayedReply(CcsDelayedReply d);
 
diff --git a/src/conv-core/conv-config.h b/src/conv-core/conv-config.h
index 51aebc5bf7..cb0af5635b 100644
--- a/src/conv-core/conv-config.h
+++ b/src/conv-core/conv-config.h
@@ -123,6 +123,10 @@
 #define CMK_CUDA                  0
 #endif
 
+#if !defined(CMK_HIP)
+#define CMK_HIP                  0
+#endif
+
 #ifndef CMI_QD
 #define CMI_QD (CMK_REPLAYSYSTEM)
 #endif
diff --git a/src/conv-core/conv-rdma.h b/src/conv-core/conv-rdma.h
index f65e080ad9..efa99d78f0 100644
--- a/src/conv-core/conv-rdma.h
+++ b/src/conv-core/conv-rdma.h
@@ -3,54 +3,62 @@
 
 #include "cmirdmautils.h"
 #include "pup.h"
+#include <functional>
 
-/*********************************** Zerocopy Direct API **********************************/
+// User specified configuration
+// TODO: move to a better location
+extern bool CmiUseCopyBasedRDMA;
+
+// LCI layer definition
+#define CMK_REG_REQUIRED 1
+// 8-byte for mr, 16-byte for rmr
+// TODO: better to use dynamic allocation and PUP
+#define CMK_NOCOPY_DIRECT_BYTES 24
+
+/*********************************** Zerocopy Direct API
+ * **********************************/
 typedef void (*RdmaAckCallerFn)(void *token);
 
 /* Support for Direct API */
 void CmiSetRdmaCommonInfo(void *info, const void *ptr, int size);
 int CmiGetRdmaCommonInfoSize(void);
 
-void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size, unsigned short int mode);
+void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size,
+                          unsigned short int mode);
 
 // Function to set the ack handler for the Direct API
 void CmiSetDirectNcpyAckHandler(RdmaAckCallerFn fn);
 
-/* CmiIssueRget initiates an RDMA read operation, transferring 'size' bytes of data from the address space of 'srcPe' to local address, 'destAddr'.
- * When the runtime invokes srcAck on the source (target), it indicates safety to overwrite or free the srcAddr buffer.
- * When the runtime invokes destAck on the destination (initiator), it indicates that the data has been successfully received in the
- * destAddr buffer.
+/* CmiIssueRget initiates an RDMA read operation, transferring 'size' bytes of
+ * data from the address space of 'srcPe' to local address, 'destAddr'. When the
+ * runtime invokes srcAck on the source (target), it indicates safety to
+ * overwrite or free the srcAddr buffer. When the runtime invokes destAck on the
+ * destination (initiator), it indicates that the data has been successfully
+ * received in the destAddr buffer.
  */
 void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo);
 
-/* CmiIssueRput initiates an RDMA write operation, transferring 'size' bytes of data from the local address, 'srcAddr' to the address space of 'destPe'.
- * When the runtime invokes srcAck on the source (initiator), it indicates safety to overwrite or free the srcAddr buffer.
- * When the runtime invokes destAck on the destination (target), it indicates that the data has been successfully received in the
- * destAddr buffer.
+/* CmiIssueRput initiates an RDMA write operation, transferring 'size' bytes of
+ * data from the local address, 'srcAddr' to the address space of 'destPe'. When
+ * the runtime invokes srcAck on the source (initiator), it indicates safety to
+ * overwrite or free the srcAddr buffer. When the runtime invokes destAck on the
+ * destination (target), it indicates that the data has been successfully
+ * received in the destAddr buffer.
  */
 
 void CmiIssueRput(NcpyOperationInfo *ncpyOpInfo);
 
-void CmiDeregisterMem(const void *ptr, void *info, int pe, unsigned short int mode);
+void CmiDeregisterMem(const void *ptr, void *info, int pe,
+                      unsigned short int mode);
 
 #if CMK_USE_CMA
-void CmiIssueRgetUsingCMA(
-  const void* srcAddr,
-  void *srcInfo,
-  int srcPe,
-  const void* destAddr,
-  void *destInfo,
-  int destPe,
-  size_t size);
-
-void CmiIssueRputUsingCMA(
-  const void* destAddr,
-  void *destInfo,
-  int destPe,
-  const void* srcAddr,
-  void *srcInfo,
-  int srcPe,
-  size_t size);
+void CmiIssueRgetUsingCMA(const void *srcAddr, void *srcInfo, int srcPe,
+                          const void *destAddr, void *destInfo, int destPe,
+                          size_t size);
+
+void CmiIssueRputUsingCMA(const void *destAddr, void *destInfo, int destPe,
+                          const void *srcAddr, void *srcInfo, int srcPe,
+                          size_t size);
 #endif
 
 // Allocation from pool
@@ -82,22 +90,25 @@ void CmiSetNcpyAckSize(int ackSize);
 #endif
 
 // Represents the mode of host-side zerocopy transfer
-// CkNcpyMode::MEMCPY indicates that the PEs are on the logical node and memcpy can be used
-// CkNcpyMode::CMA indicates that the PEs are on the same physical node and CMA can be used
-// CkNcpyMode::RDMA indicates that the neither MEMCPY or CMA can be used and REMOTE Direct Memory Access needs to be used
+// CkNcpyMode::MEMCPY indicates that the PEs are on the same logical node and memcpy can be used.
+// CkNcpyMode::CMA indicates that the PEs are on the same physical node and CMA can be used.
+// CkNcpyMode::RDMA indicates that neither MEMCPY nor CMA can be used and
+// Remote Direct Memory Access needs to be used.
 enum class CmiNcpyMode : char { MEMCPY, CMA, RDMA };
 
-// Represents the completion status of the zerocopy transfer (used as a return value for CkNcpyBuffer::get & CkNcpyBuffer:::put)
-// CMA and MEMCPY transfers complete instantly and return CkNcpyStatus::complete
-// RDMA transfers use a remote asynchronous call and hence return CkNcpyStatus::incomplete
+// Represents the completion status of the zerocopy transfer (used as a return
+// value for CkNcpyBuffer::get & CkNcpyBuffer::put).
+// CMA and MEMCPY transfers complete instantly and return CkNcpyStatus::complete.
+// RDMA transfers use a remote asynchronous call and hence return CkNcpyStatus::incomplete.
 enum class CmiNcpyStatus : char { incomplete, complete };
 
 // Represents the remote handler tag that should be invoked
 // ncpyHandlerIdx::EM_ACK tag is used to remotely invoke CkRdmaEMAckHandler
-// ncpyHandlerIdx::BCAST_ACK tag is used to remotely invoke CkRdmaEMBcastAckHandler
-// ncpyHandlerIdx::BCAST_POST_ACK is used to remotely invoke CkRdmaEMBcastPostAckHandler
-// ncpyHandlerIdx::CMA_DEREG_ACK is used to remotely invoke CkRdmaEMDeregAndAckHandler
-enum class ncpyHandlerIdx: char {
+// ncpyHandlerIdx::BCAST_ACK tag is used to remotely invoke CkRdmaEMBcastAckHandler
+// ncpyHandlerIdx::BCAST_POST_ACK is used to remotely invoke
+// CkRdmaEMBcastPostAckHandler
+// ncpyHandlerIdx::CMA_DEREG_ACK is used to remotely invoke CkRdmaEMDeregAndAckHandler
+enum class ncpyHandlerIdx : char {
   EM_ACK,
   BCAST_ACK,
   BCAST_POST_ACK,
@@ -109,21 +120,20 @@ enum class ncpyHandlerIdx: char {
 
 class CmiNcpyBuffer {
 
-  //private:
-  public:
-
+  // private:
+public:
   // bool to indicate registration for current values of ptr and cnt on pe
   bool isRegistered;
 
-  // machine specific information about the buffer
-  #if defined(__GNUC__) || defined(__clang__)
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wpedantic"
-  #endif
+// machine specific information about the buffer
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
   char layerInfo[CMK_COMMON_NOCOPY_DIRECT_BYTES + CMK_NOCOPY_DIRECT_BYTES];
-  #if defined(__GNUC__) || defined(__clang__)
-  #pragma GCC diagnostic pop
-  #endif
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 
 #if CMK_ERROR_CHECKING
   void checkRegModeIsValid() {
@@ -158,20 +168,38 @@ class CmiNcpyBuffer {
   // ack handling pointer used for bcast and CMA p2p transfers
   const void *refAckInfo;
 
-  CmiNcpyBuffer() : isRegistered(false), ptr(NULL), cnt(0), pe(-1), regMode(CMK_BUFFER_REG), deregMode(CMK_BUFFER_DEREG), ref(NULL), refAckInfo(NULL) {}
+  // ipc specific
+  void* deviceRdmaOpInfo;
 
-  explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_, unsigned short int regMode_=CMK_BUFFER_REG, unsigned short int deregMode_=CMK_BUFFER_DEREG) {
+  CmiNcpyBuffer()
+      : isRegistered(false), ptr(NULL), cnt(0), pe(-1), regMode(CMK_BUFFER_REG),
+        deregMode(CMK_BUFFER_DEREG), ref(NULL), refAckInfo(NULL), deviceRdmaOpInfo(NULL) {}
+
+  explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_,
+                         unsigned short int regMode_ = CMK_BUFFER_REG,
+                         unsigned short int deregMode_ = CMK_BUFFER_DEREG) : deviceRdmaOpInfo(nullptr) {
+    init(ptr_, cnt_, regMode_, deregMode_);
+  }
+
+  explicit CmiNcpyBuffer(const void *ptr_, size_t cnt_, void* deviceRdmaOpInfo_,
+                         unsigned short int regMode_ = CMK_BUFFER_REG,
+                         unsigned short int deregMode_ = CMK_BUFFER_DEREG) : deviceRdmaOpInfo(deviceRdmaOpInfo_) {
     init(ptr_, cnt_, regMode_, deregMode_);
   }
 
   void print() {
-    CmiPrintf("[%d][%d][%d] CmiNcpyBuffer print: ptr:%p, size:%zu, pe:%d, regMode=%d, deregMode=%d, ref:%p, refAckInfo:%p\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), ptr, cnt, pe, regMode, deregMode, ref, refAckInfo);
+    CmiPrintf("[%d][%d][%d] CmiNcpyBuffer print: ptr:%p, size:%zu, pe:%d, "
+              "regMode=%d, deregMode=%d, ref:%p, refAckInfo:%p\n",
+              CmiMyPe(), CmiMyNode(), CmiMyRank(), ptr, cnt, pe, regMode,
+              deregMode, ref, refAckInfo);
   }
 
-  void init(const void *ptr_, size_t cnt_, unsigned short int regMode_=CMK_BUFFER_REG, unsigned short int deregMode_=CMK_BUFFER_DEREG) {
-    ptr  = ptr_;
-    cnt  = cnt_;
-    pe   = CmiMyPe();
+  void init(const void *ptr_, size_t cnt_,
+            unsigned short int regMode_ = CMK_BUFFER_REG,
+            unsigned short int deregMode_ = CMK_BUFFER_DEREG) {
+    ptr = ptr_;
+    cnt = cnt_;
+    pe = CmiMyPe();
     regMode = regMode_;
     deregMode = deregMode_;
 
@@ -189,36 +217,33 @@ class CmiNcpyBuffer {
     refAckInfo = NULL;
 
     // Register memory everytime new values are initialized
-    if(cnt > 0)
+    if (cnt > 0)
       registerMem();
   }
 
-  void setRef(const void *ref_) {
-    ref = ref_;
-  }
+  void setRef(const void *ref_) { ref = ref_; }
 
-  const void *getRef() {
-    return ref;
-  }
+  const void *getRef() { return ref; }
 
   // Register(Pin) the memory for the buffer
-  void registerMem()
-  {
+  void registerMem() {
     // Check that this object is local when registerMem is called
     CmiAssert(CmiNodeOf(pe) == CmiMyNode());
 
     // Set machine layer information when regMode is not CMK_BUFFER_NOREG
-    if(regMode != CMK_BUFFER_NOREG) {
+    if (regMode != CMK_BUFFER_NOREG) {
 
       CmiSetRdmaCommonInfo(&layerInfo[0], ptr, cnt);
 
-      /* Set the pointer layerInfo unconditionally for layers that don't require pinning (MPI, PAMI)
-       * or if regMode is REG, PREREG on layers that require pinning (GNI, Verbs, OFI, UCX) */
+      /* Set the pointer layerInfo unconditionally for layers that don't require
+       * pinning (MPI, PAMI) or if regMode is REG, PREREG on layers that require
+       * pinning (GNI, Verbs, OFI, UCX) */
 #if CMK_REG_REQUIRED
-      if(regMode == CMK_BUFFER_REG || regMode == CMK_BUFFER_PREREG)
+      if (regMode == CMK_BUFFER_REG || regMode == CMK_BUFFER_PREREG)
 #endif
       {
-        CmiSetRdmaBufferInfo(layerInfo + CmiGetRdmaCommonInfoSize(), ptr, cnt, regMode);
+        CmiSetRdmaBufferInfo(layerInfo + CmiGetRdmaCommonInfoSize(), ptr, cnt,
+                             regMode);
         isRegistered = true;
       }
     }
@@ -231,12 +256,13 @@ class CmiNcpyBuffer {
     // Check that this object is local when deregisterMem is called
     CmiAssert(CmiNodeOf(pe) == CmiMyNode());
 
-    if(isRegistered == false)
+    if (isRegistered == false)
       return;
 
 #if CMK_REG_REQUIRED
-    if(regMode != CMK_BUFFER_NOREG) {
-      CmiDeregisterMem(ptr, layerInfo + CmiGetRdmaCommonInfoSize(), pe, regMode);
+    if (regMode != CMK_BUFFER_NOREG) {
+      CmiDeregisterMem(ptr, layerInfo + CmiGetRdmaCommonInfoSize(), pe,
+                       regMode);
       isRegistered = false;
     }
 #endif
@@ -246,6 +272,7 @@ class CmiNcpyBuffer {
     p((char *)&ptr, sizeof(ptr));
     p((char *)&ref, sizeof(ref));
     p((char *)&refAckInfo, sizeof(refAckInfo));
+    p((char *)&deviceRdmaOpInfo, sizeof(deviceRdmaOpInfo));
     p|cnt;
     p|pe;
     p|regMode;
@@ -262,44 +289,48 @@ class CmiNcpyBuffer {
   void cmaPut(CmiNcpyBuffer &destination);
 #endif
 
-  NcpyOperationInfo *createNcpyOpInfo(CmiNcpyBuffer &source, CmiNcpyBuffer &destination, int ackSize, char *srcAck, char *destAck, int rootNode, int opMode, void *refPtr);
+  NcpyOperationInfo *createNcpyOpInfo(CmiNcpyBuffer &source,
+                                      CmiNcpyBuffer &destination, int ackSize,
+                                      char *srcAck, char *destAck, int rootNode,
+                                      int opMode, void *refPtr);
 
   void rdmaGet(CmiNcpyBuffer &source, int ackSize, char *srcAck, char *destAck);
-  void rdmaPut(CmiNcpyBuffer &destination, int ackSize, char *srcAck, char *destAck);
+  void rdmaPut(CmiNcpyBuffer &destination, int ackSize, char *srcAck,
+               char *destAck);
 
   friend inline void deregisterBuffer(CmiNcpyBuffer &buffInfo);
-
-
 };
 
 /***************************** Other Util *********************************/
 
 void invokeZCPupHandler(void *ref, int pe);
 inline void deregisterBuffer(CmiNcpyBuffer &buffInfo) {
-  CmiDeregisterMem(buffInfo.ptr, buffInfo.layerInfo + CmiGetRdmaCommonInfoSize(), buffInfo.pe, buffInfo.regMode);
+  CmiDeregisterMem(buffInfo.ptr,
+                   buffInfo.layerInfo + CmiGetRdmaCommonInfoSize(), buffInfo.pe,
+                   buffInfo.regMode);
   buffInfo.isRegistered = false;
 }
 CmiNcpyMode findTransferMode(int srcPe, int destPe);
 CmiNcpyMode findTransferModeWithNodes(int srcNode, int destNode);
 
-
 // Converse message to invoke the Ncpy handler on a remote process
-struct ncpyHandlerMsg{
+struct ncpyHandlerMsg {
   char cmicore[CmiMsgHeaderSizeBytes];
   ncpyHandlerIdx opMode;
   void *ref;
 };
 
-struct zcPupSourceInfo{
+struct zcPupSourceInfo {
   CmiNcpyBuffer src;
-  std::function<void (void *)> deallocate;
+  std::function<void(void *)> deallocate;
 };
 
 void zcPupDone(void *ref);
 void zcPupHandler(ncpyHandlerMsg *msg);
 
 zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src);
-zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src, std::function<void (void *)> deallocate);
+zcPupSourceInfo *zcPupAddSource(CmiNcpyBuffer &src,
+                                std::function<void(void *)> deallocate);
 
 void zcPupGet(CmiNcpyBuffer &src, CmiNcpyBuffer &dest);
 
diff --git a/src/conv-core/conv-rdmadevice.C b/src/conv-core/conv-rdmadevice.C
index 2d695fc31a..22c2a00d91 100644
--- a/src/conv-core/conv-rdmadevice.C
+++ b/src/conv-core/conv-rdmadevice.C
@@ -1,7 +1,8 @@
 #include "converse.h"
 #include "conv-rdmadevice.h"
+#include "ck.h"
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 
 CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) {
   CmiEnforce((srcPe >= 0) && (srcPe <= CmiNumPes()));
@@ -10,12 +11,10 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) {
   if (CmiNodeOf(srcPe) == CmiNodeOf(dstPe)) {
     // Same logical node
     return CmiNcpyModeDevice::MEMCPY;
-  }
-  else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) {
+  } else if (CmiPeOnSamePhysicalNode(srcPe, dstPe)) {
     // Different logical nodes, same physical node
     return CmiNcpyModeDevice::IPC;
-  }
-  else {
+  } else {
     // Different physical nodes, requires GPUDirect RDMA
     return CmiNcpyModeDevice::RDMA;
   }
@@ -24,8 +23,8 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int dstPe) {
 #if CMK_GPU_COMM
 #include "machine-rdma.h"
 
-void CmiSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag) {
-  LrtsSendDevice(dest_pe, ptr, size, tag);
+void CmiSendDevice(int dest_rank, int src_rank, const void*& ptr, size_t size, uint64_t& tag) {
+  LrtsSendDevice(dest_rank, src_rank, ptr, size, tag);
 }
 
 void CmiRecvDevice(DeviceRdmaOp* op, DeviceRecvType type) {
@@ -40,6 +39,7 @@ void CmiRdmaDeviceRecvInit(RdmaAckHandlerFn fn) {
 }
 
 void CmiInvokeRecvHandler(void* data) {
+  QdProcess(1);
   rdmaDeviceRecvHandlerFn(data);
 }
 #endif // CMK_GPU_COMM
diff --git a/src/conv-core/conv-rdmadevice.h b/src/conv-core/conv-rdmadevice.h
index 88e5d3b1b4..d9bd75a6f4 100644
--- a/src/conv-core/conv-rdmadevice.h
+++ b/src/conv-core/conv-rdmadevice.h
@@ -5,12 +5,15 @@
 #include "converse.h"
 #include "cmirdmautils.h"
 #include "pup.h"
+#include "conv-rdma.h"
 
-#if CMK_CUDA
-#include <cuda_runtime.h>
+#define CMK_GPU_COMM 1
+
+#if CMK_CUDA || CMK_HIP
+#include "hapi_portable.h"
 
 // Represents the mode of device-side zerocopy transfer
-// MEMCPY indicates that the PEs are on the same logical node and cudaMemcpyDeviceToDevice can be used
+// MEMCPY indicates that the PEs are on the same logical node and hapiMemcpyDeviceToDevice can be used
 // IPC indicates that the PEs are on different logical nodes within the same physical node and CUDA IPC can be used
 // RDMA indicates that the PEs are on different physical nodes and requires GPUDirect RDMA
 enum class CmiNcpyModeDevice : char { MEMCPY, IPC, RDMA };
@@ -23,12 +26,13 @@ class CmiDeviceBuffer {
   // Pointer to and size of the buffer
   const void* ptr;
   size_t cnt;
-  cudaStream_t cuda_stream;
+  hapiStream_t hapi_stream;
 
-#if !CMK_GPU_COMM
   // Source and destination PEs
   int src_pe;
+  int src_mpi_rank;
   int dest_pe;
+  int dest_mpi_rank;
 
   // Used for CUDA IPC
   int device_idx;
@@ -39,32 +43,28 @@ class CmiDeviceBuffer {
   bool data_stored;
   void* data;
 
+  CmiNcpyBuffer lci_ncpy_buffer;
+
   CmiDeviceBuffer() : ptr(NULL), cnt(0), src_pe(-1), dest_pe(-1) { init(); }
 
   explicit CmiDeviceBuffer(const void* ptr_, size_t cnt_) : ptr(ptr_), cnt(cnt_),
-    src_pe(CmiMyPe()), dest_pe(-1) { init(); }
+    src_pe(CmiMyPe()), src_mpi_rank(CmiNodeOf(CmiMyPe())), dest_pe(-1), dest_mpi_rank(-1) { init(); }
 
   void init() {
     device_idx = -1;
     comm_offset = 0;
     event_idx = -1;
-    cuda_stream = cudaStreamPerThread;
+    hapi_stream = hapiStreamPerThread;
 
     data_stored = false;
     data = NULL;
   }
-#else
-  uint64_t tag;
-
-  CmiDeviceBuffer() : ptr(NULL), cnt(0) {}
 
-  explicit CmiDeviceBuffer(const void* ptr_, size_t cnt_) : ptr(ptr_), cnt(cnt_) {}
-#endif // CMK_GPU_COMM
+  uint64_t tag;
 
   void pup(PUP::er &p) {
     p((char *)&ptr, sizeof(ptr));
     p|cnt;
-#if !CMK_GPU_COMM
     p|src_pe;
     p|dest_pe;
     p|device_idx;
@@ -73,18 +73,21 @@ class CmiDeviceBuffer {
     p|data_stored;
     if (data_stored) {
       if (p.isUnpacking()) {
-        cudaMallocHost(&data, cnt);
+        hapiMallocHost(&data, cnt);
       }
       PUParray(p, (char*)data, cnt);
     }
-#else
     p|tag;
-#endif // CMK_GPU_COMM
+    p|src_pe;
+    p|src_mpi_rank;
+    p|dest_pe;
+    p|dest_mpi_rank;
+    p|lci_ncpy_buffer;
   }
 
   ~CmiDeviceBuffer() {
 #if !CMK_GPU_COMM
-    if (data) cudaFreeHost(data);
+    if (data) hapiFreeHost(data);
 #endif
   }
 };
@@ -94,7 +97,7 @@ CmiNcpyModeDevice findTransferModeDevice(int srcPe, int destPe);
 #if CMK_GPU_COMM
 typedef void (*RdmaAckCallerFn)(void *token);
 
-void CmiSendDevice(int dest_pe, const void*& ptr, size_t size, uint64_t& tag);
+void CmiSendDevice(int dest_rank, int src_rank, const void*& ptr, size_t size, uint64_t& tag);
 void CmiRecvDevice(DeviceRdmaOp* op, DeviceRecvType type);
 void CmiRdmaDeviceRecvInit(RdmaAckCallerFn fn);
 void CmiInvokeRecvHandler(void* data);
diff --git a/src/conv-core/cpuaffinity.C b/src/conv-core/cpuaffinity.C
index 27267a7262..542fc00102 100644
--- a/src/conv-core/cpuaffinity.C
+++ b/src/conv-core/cpuaffinity.C
@@ -833,9 +833,8 @@ void CmiCheckAffinity(void)
     cpu_set_t my_aff;
     if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n");
     CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0)
-
     cpuAffSyncWait(cpuPhyAffCheckDone);
-
+    
 #if CMK_SMP && !CMK_SMP_NO_COMMTHD
     CmiNodeBarrier();
 
diff --git a/src/libs/ck-libs/ampi/ampi.C b/src/libs/ck-libs/ampi/ampi.C
index 621cfcbe37..01d999ef62 100644
--- a/src/libs/ck-libs/ampi/ampi.C
+++ b/src/libs/ck-libs/ampi/ampi.C
@@ -1430,7 +1430,7 @@ void ampiParent::pup(PUP::er &p) noexcept {
         case AMPI_G_REQ:
           blockingReq = new GReq;
           break;
-#if CMK_CUDA
+#if CMK_CUDA 
         case AMPI_GPU_REQ:
           CkAbort("AMPI> error trying to PUP a non-migratable GPU request!");
           break;
@@ -11720,7 +11720,7 @@ int AMPI_GPU_Iinvoke_wr(hapiWorkRequest *to_call, MPI_Request *request)
 /* Submit GPU request that will be notified of completion once the previous
  * operations in the given CUDA stream are complete */
 CLINKAGE
-int AMPI_GPU_Iinvoke(cudaStream_t stream, MPI_Request *request)
+int AMPI_GPU_Iinvoke(hapiStream_t stream, MPI_Request *request)
 {
   AMPI_API("AMPI_GPU_Iinvoke", stream, request);
 
@@ -11748,7 +11748,7 @@ int AMPI_GPU_Invoke_wr(hapiWorkRequest *to_call)
 }
 
 CLINKAGE
-int AMPI_GPU_Invoke(cudaStream_t stream)
+int AMPI_GPU_Invoke(hapiStream_t stream)
 {
   AMPI_API("AMPI_GPU_Invoke", stream);
 
diff --git a/src/libs/ck-libs/ampi/ampi_functions.h b/src/libs/ck-libs/ampi/ampi_functions.h
index 48b2bd6e22..43c8a3512a 100644
--- a/src/libs/ck-libs/ampi/ampi_functions.h
+++ b/src/libs/ck-libs/ampi/ampi_functions.h
@@ -23,7 +23,7 @@
 # error You must define AMPI_CUSTOM_FUNC before including this file!
 #endif
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 #include "hapi_functions.h"
 #endif
 
@@ -607,11 +607,11 @@ AMPI_CUSTOM_FUNC(int, AMPI_Alltoall_long, void *sendbuf, int sendcount, MPI_Data
 
 
 #ifdef __cplusplus
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke_wr, hapiWorkRequest *to_call, MPI_Request *request)
-AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke, cudaStream_t stream, MPI_Request *request)
+AMPI_CUSTOM_FUNC(int, AMPI_GPU_Iinvoke, hapiStream_t stream, MPI_Request *request)
 AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke_wr, hapiWorkRequest *to_call)
-AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke, cudaStream_t stream)
+AMPI_CUSTOM_FUNC(int, AMPI_GPU_Invoke, hapiStream_t stream)
 #endif
 #endif
 
diff --git a/src/scripts/Make.depends b/src/scripts/Make.depends
index 8d5c014cf6..a73bc09427 100644
--- a/src/scripts/Make.depends
+++ b/src/scripts/Make.depends
@@ -169,7 +169,7 @@ EveryLB.o: EveryLB.C LBManager.h LBDatabase.h lbdb.h converse.h \
  PathHistory.decl.h ckcallback-ccs.h CkCallback.decl.h BaseLB.decl.h \
  EveryLB.decl.h charm++.h envelope.h sdag.h TreeLB.decl.h idm.h \
  BaseLB.decl.h DistributedLB.decl.h DistBaseLB.decl.h LBManager.decl.h \
- MetisLB.decl.h CentralLB.decl.h CentralLBMsg.h RecBipartLB.decl.h \
+ MetisLB.decl.h GreedyCentralLB.decl.h CentralLB.decl.h CentralLBMsg.h RecBipartLB.decl.h \
  EveryLB.def.h
 
 HybridBaseLB.o: HybridBaseLB.C HybridBaseLB.h charm++.h charm.h \
@@ -394,6 +394,30 @@ MetisLB.o: MetisLB.C MetisLB.h CentralLB.h BaseLB.h LBManager.h \
  CkCallback.decl.h BaseLB.decl.h CentralLB.decl.h CentralLBMsg.h \
  manager.h MetisLB.decl.h ckgraph.h MetisLB.def.h
 
+ GreedyCentralLB.o: GreedyCentralLB.C GreedyCentralLB.h CentralLB.h BaseLB.h LBManager.h \
+ LBDatabase.h lbdb.h converse.h conv-header.h conv-config.h \
+ conv-autoconfig.h conv-common.h conv-mach-common.h conv-mach.h \
+ conv-mach-opt.h lrts-common.h cmiqueue.h pup_c.h pup_c_functions.h \
+ lrtslock.h queueing.h conv-cpm.h conv-cpath.h conv-qd.h conv-random.h \
+ conv-lists.h conv-trace.h persistent.h cmirdmautils.h debug-conv.h \
+ charm.h conv-rdma.h pup.h middle.h middle-conv.h LBObj.h LBOM.h LBComm.h \
+ LBMachineUtil.h json_fwd.hpp LBManager.decl.h charm++.h cklists.h \
+ pup_stl.h conv-config.h ckbitvector.h ckstream.h init.h charm-api.h \
+ ckhashtable.h ckrdma.h envelope.h pup.h charm.h middle.h cklists.h \
+ objid.h charm.h converse.h pup.h ckcallback.h cksection.h ckarrayindex.h \
+ objid.h conv-ccs.h sockRoutines.h ccs-server.h register.h debug-charm.h \
+ debug-conv++.h simd.h ckmessage.h CkMarshall.decl.h sdag.h pup_stl.h \
+ envelope.h debug-charm.h ckrdmadevice.h conv-rdmadevice.h ckobjQ.h \
+ ckreduction.h CkReduction.decl.h ckmemcheckpoint.h \
+ CkMemCheckpoint.decl.h readonly.h ckarray.h cklocation.h MetaBalancer.h \
+ RandomForestModel.h MetaBalancer.decl.h CkLocation.decl.h \
+ ckarrayoptions.h ckmulticast.h CkMulticast.decl.h cklocrec.h \
+ ckmigratable.h CkArray.decl.h ckfutures.h CkFutures.decl.h waitqd.h \
+ waitqd.decl.h ckcheckpoint.h ckcallback.h CkCheckpointStatus.decl.h \
+ trace.h pathHistory.h PathHistory.decl.h ckcallback-ccs.h \
+ CkCallback.decl.h BaseLB.decl.h CentralLB.decl.h CentralLBMsg.h \
+ manager.h GreedyCentralLB.decl.h ckgraph.h GreedyCentralLB.def.h
+
 RecBipartLB.o: RecBipartLB.C RecBipartLB.h CentralLB.h BaseLB.h \
  LBManager.h LBDatabase.h lbdb.h converse.h conv-header.h conv-config.h \
  conv-autoconfig.h conv-common.h conv-mach-common.h conv-mach.h \
diff --git a/src/scripts/charmc b/src/scripts/charmc
index 7863c144ae..aab371b07d 100755
--- a/src/scripts/charmc
+++ b/src/scripts/charmc
@@ -1614,7 +1614,7 @@ modInitObj="$modInitName.o"
 MAKE_LDXX="0"
 MAKE_LD="0"
 
-CORE_LIBS=(-lreconverse -lcharm_cxx_utils "${TRACE_OBJ[@]}" -lm)
+CORE_LIBS=(-lreconverse -lcharm_cxx_utils "${TRACE_OBJ[@]}" -lm -lckrescale)
 
 if [[ "$BUILD_SHARE" = '0' && "$USER_INITIATED_SHARED" = '0' ]]
 then
@@ -2297,14 +2297,41 @@ fi
 
 if [[ "$COPY_CHARMRUN" = 'true' ]]
 then
-  targ="charmrun$CMK_POST_EXE"
-  [[ ! -x "$CHARMBIN/$targ" && -n "$CMK_POST_EXE" ]] && targ=charmrun
+	targ="charmrun$CMK_POST_EXE"
+	[[ ! -x "$CHARMBIN/$targ" && -n "$CMK_POST_EXE" ]] && targ=charmrun
 
-  if [[ -x "$CHARMBIN/$targ" ]]
-  then
+	if [[ -x "$CHARMBIN/$targ" ]]
+	then
 	DoNoErrCheck $RM "$targ"
 	DoNoErrCheck $CP "$CHARMBIN/$targ" "$targ" 2> /dev/null
-  fi
+	fi
+
+	targ_elastic="charmrun_elastic$CMK_POST_EXE"
+	[[ ! -x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=charmrun_elastic
+
+	if [[ -x "$CHARMBIN/$targ_elastic" ]]
+	then
+	DoNoErrCheck $RM "$targ_elastic"
+	DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null
+	fi
+
+	targ_elastic="charmrun_hapi$CMK_POST_EXE"
+	[[ ! -x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=charmrun_hapi
+
+	if [[ -x "$CHARMBIN/$targ_elastic" ]]
+	then
+	DoNoErrCheck $RM "$targ_elastic"
+	DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null
+	fi
+
+	targ_elastic="hapi_memory_daemon$CMK_POST_EXE"
+	[[ ! -x "$CHARMBIN/$targ_elastic" && -n "$CMK_POST_EXE" ]] && targ_elastic=hapi_memory_daemon
+
+	if [[ -x "$CHARMBIN/$targ_elastic" ]]
+	then
+	DoNoErrCheck $RM "$targ_elastic"
+	DoNoErrCheck $CP "$CHARMBIN/$targ_elastic" "$targ_elastic" 2> /dev/null
+	fi
 fi
 
 [[ -z "$SKIP_MODULEINIT" && -z "$SAVE" ]] && DoNoErrCheck $RM "$modInitSrc" "$modInitObj" > /dev/null 2>&1
diff --git a/src/util/charmrun-src/CMakeLists.txt b/src/util/charmrun-src/CMakeLists.txt
index 95a3ccf389..f0f6e96d20 100644
--- a/src/util/charmrun-src/CMakeLists.txt
+++ b/src/util/charmrun-src/CMakeLists.txt
@@ -8,7 +8,7 @@ target_link_libraries(charmd_faceless PRIVATE -seq)
 
 add_executable(charmrun charmrun.C)
 target_compile_options(charmrun PRIVATE -seq -DCMK_NOT_USE_CONVERSE=1)
-target_link_libraries(charmrun PRIVATE -seq)
+target_link_libraries(charmrun PRIVATE -seq ckrescale)
 
 target_include_directories(charmrun PRIVATE ../../conv-ccs ..) # for ccs-auth.c sockRoutines.c
 
diff --git a/src/util/charmrun-src/Makefile b/src/util/charmrun-src/Makefile
index 2fbc3b4bf6..cf80f1c5db 100644
--- a/src/util/charmrun-src/Makefile
+++ b/src/util/charmrun-src/Makefile
@@ -5,13 +5,13 @@ SHELL=/bin/sh
 
 INCLUDED=../conv-mach.h ../conv-mach-opt.h \
 	../ccs-server.C ../ccs-server.h ../ccs-auth.C ../ccs-auth.h \
-	../sockRoutines.C ../sockRoutines.h
+	../sockRoutines.C ../sockRoutines.h ../ckcheckpoint.C ../ckcheckpoint.h
 
 all: charmrun charmd charmd_faceless
 
-charmrun: charmrun.C $(INCLUDED)
+charmrun: charmrun.C $(INCLUDED) ck.o
 	$(CHARMC) -c -seq -DCMK_NOT_USE_CONVERSE=1 charmrun.C -o charmrun.o
-	$(CHARMC) -cp $(BIN) -seq -language c++ -o charmrun charmrun.o
+	$(CHARMC) -cp $(BIN) -seq -language c++ -o charmrun charmrun.o ck.o
 
 charmd: daemon.C daemon.h ../sockRoutines-seq.o
 	$(CHARMC) -seq -c daemon.C -o daemon.o
diff --git a/src/util/charmrun-src/charmrun.C b/src/util/charmrun-src/charmrun.C
index 9403096137..79ecf20efa 100644
--- a/src/util/charmrun-src/charmrun.C
+++ b/src/util/charmrun-src/charmrun.C
@@ -1,4 +1,5 @@
 #include "converse.h"
+#include "ckrescale.h"
 
 #include "sockRoutines.h"
 #include "sockRoutines.C"
@@ -25,6 +26,7 @@
 #include <sys/stat.h>
 
 #include <unordered_map>
+#include <unordered_set>
 #include <map>
 #include <string>
 #include <vector>
@@ -32,6 +34,12 @@
 #include <utility>
 #include <algorithm>
 
+#include <regex>
+#include <iostream>
+#include <fstream>
+#include <string>
+
+
 #if defined(_WIN32)
 /*Win32 has screwy names for the standard UNIX calls:*/
 #define getcwd _getcwd
@@ -360,6 +368,8 @@ static char *getenv_display_no_tamper()
 static unsigned int server_port;
 static char server_addr[1024]; /* IP address or hostname of charmrun*/
 static SOCKET server_fd;
+
+static std::unordered_set<int> node_set;
 /*****************************************************************************
  *                                                                           *
  * PPARAM - obtaining "program parameters" from the user.                    *
@@ -745,8 +755,8 @@ static char **saved_argv;
 static int saved_argc;
 static int arg_realloc_pes;
 static int arg_old_pes;
-static int arg_shrinkexpand;
 static int arg_charmrun_port;
+static int arg_shrinkexpand;
 static const char *arg_shrinkexpand_basedir;
 #endif
 
@@ -779,7 +789,6 @@ static int arg_server_port = 0;
 static const char *arg_server_auth = NULL;
 static int replay_single = 0;
 
-
 struct TopologyRequest
 {
   int host, socket, core, pu;
@@ -817,6 +826,93 @@ TopologyRequest proc_per;
 TopologyRequest onewth_per;
 int auto_provision;
 
+/* Dumps /app/hostfile to stdout; does nothing when the file cannot be opened. */
+void print_nodelist(){
+    FILE *f=fopen("/app/hostfile","r");
+    if (f == NULL)
+      return;
+    int c; // int, not char: EOF must stay distinguishable from every byte value
+    while ((c = fgetc(f)) != EOF)
+      putchar(c);
+    fclose(f);
+}
+
+/* Sums the "++cpus" counts of every line in /etc/mpi/hostfile.
+ * Returns 0 as soon as a line does not match the expected
+ * "host <name>-worker-<n>.<suffix> ++cpus <count>" layout. */
+int count_num_slots()
+{
+  printf("Counting slots in hostfile\n");
+
+  const std::regex host_pattern("host (.*)-worker-(\\d+)\\.(.*) \\+\\+cpus (\\d+)");
+  std::ifstream hostfile("/etc/mpi/hostfile");
+  int slot_sum = 0;
+
+  for (std::string entry; getline(hostfile, entry); )
+  {
+    std::smatch fields;
+    if (!std::regex_search(entry, fields, host_pattern))
+    {
+      // Give up on the first line that does not parse
+      printf("Error parsing hostfile regex\n");
+      return 0;
+    }
+    slot_sum += std::stoi(fields[4]);
+  }
+
+  printf("Total slots = %d\n", slot_sum);
+  std::cout << std::flush;
+  return slot_sum;
+}
+
+/* Polls count_num_slots() until it equals numProcs, sleeping between polls.
+ * The delay doubles each round (1s, 2s, 4s, ...) and is capped at 64s, so the
+ * shift can never reach the width of int. */
+void wait_hostfile(int numProcs)
+{
+  for (int i = 0; count_num_slots() != numProcs; i = std::min(i + 1, 6))
+    sleep(1 << i);
+}
+
+/* Regenerates /app/hostfile with numProcesses worker entries, using the first
+ * line of /etc/mpi/hostfile as the template for name, suffix and cpu count. */
+void write_hostfile(int numProcesses)
+{
+    std::ifstream infile("/etc/mpi/hostfile");
+    std::string sLine;
+    getline(infile, sLine);
+    infile.close();
+    printf("Line = %s\n", sLine.c_str());
+    std::cout << std::flush;
+
+    const std::regex rgx("host (.*)-worker-(\\d+)\\.(.*) \\+\\+cpus (\\d+)");
+    std::smatch match;
+    if (!std::regex_search(sLine, match, rgx))
+    {
+        printf("Error parsing hostfile regex\n");
+        return;
+    }
+
+    const std::string name = match[1];
+    const std::string suffix = match[3];
+    const int slots = std::stoi(match[4]);
+
+    std::ofstream outfile("/app/hostfile");
+    for (int i = 0; i < numProcesses; i++)
+    {
+        // Built as a std::string: formatting into a fixed char[200] with
+        // sprintf could overflow on long host names or suffixes.
+        const std::string hostStr = "host " + name + "-worker-" + std::to_string(i) +
+                                    "." + suffix + " ++cpus " + std::to_string(slots) + "\n";
+        printf("Writing: %s\n", hostStr.c_str());
+        outfile << hostStr;
+    }
+    outfile.flush();
+    outfile.close();
+
+    print_nodelist();
+}
+
 static void arg_init(int argc, const char **argv)
 {
   static char buf[1024];
@@ -867,6 +963,7 @@ static void arg_init(int argc, const char **argv)
   pparam_flag(&arg_child_charmrun, 0, "child-charmrun", "child charmrun");
 #endif
 #if CMK_SHRINK_EXPAND
+  arg_shrinkexpand = 0;
   pparam_int(&arg_realloc_pes, 1, "newp", "New number of processes to create");
   pparam_int(&arg_old_pes, 1, "oldp", "Old number of processes to create");
   pparam_flag(&arg_shrinkexpand, 0, "shrinkexpand", "Enable shrink/expand support");
@@ -964,7 +1061,13 @@ static void arg_init(int argc, const char **argv)
 #if CMK_SHRINK_EXPAND
   if (arg_shrinkexpand) {
     arg_requested_pes = arg_realloc_pes;
-    printf("\n \nCharmrun> %d Reallocated pes\n \n", arg_requested_pes);
+    //arg_nodelist = "/etc/mpi/hostfileScaled";
+    //write_hostfile(arg_requested_pes);
+    //printf("Waiting\n");
+    //wait_hostfile(arg_requested_nodes);
+    //printf("\n \nCharmrun> %d Reallocated pes\n \n", arg_requested_pes);
+    //print_nodelist();
+    //arg_nodelist = new_hostfile;
   }
 #endif
 
@@ -1319,6 +1422,7 @@ struct nodetab_host
 
   skt_ip_t ip = _skt_invalid_ip;      /*IP address of host*/
   int cpus = 1;     /* # of physical CPUs*/
+  int remaining_cpus = 1; /* # of physical CPUs remaining for this host */
   int nice = -100;     /* process priority */
 //  int forks = 0;    /* number of processes to fork on remote node */
 
@@ -1393,6 +1497,42 @@ static std::vector<nodetab_host *> my_host_table;
 static std::vector<nodetab_process> my_process_table;
 static std::vector<nodetab_process *> pe_to_process_map;
 
+#if CMK_SHRINK_EXPAND
+ /* Builds the OLDNODENAMES value: each process's host name preceded by a space; rebuilt per call, truncated to fit. */
+ char *create_oldnodenames()
+ {
+   static char dest1[1024 * 1000];
+   std::string names;
+   for (const nodetab_process & p : my_process_table)
+     names.append(" ").append(p.host->name);
+   snprintf(dest1, sizeof(dest1), "%s", names.c_str());  // bounded copy; no self-aliasing sprintf
+   printf("Charmrun> Created oldnames %s \n", dest1);
+   return dest1;
+ }
+
+ /* Returns 1 when `names` is strcmp-equal to one of the first arg_old_pes
+  * entries of listofnames, and 0 otherwise. */
+ int isPresent(const char *names, char **listofnames)
+ {
+   int found = 0;
+   for (int idx = 0; !found && idx < arg_old_pes; ++idx)
+     found = (strcmp(names, listofnames[idx]) == 0);
+   return found;
+ }
+ /* Splits $OLDNODENAMES into arg_old_pes heap-allocated names (at most 99 chars each); entries stay "" once the list runs out. */
+ void parse_oldnodenames(char **oldnodelist)
+ {
+   const char *ns = getenv("OLDNODENAMES");
+   if (ns == NULL) ns = "";  // unset variable: every entry becomes ""
+   for (int i = 0; i < arg_old_pes; i++) {
+     int consumed = 0;
+     oldnodelist[i] = (char *) calloc(100, sizeof(char));
+     if (sscanf(ns, "%99s%n", oldnodelist[i], &consumed) == 1)
+       ns += consumed;  // %n: resume right after the token just read
+   }
+ }
+ #endif
+
 static const char *nodetab_args(const char *args, nodetab_host *h)
 {
   while (*args != 0)
@@ -1438,7 +1578,7 @@ static const char *nodetab_args(const char *args, nodetab_host *h)
 
     args = skipblanks(e2);
   }
-
+  h->remaining_cpus = h->cpus;
   return args;
 }
 
@@ -1531,6 +1671,7 @@ static void nodetab_init_with_nodelist()
             host->name = strdup(hostname.c_str());
             host->ip = nodetab_host::resolve(hostname.c_str());
             host->hostno = hostno++;
+            printf("Adding host %s, %i\n", host->name, host->hostno);
             temp_hosts.insert({hostname, host});
             nodetab_args(b3, host);
           }
@@ -1595,6 +1736,8 @@ static void nodeinfo_add(const ChSingleNodeinfo *in, nodetab_process & p)
     fprintf(stderr, "Charmrun> Warning: Process #%d received ChSingleNodeInfo #%d\n", p.nodeno, node);
 
   p.info = in->info;
+  fprintf(stdout, "Charmrun> client %d added -> dataport = %d\n", node, ChMessageInt(p.info.dataport));
+  fflush(stdout);
   p.num_pus = ChMessageInt(in->num_pus);
   p.num_cores = ChMessageInt(in->num_cores);
   p.num_sockets = ChMessageInt(in->num_sockets);
@@ -1961,7 +2104,6 @@ static int req_handle_initnode(ChMessage *msg, nodetab_process & p)
     fprintf(stderr, "Charmrun: possibly because: %s.\n", msg->data);
     exit(1);
   }
-
   nodeinfo_add((ChSingleNodeinfo *) msg->data, p);
   return REQ_OK;
 }
@@ -2430,6 +2572,8 @@ static int req_handle_realloc(ChMessage *msg, SOCKET fd)
     ret[saved_argc + index++] = NULL;
   }
 
+  setenv("OLDNODENAMES", create_oldnodenames(), 1);
+
   ChMessage ackmsg;
   ChMessage_new("realloc_ack", 0, &ackmsg);
   for (const nodetab_process & p : my_process_table)
@@ -3181,6 +3325,8 @@ static SOCKET errorcheck_one_client_connect(void)
 
   const SOCKET req_client = skt_accept(server_fd, &clientIP, &clientPort);
 
+  //printf("clientPort = %d\n", clientPort);
+
   /* FIXME: will this ever be triggered? It seems the skt_abort handler here is
    *        'client_connect_problem', which calls exit(1), so we'd exit
    *        in skt_accept. */
@@ -3314,6 +3460,13 @@ static void req_set_client_connect(std::vector<nodetab_process> & process_table,
   curclientend = 0;
 #endif
 
+  printf("Charmrun> Waiting for %d clients to connect.\n", count);
+  for (int i = 0; i < process_table.size(); i++)
+  {
+   nodetab_process & p = process_table[i];
+   printf("Charmrun> process table nodeno %d, name %s\n", p.nodeno, p.host->name);
+  }
+
   int finished = 0;
   while (finished < count)
   {
@@ -3327,8 +3480,18 @@ static void req_set_client_connect(std::vector<nodetab_process> & process_table,
 
       curclientend++;
     }
+    //fprintf(stdout, "open_sockets.size() = %d, clientstart,end=%d, %d\n", open_sockets.size(), 
+    //  curclientstart, curclientend);
+    //fflush(stdout);
 #endif
     /* check appropriate clients for messages */
+
+    //for (int i = 0; i < process_table.size(); i++)
+    //{
+    //  nodetab_process & p = process_table[i];
+    //  printf("Charmrun> process table nodeno %d\n", p.nodeno);
+    //}
+
     while (!open_sockets.empty())
     {
       const SOCKET req_client = open_sockets.front();
@@ -3340,6 +3503,17 @@ static void req_set_client_connect(std::vector<nodetab_process> & process_table,
         ChMessage_recv(req_client, &msg);
 
         int nodeNo = ChMessageInt(((ChSingleNodeinfo *)msg.data)->nodeNo);
+
+        printf("Charmrun> node %d is connecting\n", nodeNo);
+
+        if (node_set.find(nodeNo) != node_set.end())
+        {
+          printf("Charmrun> node %d is already in the node set\n", nodeNo);
+          continue;
+        }
+
+        node_set.insert(nodeNo);
+
         nodetab_process & p = get_process_for_nodeno(process_table, nodeNo);
         p.req_client = req_client;
 
@@ -3578,24 +3752,71 @@ static void req_construct_phase2_processes(std::vector<nodetab_process> & phase2
 
   for (nodetab_process & p : my_process_table)
   {
-    p.forkstart = active_host_count + p.nodeno * new_processes_per_host;
+    //p.forkstart = active_host_count + p.nodeno * new_processes_per_host;
     p.host->processes = 1;
+    p.host->remaining_cpus--;
   }
 
-  for (int i = 0; i < num_new_processes; ++i)
+  int i = 0;
+  //int curr_pe = active_host_count;
+  int num_forks = 0;
+
+  // FIXME this will hang if total PEs requested > total PEs available
+  while (num_forks < num_new_processes)
   {
-    nodetab_process & src = my_process_table[i % active_host_count];
-    phase2_processes.push_back(src);
+    nodetab_process & src = my_process_table[i++ % active_host_count];
 
-    nodetab_process & p = phase2_processes.back();
-    p.nodeno = src.forkstart + (src.host->processes++ - 1);
+    int prev_pe = src.nodeno;
+    while (src.host->remaining_cpus > 0)
+    {
+      if (num_forks >= num_new_processes)
+        break;
+      ++prev_pe;
+      if (src.forkstart == 0)
+        src.forkstart = prev_pe;
+      src.host->processes++;
+      src.host->remaining_cpus--;
+
+      phase2_processes.push_back(src);
+      nodetab_process & p = phase2_processes.back();
+      p.nodeno = prev_pe;
+      num_forks++;
+    }
   }
+
+  printf("PHASE2> %d processes will be forked\n", phase2_processes.size());
 }
 
 static void start_nodes_local(const std::vector<nodetab_process> &);
 static void start_nodes_ssh(std::vector<nodetab_process> &);
 static void finish_nodes(std::vector<nodetab_process> &);
 
+/* Builds the phase-2 process list, launches those processes over ssh when
+ * required, then waits for every client in process_table to connect. */
+static void req_client_reconnect(std::vector<nodetab_process> & process_table)
+{
+  skt_set_abort(client_connect_problem_skt);
+  std::vector<nodetab_process> phase2_processes;
+  req_construct_phase2_processes(phase2_processes);
+  // size() yields a size_t, which must be printed with %zu rather than %d
+  printf("Phase2 reconnect: %zu processes will be forked\n", phase2_processes.size());
+
+  if (!phase2_processes.empty() && !arg_local)
+  {
+#if CMK_SHRINK_EXPAND
+    // Launch only when the requested PE count exceeds the old one
+    if (arg_requested_pes > arg_old_pes)
+#endif
+    {
+      assert(!arg_mpiexec);
+      start_nodes_ssh(phase2_processes);
+    }
+  }
+  req_add_phase2_processes(phase2_processes);
+  req_client_connect_table(process_table);
+  req_all_clients_connected();
+}
+
 static void req_client_connect(std::vector<nodetab_process> & process_table)
 {
   skt_set_abort(client_connect_problem_skt);
@@ -3641,23 +3862,28 @@ static void req_client_connect(std::vector<nodetab_process> & process_table)
     }
     else
     {
-      // send nodefork packets
-      ChMessageHeader hdr;
-      ChMessageInt_t mydata[ChInitNodeforkFields];
-      ChMessageHeader_new("nodefork", sizeof(mydata), &hdr);
-      for (const nodetab_process & p : process_table)
+#if CMK_SHRINK_EXPAND
+      if (!arg_shrinkexpand)
+#endif
       {
-        int numforks = p.host->processes - 1;
-        if (numforks <= 0)
-          continue;
+        // send nodefork packets
+        ChMessageHeader hdr;
+        ChMessageInt_t mydata[ChInitNodeforkFields];
+        ChMessageHeader_new("nodefork", sizeof(mydata), &hdr);
+        for (const nodetab_process & p : process_table)
+        {
+          int numforks = p.host->processes - 1;
+          if (numforks <= 0)
+            continue;
 
-        if (arg_verbose)
-          printf("Charmrun> Instructing host \"%s\" to fork() x %d\n", p.host->name, numforks);
+          if (arg_verbose)
+            printf("Charmrun> Instructing host \"%s\" to fork() x %d\n", p.host->name, numforks);
 
-        mydata[0] = ChMessageInt_new(numforks);
-        mydata[1] = ChMessageInt_new(p.forkstart);
-        skt_sendN(p.req_client, (const char *) &hdr, sizeof(hdr));
-        skt_sendN(p.req_client, (const char *) mydata, sizeof(mydata));
+          mydata[0] = ChMessageInt_new(numforks);
+          mydata[1] = ChMessageInt_new(p.forkstart);
+          skt_sendN(p.req_client, (const char *) &hdr, sizeof(hdr));
+          skt_sendN(p.req_client, (const char *) mydata, sizeof(mydata));
+        }
       }
     }
 
@@ -4166,7 +4392,7 @@ int main(int argc, const char **argv, char **envp)
     for (const nodetab_host * h : host_table)
     {
       skt_print_ip(ips, sizeof(ips), h->ip);
-      printf("Charmrun> added host \"%s\", IP:%s\n", h->name, ips);
+      printf("Charmrun> added host \"%s\", hostno %d, IP:%s\n", h->name, h->hostno, ips);
     }
   }
 
@@ -4195,12 +4421,14 @@ int main(int argc, const char **argv, char **envp)
                                        ? (arg_requested_nodes > 0 ? std::min(my_host_count, arg_requested_nodes) : my_host_count)
                                        : std::min(my_host_count, get_old_style_process_count());
   my_process_table.resize(my_initial_process_count);
+  int curr_nodeno = 0;
   for (int i = 0; i < my_initial_process_count; ++i)
   {
     nodetab_host * h = my_host_table[i];
     nodetab_process & p = my_process_table[i];
     p.host = h;
-    p.nodeno = h->hostno;
+    p.nodeno = curr_nodeno;
+    curr_nodeno += h->cpus;
   }
 
   /* start the node processes */
@@ -4301,6 +4529,11 @@ int main(int argc, const char **argv, char **envp)
       finish_nodes(my_process_table);
 #endif
     if (!arg_batch_spawn)
+#if CMK_SHRINK_EXPAND
+      if (arg_shrinkexpand) /* rescale run: re-attach to the node programs instead of a fresh connect */
+        req_client_reconnect(my_process_table);
+      else /* binds to the arg_shrinkexpand test above */
+#endif /* without CMK_SHRINK_EXPAND only the plain connect below remains */
       req_client_connect(my_process_table);
   }
 #if CMK_SSH_KILL
@@ -5259,10 +5492,27 @@ static void start_one_node_ssh(nodetab_process & p, const char ** argv)
 
 static void start_nodes_ssh(std::vector<nodetab_process> & process_table)
 {
+  char **oldnodenames;
+#if CMK_SHRINK_EXPAND
+  if (arg_shrinkexpand)
+  {
+    oldnodenames = (char **) malloc(arg_old_pes * sizeof(char *));
+    parse_oldnodenames(oldnodenames);
+  }
+
   for (nodetab_process & p : process_table)
   {
+    if (arg_shrinkexpand && !isPresent(p.host->name, oldnodenames))
+      start_one_node_ssh(p);
+    else if (!arg_shrinkexpand)
       start_one_node_ssh(p);
   }
+#else
+  for (nodetab_process & p : process_table)
+  {
+    start_one_node_ssh(p);
+  }
+#endif
 }
 
 /* for mpiexec, for once calling mpiexec to start on all nodes  */
diff --git a/src/util/ckrescale.C b/src/util/ckrescale.C
new file mode 100644
index 0000000000..d1cb2c8c5c
--- /dev/null
+++ b/src/util/ckrescale.C
@@ -0,0 +1,24 @@
+// Two global boolean flags used by the shrink/expand (rescale) support, with plain set/get accessors.
+
+bool shrinkexpand_exit = false; // Flag to indicate if we are in the process of shrinking/expanding
+bool in_restart = false; // Flag to indicate if we are in a restart process
+
+// Store value in the shrinkexpand_exit flag.
+void set_shrinkexpand_exit(bool value) {
+  shrinkexpand_exit = value;
+}
+
+// Return the current value of the shrinkexpand_exit flag.
+bool get_shrinkexpand_exit() {
+  return shrinkexpand_exit;
+}
+
+// Store value in the in_restart flag.
+void set_in_restart(bool value) {
+  in_restart = value;
+}
+
+// Return the current value of the in_restart flag.
+bool get_in_restart() {
+  return in_restart;
+}
diff --git a/src/util/ckrescale.h b/src/util/ckrescale.h
new file mode 100644
index 0000000000..ed0c4a2a6e
--- /dev/null
+++ b/src/util/ckrescale.h
@@ -0,0 +1,14 @@
+#ifndef CKRESCALE_H_
+#define CKRESCALE_H_
+
+// Accessors for the two rescale flags; the include guard keeps repeated inclusion harmless.
+
+// Set / read the shrinkexpand_exit flag.
+void set_shrinkexpand_exit(bool value);
+bool get_shrinkexpand_exit();
+
+// Set / read the in_restart flag.
+void set_in_restart(bool value);
+bool get_in_restart();
+
+#endif // CKRESCALE_H_
diff --git a/src/util/cmirdmautils.h b/src/util/cmirdmautils.h
index cbd622987a..62ccbaa6ac 100644
--- a/src/util/cmirdmautils.h
+++ b/src/util/cmirdmautils.h
@@ -6,7 +6,7 @@
 #include <stdio.h>
 #include <stddef.h>
 
-#if CMK_CUDA
+#if CMK_CUDA || CMK_HIP
 enum DeviceRecvType {
   DEVICE_RECV_TYPE_CHARM,
   DEVICE_RECV_TYPE_AMPI,
@@ -20,18 +20,21 @@ typedef struct DeviceRdmaInfo_ {
 } DeviceRdmaInfo;
 
 typedef struct DeviceRdmaOp_ {
-  int dest_pe;
   const void* dest_ptr;
   size_t size;
   DeviceRdmaInfo* info;
   void* src_cb;
   void* dst_cb;
   uint64_t tag;
+  int dest_pe;
+  int src_pe;
+  int src_mpi_rank;
+  int dest_mpi_rank;
 } DeviceRdmaOp;
 
 typedef struct DeviceRdmaOpMsg_ {
   char header[CmiMsgHeaderSizeBytes];
-  DeviceRdmaOp op;
+  DeviceRdmaOp* op;
 } DeviceRdmaOpMsg;
-#endif // CMK_CUDA
+#endif // CMK_CUDA || CMK_HIP
 
diff --git a/src/util/pup.h b/src/util/pup.h
index 365c05876c..481d24b1d1 100644
--- a/src/util/pup.h
+++ b/src/util/pup.h
@@ -132,6 +132,11 @@ typedef enum {
   dataType_last //<- for setting table lengths, etc.
 } dataType;
 
+enum class PUPMode { // Selects the host or device path in the mode-taking bytes()/operator() overloads
+  HOST, // Host mode, no special handling
+  DEVICE // Device mode: sizer counts these bytes in gpuBytes; toMem/fromMem copy through gpuBuf
+};
+
 static inline dataType getXlateDataType(signed char *a) { return Tchar; }
 #if CMK_SIGNEDCHAR_DIFF_CHAR
 static inline dataType getXlateDataType(char *a) { return Tchar; }
@@ -208,10 +213,10 @@ class er {
    /// These state bits describe the PUP::er's direction.
    enum
    {
-     IS_SIZING = 0x0100,
-     IS_PACKING = 0x0200,
-     IS_UNPACKING = 0x0400,
-     TYPE_MASK = 0xFF00
+      IS_SIZING = 0x0100,
+      IS_PACKING = 0x0200,
+      IS_UNPACKING = 0x0400,
+      TYPE_MASK = 0xFF00
    };
  public:
   virtual ~er();//<- does nothing, but might be needed by some child
@@ -262,8 +267,19 @@ class er {
 
   //For arrays:
   template<class T>
-  void operator()(T *a,size_t nItems) {
-    bytes((void *)a,nItems, sizeof(T), getXlateDataType(a));
+  void operator()(T *a, size_t nItems) {
+    bytes((void *)a, nItems, sizeof(T), getXlateDataType(a));
+  }
+
+  // Overload for T**: dereferences a once and pups the nItems T's stored at *a (not the pointer values themselves)
+  template<class T>
+  void operator()(T **a, size_t nItems) {
+    bytes((void *)(*a), nItems, sizeof(T), getXlateDataType(*a));
+  }
+
+  template<class T>
+  void operator()(T *a,size_t nItems, PUPMode mode) { // array form: same as the two-argument overload but forwards mode to the 5-argument bytes()
+    bytes((void *)a,nItems, sizeof(T), getXlateDataType(a), mode);
   }
 
   // Standard pup_buffer API that calls malloc for allocation on isUnpacking and free for deallocation on isPacking
@@ -323,6 +339,7 @@ class er {
   //Generic bottleneck: pack/unpack n items of size itemSize
   // and data type t from p.  Desc describes the data item
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t) =0;
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) =0;
   virtual void object(able** a);
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t) = 0;
@@ -390,21 +407,25 @@ enum {
 class sizer : public er {
  protected:
   size_t nBytes;
+  size_t gpuBytes; // running total for data pupped with PUPMode::DEVICE
   //Generic bottleneck: n items of size itemSize
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode); // mode selects nBytes (HOST) or gpuBytes (DEVICE)
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 
  public:
   //Write data to the given buffer
-  sizer(const unsigned int purpose = 0) : er(IS_SIZING | purpose), nBytes(0)
+  sizer(const unsigned int purpose = 0) : er(IS_SIZING | purpose), nBytes(0), gpuBytes(0) // both counters start at zero
   {
     CmiAssert((purpose & TYPE_MASK) == 0);
   }
 
   //Return the current number of bytes to be packed
   size_t size(void) const {return nBytes;}
+
+  size_t gpu_size(void) const {return gpuBytes;} // bytes accumulated so far for PUPMode::DEVICE data
 };
 
 template <class T>
@@ -417,8 +438,13 @@ class mem : public er { //Memory-buffer packers and unpackers
  protected:
   myByte *origBuf;//Start of memory buffer
   myByte *buf;//Memory buffer (stuff gets packed into/out of here)
-  mem(const unsigned int type, myByte* Nbuf, const unsigned int purpose = 0)
+  myByte *gpuBuf;
+  myByte *gpuOrigBuf;
+  mem(const unsigned int type, myByte* Nbuf, // Nbuf: start of the host buffer
+    myByte* gpuNbuf, // start of the device buffer; the single-buffer toMem/fromMem constructors pass nullptr
+    const unsigned int purpose = 0)
       : er(type | purpose), origBuf(Nbuf), buf(Nbuf)
+      , gpuBuf(gpuNbuf), gpuOrigBuf(gpuNbuf) // listed in member declaration order (gpuBuf, then gpuOrigBuf)
   {
     CmiAssert((purpose & TYPE_MASK) == 0);
   }
@@ -455,14 +481,26 @@ class toMem : public mem {
  protected:
   //Generic bottleneck: pack n items of size itemSize from p.
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode);
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 
  public:
   //Write data to the given buffer
-  toMem(void* Nbuf, const unsigned int purpose = 0)
-      : mem(IS_PACKING, (myByte*)Nbuf, purpose)
+  toMem(void* Nbuf, 
+    void* gpuNbuf,
+    const unsigned int purpose = 0, int state = IS_PACKING)
+      : mem(state, (myByte*)Nbuf,
+      (myByte*)gpuNbuf,
+      purpose)
+  {
+  }
+
+  toMem(void* Nbuf, const unsigned int purpose = 0, int state = IS_PACKING)
+      : mem(state, (myByte*)Nbuf, 
+      nullptr,
+      purpose)
   {
   }
 };
@@ -479,6 +517,7 @@ class fromMem : public mem {
  protected:
   //Generic bottleneck: unpack n items of size itemSize from p.
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode);
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -486,9 +525,19 @@ class fromMem : public mem {
   void pup_buffer_generic(void *&p,size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, bool isMalloc);
 
  public:
-  //Read data from the given buffer
-  fromMem(const void* Nbuf, const unsigned int purpose = 0)
-      : mem(IS_UNPACKING, (myByte*)Nbuf, purpose)
+  fromMem(const void* Nbuf, 
+    const void* gpuNbuf,
+    const unsigned int purpose = 0, int state = IS_UNPACKING)
+      : mem(state, (myByte*)Nbuf,
+      (myByte*)gpuNbuf,
+      purpose)
+  {
+  }
+
+  fromMem(const void* Nbuf, const unsigned int purpose = 0, int state = IS_UNPACKING)
+      : mem(state, (myByte*)Nbuf, 
+      nullptr,
+      purpose)
   {
   }
 };
@@ -524,6 +573,7 @@ class toDisk : public disk {
  protected:
   //Generic bottleneck: pack n items of size itemSize from p.
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode);
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -545,6 +595,7 @@ class fromDisk : public disk {
  protected:
   //Generic bottleneck: unpack n items of size itemSize from p.
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode);
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -576,6 +627,7 @@ class toTextUtil : public er {
   virtual void synchronize(unsigned int m);
  protected:
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) { if (mode == PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled by this PUP::er and is skipped
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -615,6 +667,7 @@ class toTextFile : public er {
  protected:
   FILE *f;
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) { if (mode == PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled by this PUP::er and is skipped
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -636,6 +689,7 @@ class fromTextFile : public er {
   double readDouble(void);
   
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) { if (mode == PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled by this PUP::er and is skipped
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
@@ -724,6 +778,7 @@ class xlater : public wrap_er {
   
   //Generic bottleneck: unpack n items of size itemSize from p.
   virtual void bytes(void *p,size_t n,size_t itemSize,dataType t);
+  virtual void bytes(void *p,size_t n,size_t itemSize,dataType t,PUPMode mode) { if (mode == PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled by this PUP::er and is skipped
 
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t);
   virtual void pup_buffer(void *&p, size_t n, size_t itemSize, dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
diff --git a/src/util/pup_toNetwork.h b/src/util/pup_toNetwork.h
index 7c5663af2e..ced89d7246 100644
--- a/src/util/pup_toNetwork.h
+++ b/src/util/pup_toNetwork.h
@@ -32,6 +32,8 @@ typedef CMK_NETWORK_INT4 CMK_POINTER_SIZED_INT;
 class PUP_toNetwork_sizer : public PUP::er {
 	size_t nBytes;
 	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t);
+	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) { if (mode == PUP::PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled here and is skipped
+	virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {}
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t);
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 
@@ -70,6 +72,8 @@ class PUP_toNetwork_pack : public PUP::er {
 	}
 
 	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t);
+	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) { if (mode == PUP::PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled here and is skipped
+	virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {}
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t);
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 
@@ -118,6 +122,8 @@ class PUP_toNetwork_unpack : public PUP::er {
 	}
 
 	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t);
+	virtual void bytes(void *p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) { if (mode == PUP::PUPMode::HOST) bytes(p,n,itemSize,t); } // HOST data takes the ordinary 4-argument path; DEVICE data is not handled here and is skipped
+	virtual void bytes(void **p,size_t n,size_t itemSize,PUP::dataType t,PUP::PUPMode mode) {}
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t);
 	virtual void pup_buffer(void *&p, size_t n, size_t itemSize, PUP::dataType t, std::function<void *(size_t)> allocate, std::function<void (void *)> deallocate);
 
diff --git a/src/util/pup_util.C b/src/util/pup_util.C
index 561b7ecbad..a8fc92b9fc 100644
--- a/src/util/pup_util.C
+++ b/src/util/pup_util.C
@@ -22,6 +22,13 @@ virtual functions are defined here.
 #include "converse.h"
 #include "pup.h"
 #include "ckhashtable.h"
+#include "conv-mach-cuda.h"
+#include "conv-mach-hip.h"
+
+#if CMK_CUDA || CMK_HIP
+#include "hapi_portable.h"
+#include "hapi_impl.h"
+#endif
 
 #include "conv-rdma.h"
 #if defined(_WIN32)
@@ -145,9 +152,21 @@ void PUP::sizer::bytes(void * /*p*/,size_t n,size_t itemSize,dataType /*t*/)
 	nBytes+=n*itemSize;
 }
 
+// Sizing pass with an explicit mode: add n*itemSize to gpuBytes for DEVICE data or to nBytes for HOST data.
+void PUP::sizer::bytes(void * /*p*/, size_t n, size_t itemSize, dataType /*t*/, PUPMode mode)
+{
+#ifdef CK_CHECK_PUP
+	nBytes += sizeof(pupCheckRec);
+#endif
+  const size_t total = n * itemSize;
+  if (mode == PUPMode::DEVICE) gpuBytes += total;
+  else if (mode == PUPMode::HOST) nBytes += total;
+}
+
 /*Memory PUP::er's*/
 void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t)
 {
+  //CmiPrintf("[%d] PUP::toMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d\n", CmiMyPe(), p, n, itemSize, t);
 #ifdef CK_CHECK_PUP
 	((pupCheckRec *)buf)->write(t,n);
 	buf+=sizeof(pupCheckRec);
@@ -156,8 +175,10 @@ void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t)
 	memcpy((void *)buf,p,n); 
 	buf+=n;
 }
+
 void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t)
 {
+  //CmiPrintf("[%d] PUP::fromMem::bytes called with p=%p, n=%zu, itemSize=%zu, t=%d\n", CmiMyPe(), p, n, itemSize, t);
 #ifdef CK_CHECK_PUP
 	((pupCheckRec *)buf)->check(t,n);
 	buf+=sizeof(pupCheckRec);
@@ -167,6 +188,53 @@ void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t)
 	buf+=n;
 }
 
+// Pack n items of itemSize bytes from p: HOST data is copied into buf, any other mode is copied into gpuBuf (GPU builds only).
+void PUP::toMem::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode)
+{
+#ifdef CK_CHECK_PUP
+	((pupCheckRec *)buf)->write(t,n);
+	buf+=sizeof(pupCheckRec);
+#endif
+	n*=itemSize;
+  if (mode == PUPMode::HOST)
+  {
+    memcpy((void *)buf,p,n); 
+    buf+=n;
+  }
+  else
+  {
+#if CMK_CUDA || CMK_HIP
+    // p is assumed to be a device pointer; gpuBuf is null when the single-buffer constructor was used
+    CmiAssert(n == 0 || gpuBuf != nullptr);
+    hapiMemcpy((void *)gpuBuf, p, n, hapiMemcpyDeviceToDevice);
+    gpuBuf += n;
+#endif
+  }
+}
+
+// Unpack n items of itemSize bytes into p: HOST data comes from buf, any other mode is copied from gpuBuf (GPU builds only).
+void PUP::fromMem::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode)
+{
+#ifdef CK_CHECK_PUP
+	((pupCheckRec *)buf)->check(t,n);
+	buf+=sizeof(pupCheckRec);
+#endif
+	n*=itemSize; 
+  if (mode == PUPMode::HOST)
+  {
+    memcpy(p,(const void *)buf,n); 
+    buf+=n;
+  }
+  else
+  {
+#if CMK_CUDA || CMK_HIP
+    CmiAssert(n == 0 || gpuBuf != nullptr); // gpuBuf is null when the single-buffer constructor was used
+    hapiMemcpy(p, (const void *)gpuBuf, n, hapiMemcpyDeviceToDevice);
+    gpuBuf += n;
+#endif
+  }
+}
+
 void PUP::sizer::pup_buffer(void *&p,size_t n, size_t itemSize, dataType t) {
 #ifdef CK_CHECK_PUP
 	nBytes+=sizeof(pupCheckRec);
@@ -375,6 +443,23 @@ void PUP::toDisk::bytes(void *p,size_t n,size_t itemSize,dataType /*t*/)
   }
 }
 
+// Pack to file. HOST data goes through the ordinary 4-argument bytes(); for DEVICE data only the
+// int id returned by hapiCheckpoint() is written to F (p is assumed to be a device pointer).
+void PUP::toDisk::bytes(void *p, size_t n, size_t itemSize, dataType t, PUPMode mode)
+{
+  if (mode == PUPMode::HOST) {
+    bytes(p, n, itemSize, t);
+    return;
+  }
+#if CMK_CUDA || CMK_HIP
+  if (mode == PUPMode::DEVICE) {
+    int allocId = hapiCheckpoint(p, itemSize * n);
+    if (CmiFwrite(&allocId, sizeof(int), 1, F) != 1)
+      error = true;
+  }
+#endif
+}
+
 void PUP::toDisk::pup_buffer(void *&p,size_t n,size_t itemSize,dataType t) {
   bytes(p, n, itemSize, t);
   if(isDeleting()) free(p);
@@ -385,8 +470,24 @@ void PUP::toDisk::pup_buffer(void *&p,size_t n, size_t itemSize, dataType t, std
   if(isDeleting()) deallocate(p);
 }
 
-void PUP::fromDisk::bytes(void *p,size_t n,size_t itemSize,dataType /*t*/)
-{/* CkPrintf("reading %d bytes\n",itemSize*n); */ CmiFread(p,itemSize,n,F);}
+// Unpack from file: read n items of itemSize bytes each from F into p.
+// The data type tag is not consulted and the CmiFread result is not checked.
+void PUP::fromDisk::bytes(void *p, size_t n, size_t itemSize, dataType /*t*/)
+{ CmiFread(p, itemSize, n, F); }
+
+// Unpack from file with an explicit mode: HOST data is read directly; for DEVICE data an int id is read from F and handed to hapiRestore().
+void PUP::fromDisk::bytes(void *p,size_t n,size_t itemSize,dataType t, PUPMode mode)
+{
+  if (mode == PUPMode::HOST) {
+    bytes(p, n, itemSize, t);
+  } else if (mode == PUPMode::DEVICE) {
+#if CMK_CUDA || CMK_HIP
+    int allocId = 0;
+    // Restore only when the id was actually read; after a short read allocId would be meaningless.
+    if (CmiFread(&allocId,sizeof(int),1,F) == 1) hapiRestore(p, itemSize * n, allocId);
+#endif
+  }
+}
 
 void PUP::fromDisk::pup_buffer(void *&p,size_t n,size_t itemSize,dataType t) {
   if(isUnpacking()) p = malloc(n * itemSize);
diff --git a/tests/ampi/migration/Makefile b/tests/ampi/migration/Makefile
index 860fc1adb2..aa8e0d6b96 100644
--- a/tests/ampi/migration/Makefile
+++ b/tests/ampi/migration/Makefile
@@ -4,10 +4,10 @@ CHARMC=../../../bin/ampicxx $(OPTS)
 all: migration
 
 migration: test.o
-	$(CHARMC) -o migration test.o
+	$(CHARMC) -pieglobals -o migration test.o
 
 test.o: test.C
-	$(CHARMC) -c test.C
+	$(CHARMC) -pieglobals -c test.C
 
 #
 #