diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3c42b99f77..c0af100956 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,6 +10,7 @@ endmacro(add_psc_executable) if(NOT USE_CUDA) add_psc_executable(psc_bgk) + add_psc_executable(psc_shock) endif() add_psc_executable(psc_bubble_yz) @@ -19,16 +20,15 @@ add_psc_executable(psc_whistler) add_psc_executable(psc_harris_yz) add_psc_executable(psc_2d_shock) add_psc_executable(psc_radiation) -add_psc_executable(psc_shock) if(NOT USE_CUDA) install( - TARGETS psc_bgk + TARGETS psc_bgk psc_shock RUNTIME DESTINATION bin ) endif() install( - TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler psc_shock + TARGETS psc_bubble_yz psc_flatfoil_yz psc_whistler RUNTIME DESTINATION bin ) diff --git a/src/libpsc/cuda/cuda_base.cu b/src/libpsc/cuda/cuda_base.cu deleted file mode 100644 index 9d2e4a83bf..0000000000 --- a/src/libpsc/cuda/cuda_base.cu +++ /dev/null @@ -1,178 +0,0 @@ - -#include "PscConfig.h" -#include "cuda_base.cuh" - -#ifdef PSC_HAVE_RMM -#include -#include -#include -#include -#endif - -#include -#include -#include -#include - -std::size_t mem_particles; -std::size_t mem_randomize_sort; -std::size_t mem_sort_by_block; -std::size_t mem_bnd; -std::size_t mem_heating; -std::size_t mem_collisions; -std::size_t mem_bndp; -std::size_t mem_rnd; - -#ifdef PSC_HAVE_RMM -using device_mr_type = rmm::mr::device_memory_resource; -using pool_mr_type = rmm::mr::pool_memory_resource; -using track_mr_type = rmm::mr::tracking_resource_adaptor; -using log_mr_type = rmm::mr::logging_resource_adaptor; - -static track_mr_type* track_mr; -static pool_mr_type* pool_mr; -#endif - -void cuda_base_init(void) -{ - static bool first_time = true; - if (!first_time) - return; - - first_time = false; - -#ifdef PSC_HAVE_RMM - rmm::logger().set_level(spdlog::level::trace); - - device_mr_type* mr = - rmm::mr::get_current_device_resource(); // Points to `cuda_memory_resource` - static log_mr_type _log_mr{mr, std::cout, true}; - static pool_mr_type pool_mr{&_log_mr, 15000000000}; - static track_mr_type track_mr{&pool_mr}; -#if 0 - static log_mr_type log_mr{&track_mr, std::cout, true}; - rmm::mr::set_current_device_resource(&log_mr); -#else - rmm::mr::set_current_device_resource(&track_mr); -#endif - ::pool_mr = &pool_mr; - ::track_mr = &track_mr; -#endif - - int deviceCount; - cudaGetDeviceCount(&deviceCount); - - // This function call returns 0 if there are no CUDA capable devices. - if (deviceCount == 0) { - printf("There is no device supporting CUDA\n"); - return; - } - - get_rng_state().resize(131072); - - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank != 0) { - return; - } - - for (int dev = 0; dev < deviceCount; ++dev) { - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, dev); - - if (dev == 0) { - // This function call returns 9999 for both major & minor fields, if no - // CUDA capable devices are present - if (deviceProp.major == 9999 && deviceProp.minor == 9999) - printf("There is no device supporting CUDA.\n"); - else if (deviceCount == 1) - printf("There is 1 device supporting CUDA\n"); - else - printf("There are %d devices supporting CUDA\n", deviceCount); - } - printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); - printf(" CUDA Capability Major revision number: %d\n", - deviceProp.major); - printf(" CUDA Capability Minor revision number: %d\n", - deviceProp.minor); - printf(" Total amount of global memory: %lu bytes\n", - deviceProp.totalGlobalMem); -#if CUDART_VERSION >= 2000 - printf(" Number of multiprocessors: %d\n", - deviceProp.multiProcessorCount); - printf(" Number of cores: %d\n", - 8 * deviceProp.multiProcessorCount); -#endif - printf(" Total amount of constant memory: %lu bytes\n", - deviceProp.totalConstMem); - printf(" Total amount of shared memory per block: %lu bytes\n", - deviceProp.sharedMemPerBlock); - printf(" Total number of registers available per block: %d\n", - deviceProp.regsPerBlock); - printf(" Warp size: %d\n", - deviceProp.warpSize); - printf(" Maximum number of threads per block: %d\n", - deviceProp.maxThreadsPerBlock); - printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", - deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], - deviceProp.maxThreadsDim[2]); - printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", - deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], - deviceProp.maxGridSize[2]); - printf(" Maximum memory pitch: %lu bytes\n", - deviceProp.memPitch); - printf(" Texture alignment: %lu bytes\n", - deviceProp.textureAlignment); - printf(" Clock rate: %.2f GHz\n", - deviceProp.clockRate * 1e-6f); -#if CUDART_VERSION >= 2000 - printf(" Concurrent copy and execution: %s\n", - deviceProp.deviceOverlap ? "Yes" : "No"); -#endif -#if CUDART_VERSION >= 2020 - printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); - printf(" Integrated: %s\n", - deviceProp.integrated ? "Yes" : "No"); - printf(" Support host page-locked memory mapping: %s\n", - deviceProp.canMapHostMemory ? "Yes" : "No"); - printf( - " Compute mode: %s\n", - deviceProp.computeMode == cudaComputeModeDefault - ? "Default (multiple host threads can use this device simultaneously)" - : deviceProp.computeMode == cudaComputeModeExclusive - ? "Exclusive (only one host thread at a time can use this device)" - : deviceProp.computeMode == cudaComputeModeProhibited - ? "Prohibited (no host thread can use this device)" - : "Unknown"); -#endif - } -} - -std::size_t mem_cuda_allocated() -{ -#ifdef PSC_HAVE_RMM - if (track_mr) { - return track_mr->get_allocated_bytes(); - } else { - return 0; - } -#else - return 0; -#endif -} - -RngStateCuda& get_rng_state() -{ - static RngStateCuda rng_state; - return rng_state; -} - -void mem_pool_print() -{ -#if 0 // needs hacked RMM to make print() accessible - if (pool_mr) { - pool_mr->print(); - } -#endif -} diff --git a/src/libpsc/cuda/cuda_base.cxx b/src/libpsc/cuda/cuda_base.cxx index 93f28c6b71..50e54e619f 100644 --- a/src/libpsc/cuda/cuda_base.cxx +++ b/src/libpsc/cuda/cuda_base.cxx @@ -33,6 +33,49 @@ static track_mr_type* track_mr; static pool_mr_type* pool_mr; #endif +namespace +{ +struct DevicePropsCompat +{ + int clock_rate; + int concurrent_kernels; + int kernel_exec_timeout; + int integrated; + int can_map_host_memory; + int compute_mode; +}; + +int cuda_device_attr(int dev, cudaDeviceAttr attr) +{ + int value = 0; + cudaError_t ierr = cudaDeviceGetAttribute(&value, attr, dev); + if (ierr != cudaSuccess) { + return 0; + } + return value; +} + +DevicePropsCompat get_device_props_compat(const hipDeviceProp_t& deviceProp, + int dev) +{ +#if CUDART_VERSION >= 13000 + return {cuda_device_attr(dev, cudaDevAttrClockRate), + cuda_device_attr(dev, cudaDevAttrConcurrentKernels), + cuda_device_attr(dev, cudaDevAttrKernelExecTimeout), + cuda_device_attr(dev, cudaDevAttrIntegrated), + cuda_device_attr(dev, cudaDevAttrCanMapHostMemory), + cuda_device_attr(dev, cudaDevAttrComputeMode)}; +#else + return {deviceProp.clockRate, + deviceProp.deviceOverlap, + deviceProp.kernelExecTimeoutEnabled, + deviceProp.integrated, + deviceProp.canMapHostMemory, + deviceProp.computeMode}; +#endif +} +} // namespace + void cuda_base_init(void) { static bool first_time = true; @@ -79,6 +122,7 @@ void cuda_base_init(void) for (int dev = 0; dev < deviceCount; ++dev) { hipDeviceProp_t deviceProp; hipGetDeviceProperties(&deviceProp, dev); + auto compat = get_device_props_compat(deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no @@ -124,25 +168,25 @@ void cuda_base_init(void) printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment); printf(" Clock rate: %.2f GHz\n", - deviceProp.clockRate * 1e-6f); + compat.clock_rate * 1e-6f); #if CUDART_VERSION >= 2000 printf(" Concurrent copy and execution: %s\n", - deviceProp.deviceOverlap ? "Yes" : "No"); + compat.concurrent_kernels ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 printf(" Run time limit on kernels: %s\n", - deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); + compat.kernel_exec_timeout ? "Yes" : "No"); printf(" Integrated: %s\n", - deviceProp.integrated ? "Yes" : "No"); + compat.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", - deviceProp.canMapHostMemory ? "Yes" : "No"); + compat.can_map_host_memory ? "Yes" : "No"); printf( " Compute mode: %s\n", - deviceProp.computeMode == hipComputeModeDefault + compat.compute_mode == hipComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" - : deviceProp.computeMode == hipComputeModeExclusive + : compat.compute_mode == hipComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" - : deviceProp.computeMode == hipComputeModeProhibited + : compat.compute_mode == hipComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif diff --git a/src/libpsc/cuda/cuda_mparticles.hxx b/src/libpsc/cuda/cuda_mparticles.hxx index 38c4ba1289..59be06ae00 100644 --- a/src/libpsc/cuda/cuda_mparticles.hxx +++ b/src/libpsc/cuda/cuda_mparticles.hxx @@ -138,12 +138,12 @@ public: __host__ __device__ iterator begin() { - return iterator({xi4.begin(), pxi4.begin()}); + return iterator(thrust::make_tuple(xi4.begin(), pxi4.begin())); } __host__ __device__ iterator end() { - return iterator({xi4.end(), pxi4.end()}); + return iterator(thrust::make_tuple(xi4.end(), pxi4.end())); } __host__ void resize(size_t n) diff --git a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx index bb0492c16c..73ab4859ff 100644 --- a/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx +++ b/src/libpsc/psc_output_fields/fields_item_moments_1st.hxx @@ -1,6 +1,7 @@ #pragma once +#include #include #include "fields_item.hxx" diff --git a/src/libpsc/psc_push_fields/marder_impl.hxx b/src/libpsc/psc_push_fields/marder_impl.hxx index 2dc4fb50a8..a65dc1df9d 100644 --- a/src/libpsc/psc_push_fields/marder_impl.hxx +++ b/src/libpsc/psc_push_fields/marder_impl.hxx @@ -64,7 +64,7 @@ inline void correct(const Grid_t& grid, E1& efield, const Int3& efield_ib, template inline void cuda_marder_correct_yz(E1& efield, E2& res, Float3 fac, Int3 l, - Int3 r, Int3 l, Int3 r) + Int3 r) { auto k_efield = efield.to_kernel(); auto k_res = res.to_kernel();