diff --git a/CMakeLists.txt b/CMakeLists.txt index 1af795e12f4..20a99e2cb5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,9 +263,9 @@ option(CUDAQ_ENABLE_SANITIZERS "Enable Address Sanitizer (ASan) and Undefined Be # Sub-projects that can be enabled independently, in the style of # LLVM_ENABLE_PROJECTS. Accepts a semicolon-separated list. -set(CUDAQ_ALL_PROJECTS "cudaq;runtime;python") +set(CUDAQ_ALL_PROJECTS "cudaq;runtime;python;realtime") set(CUDAQ_ENABLE_PROJECTS "" CACHE STRING - "Semicolon-separated list of CUDA-Q sub-projects to build. Valid values: ${CUDAQ_ALL_PROJECTS}. Defaults to 'cudaq;runtime' (plus 'python' if CUDAQ_ENABLE_PYTHON is set).") + "Semicolon-separated list of CUDA-Q sub-projects to build. Valid values: ${CUDAQ_ALL_PROJECTS}. Defaults to 'cudaq;runtime' (plus 'python' if CUDAQ_ENABLE_PYTHON is set). Include 'realtime' to build source-tree realtime; set CUDAQ_REALTIME_DIR to use an installed realtime package.") # Back-compat: if CUDAQ_ENABLE_PROJECTS wasn't given, derive it from the # legacy per-project flags so existing build scripts keep working, and warn @@ -824,6 +824,13 @@ if (CUDAQ_REALTIME_DIR) "CUDAQ_REALTIME_DIR requires CUDA support, but CUDA was not found.") endif() + if ("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS) + message(WARNING + "Both CUDAQ_REALTIME_DIR and 'realtime' in CUDAQ_ENABLE_PROJECTS were provided. " + "Using CUDAQ_REALTIME_DIR=${CUDAQ_REALTIME_DIR} and skipping the source-tree realtime build.") + list(REMOVE_ITEM CUDAQ_ENABLE_PROJECTS "realtime") + endif() + find_package(cudaq-realtime CONFIG REQUIRED PATHS "${CUDAQ_REALTIME_DIR}" NO_DEFAULT_PATH) @@ -841,9 +848,17 @@ if (CUDAQ_REALTIME_DIR) set(CUDAQ_ENABLE_REALTIME TRUE) message(STATUS "CUDA-Q realtime integration enabled from: ${CUDAQ_REALTIME_DIR}") +elseif ("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS) + if (NOT CUDA_FOUND) + message(FATAL_ERROR + "The \"realtime\" project requires CUDA support, but CUDA was not found.") + endif() + + set(CUDAQ_ENABLE_REALTIME TRUE) + message(STATUS "CUDA-Q realtime integration enabled from the source tree.") else() message(STATUS - "CUDA-Q realtime integration disabled. Set CUDAQ_REALTIME_DIR to an installed cudaq-realtime prefix to enable it.") + "CUDA-Q realtime integration disabled. Include \"realtime\" in CUDAQ_ENABLE_PROJECTS to enable it.") endif() # Code coverage setup @@ -868,6 +883,9 @@ endif() if("cudaq" IN_LIST CUDAQ_ENABLE_PROJECTS) add_subdirectory(cudaq) endif() +if("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS) + add_subdirectory(realtime) +endif() if("runtime" IN_LIST CUDAQ_ENABLE_PROJECTS) add_subdirectory(runtime) endif() diff --git a/cmake/modules/CUDAQConfig.cmake b/cmake/modules/CUDAQConfig.cmake index 5891c164403..90240e8b164 100644 --- a/cmake/modules/CUDAQConfig.cmake +++ b/cmake/modules/CUDAQConfig.cmake @@ -42,7 +42,9 @@ if (CUDAQ_REALTIME_DIR) else() # Do not use find_dependency here: it inherits find_package(CUDAQ REQUIRED) # and would make realtime mandatory for CUDA-Q installs that do not use it. - find_package(cudaq-realtime CONFIG QUIET) + find_package(cudaq-realtime CONFIG QUIET + PATHS "${CUDAQ_CMAKE_DIR}/../cudaq-realtime" + NO_DEFAULT_PATH) endif() get_filename_component(PARENT_DIRECTORY ${CUDAQ_CMAKE_DIR} DIRECTORY) diff --git a/cudaq/test/CMakeLists.txt b/cudaq/test/CMakeLists.txt index 4708ed752e1..97be10a5402 100644 --- a/cudaq/test/CMakeLists.txt +++ b/cudaq/test/CMakeLists.txt @@ -82,14 +82,14 @@ if (CUDA_FOUND AND CUDAQ_ENABLE_REALTIME AND NOT CUDAQ_DISABLE_CPP_FRONTEND AND cudaq-device-call-runtime cudaq-device-call-runtime-headers cudaq::cudaq-realtime-dispatch - CUDA::cudart) + CUDA::cudart_static) target_compile_options(${target} PRIVATE $<$:--expt-relaxed-constexpr>) set_target_properties(${target} PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON - CUDA_RUNTIME_LIBRARY Shared + CUDA_RUNTIME_LIBRARY Static CUDA_STANDARD 20 CXX_STANDARD 20 LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/cudaq/test/lit.site.cfg.py.in b/cudaq/test/lit.site.cfg.py.in index a68f44d40f5..a4601412c8b 100644 --- a/cudaq/test/lit.site.cfg.py.in +++ b/cudaq/test/lit.site.cfg.py.in @@ -44,7 +44,9 @@ config.cuda_found = "@CUDA_FOUND@" if cmake_boolvar_to_bool(config.cuda_found): config.available_features.add('nvcc') +config.cuda_gpu_available = os.system("nvidia-smi >/dev/null 2>&1") == 0 if (cmake_boolvar_to_bool(config.cuda_found) and + config.cuda_gpu_available and config.cudaq_device_call_realtime_libs and all(os.path.exists(lib) for lib in config.cudaq_device_call_realtime_libs)): diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt index ef771c93271..7b6e3242005 100644 --- a/realtime/CMakeLists.txt +++ b/realtime/CMakeLists.txt @@ -10,21 +10,25 @@ cmake_minimum_required(VERSION 3.22 FATAL_ERROR) include(FetchContent) -# Set a default build type if none was specified. Must set this before -# project(). -set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") +string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}" CUDAQ_REALTIME_STANDALONE_BUILD) -# Set a default install prefix if none was specified. -set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING - "Install path prefix, prepended onto install directories") +if(CUDAQ_REALTIME_STANDALONE_BUILD) + # Set a default build type if none was specified. Must set this before + # project(). + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel") + + # Set a default install prefix if none was specified. + set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING + "Install path prefix, prepended onto install directories") +endif() # Project setup # ============================================================================== -# Check if core is built as a standalone project. -project(cudaq-realtime) -set(CUDAQ_REALTIME_STANDALONE_BUILD TRUE) +if(CUDAQ_REALTIME_STANDALONE_BUILD) + project(cudaq-realtime) +endif() include(GNUInstallDirs) include(CMakePackageConfigHelpers) @@ -45,71 +49,107 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # Options # ============================================================================== +set(_cudaq_realtime_build_tests_default ON) +if(NOT CUDAQ_REALTIME_STANDALONE_BUILD) + set(_cudaq_realtime_build_tests_default OFF) +endif() + option(CUDAQ_REALTIME_BUILD_TESTS - "Generate build targets for the CUDAQ real-time unit tests" ON) + "Generate build targets for the CUDAQ real-time unit tests" + ${_cudaq_realtime_build_tests_default}) option(CUDAQ_REALTIME_BUILD_EXAMPLES "Generate build targets for the CUDAQ real-time example programs" ON) option(CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS "Build Hololink bridge/emulator/playback tools (requires hololink)." OFF) +set(_host_compiler_opts_list "") +if(CUDAQ_REALTIME_STANDALONE_BUILD) + list(APPEND _host_compiler_opts_list "-fPIC") +endif() + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64" AND + CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13) + # nvcc cannot parse the GCC BF16 intrinsic headers exposed through + # on the CI toolchains. This only applies from GCC 13: + # GCC 13 switched the AVX512BF16 intrinsics to the `__bf16` builtin type + # (GCC 12 still used the parseable `__bfloat16` typedef) and introduced + # , which does not exist on GCC 12. + list(APPEND _host_compiler_opts_list + "-D_AVX512BF16INTRIN_H_INCLUDED" + "-D_AVX512BF16VLINTRIN_H_INCLUDED" + "-D_AVXNECONVERTINTRIN_H_INCLUDED") + endif() + + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + # -mno-amx-tile alone is insufficient: the AMX intrinsic headers use + # #pragma GCC target("amx-*") which re-enables the builtins that nvcc + # cannot parse. Pre-define the include guards to skip those headers. + list(APPEND _host_compiler_opts_list + "-mno-amx-tile" + "-D_AMXTILEINTRIN_H_INCLUDED" + "-D_AMXBF16INTRIN_H_INCLUDED" + "-D_AMXINT8INTRIN_H_INCLUDED" + "-D_AMXFP16INTRIN_H_INCLUDED" + "-D_AMXCOMPLEXINTRIN_H_INCLUDED") + endif() +endif() + +list(JOIN _host_compiler_opts_list "," _host_compiler_opts) + # Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt) # ============================================================================== -include(CheckLanguage) -check_language(CUDA) -set(CUDA_FOUND FALSE) -# Generate -gencode arch=compute_XX,code=sm_XX for list of supported -# arch values. -# List should be sorted in increasing order. -function(CUDA_get_gencode_args out_args_string arch_values) - # allow the user to pass the list like a normal variable - set(arch_list ${arch_values} ${ARGN}) - set(out "") - foreach(arch IN LISTS arch_list) - set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}") - endforeach(arch) - - # Repeat the last one as to ensure the generation of PTX for most - # recent virtual architecture for forward compatibility - list(GET arch_list -1 last_arch) - set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}") - set(${out_args_string} ${out} PARENT_SCOPE) -endfunction() - -if(CMAKE_CUDA_COMPILER) - if (NOT CUDA_TARGET_ARCHS) - if (CUDAToolkit_VERSION VERSION_LESS 13.0) - # Ampere, Hopper - set(CUDA_TARGET_ARCHS "80;90") - else() - # Ampere, Hopper, Blackwell - set(CUDA_TARGET_ARCHS "80;90;100") +if(CUDAQ_REALTIME_STANDALONE_BUILD) + include(CheckLanguage) + check_language(CUDA) + set(CUDA_FOUND FALSE) + # Generate -gencode arch=compute_XX,code=sm_XX for list of supported + # arch values. + # List should be sorted in increasing order. + function(CUDA_get_gencode_args out_args_string arch_values) + # allow the user to pass the list like a normal variable + set(arch_list ${arch_values} ${ARGN}) + set(out "") + foreach(arch IN LISTS arch_list) + set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}") + endforeach(arch) + + # Repeat the last one as to ensure the generation of PTX for most + # recent virtual architecture for forward compatibility + list(GET arch_list -1 last_arch) + set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}") + set(${out_args_string} ${out} PARENT_SCOPE) + endfunction() + + if(CMAKE_CUDA_COMPILER) + find_package(CUDAToolkit REQUIRED) + if (NOT CUDA_TARGET_ARCHS) + if (CUDAToolkit_VERSION VERSION_LESS 13.0) + # Ampere, Hopper + set(CUDA_TARGET_ARCHS "80;90") + else() + # Ampere, Hopper, Blackwell + set(CUDA_TARGET_ARCHS "80;90;100") + endif() endif() + CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) + # Keep realtime CUDA sources on C++17. nvcc's C++20 frontend can hit an + # internal compiler error in libstdc++ headers when compiling the standalone + # realtime tests, including test_host_dispatcher.cu, on the CI toolchain. + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options ${_host_compiler_opts}") + + enable_language(CUDA) + set(CUDA_FOUND TRUE) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) + message(STATUS "Cuda language found.") endif() - CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS}) - set(_host_compiler_opts "-fPIC") - if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64") - # -mno-amx-tile alone is insufficient with GCC 13+: the AMX intrinsic - # headers use #pragma GCC target("amx-tile") which re-enables the - # builtins that nvcc cannot parse. Pre-define the include guards to - # prevent these headers from being processed at all. - string(APPEND _host_compiler_opts ",-mno-amx-tile") - foreach(_amx_guard _AMXTILEINTRIN_H_INCLUDED - _AMXBF16INTRIN_H_INCLUDED - _AMXINT8INTRIN_H_INCLUDED - _AMXFP16INTRIN_H_INCLUDED - _AMXCOMPLEXINTRIN_H_INCLUDED) - string(APPEND _host_compiler_opts ",-D${_amx_guard}") - endforeach() - endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options ${_host_compiler_opts}") - - enable_language(CUDA) - set(CUDA_FOUND TRUE) - set(CMAKE_CUDA_STANDARD 17) - set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) +elseif(CUDA_FOUND) find_package(CUDAToolkit REQUIRED) - message(STATUS "Cuda language found.") + if(_host_compiler_opts) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options ${_host_compiler_opts}") + endif() endif() # External Dependencies @@ -117,9 +157,12 @@ endif() find_package(Threads REQUIRED) -# Enable static linking of the C++ standard library to avoid dependency issues when distributing the library. -SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++ -static-libgcc") -SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc") +if(CUDAQ_REALTIME_STANDALONE_BUILD) + # Standalone realtime owns its distribution linker policy. Integrated CUDA-Q + # builds inherit the top-level CUDA-Q runtime/dependency link settings. + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++ -static-libgcc") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc") +endif() add_subdirectory(lib) diff --git a/realtime/lib/cpu_transport/CMakeLists.txt b/realtime/lib/cpu_transport/CMakeLists.txt index d321a894ba7..2b9a25a6834 100644 --- a/realtime/lib/cpu_transport/CMakeLists.txt +++ b/realtime/lib/cpu_transport/CMakeLists.txt @@ -33,6 +33,9 @@ add_library(cudaq-realtime-cpu-transport STATIC roce_transceiver.cpp # class CpuRoceTransceiver: ibv setup, RX/TX threads, ring lifecycle roce_wrapper.cpp # extern-C shim used by bridge tools and CpuRoceChannel ) +# Source-tree CUDA-Q integration uses the same cudaq:: target names +# that installed exports provide. +add_library(cudaq::cudaq-realtime-cpu-transport ALIAS cudaq-realtime-cpu-transport) target_include_directories(cudaq-realtime-cpu-transport PUBLIC diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt index 5df1470dd91..b54d848ee7f 100644 --- a/realtime/lib/daemon/CMakeLists.txt +++ b/realtime/lib/daemon/CMakeLists.txt @@ -20,6 +20,9 @@ if(CUDA_FOUND) ) add_library(cudaq-realtime SHARED ${CUDAQ_REALTIME_SOURCES}) + # Source-tree CUDA-Q integration uses the same cudaq:: target names + # that installed exports provide. + add_library(cudaq::cudaq-realtime ALIAS cudaq-realtime) target_include_directories(cudaq-realtime PUBLIC @@ -54,6 +57,9 @@ if(CUDA_FOUND) add_library(cudaq-realtime-dispatch STATIC dispatcher/dispatch_kernel.cu ) + # Source-tree CUDA-Q integration uses the same cudaq:: target names + # that installed exports provide. + add_library(cudaq::cudaq-realtime-dispatch ALIAS cudaq-realtime-dispatch) set_target_properties(cudaq-realtime-dispatch PROPERTIES LINK_FLAGS_RELEASE "-Wl,--exclude-libs=ALL") @@ -97,6 +103,9 @@ if(CUDA_FOUND) dispatcher/host_dispatcher.cu dispatcher/host_dispatcher_capi.cu ) + # Source-tree CUDA-Q integration uses the same cudaq:: target names + # that installed exports provide. + add_library(cudaq::cudaq-realtime-host-dispatch ALIAS cudaq-realtime-host-dispatch) target_include_directories(cudaq-realtime-host-dispatch PUBLIC diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt index 1cd14db74f6..08f73b09166 100644 --- a/realtime/unittests/utils/CMakeLists.txt +++ b/realtime/unittests/utils/CMakeLists.txt @@ -165,9 +165,10 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND message(STATUS "Building hololink_bridge (generic increment)") message(STATUS " GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}") - # Hololink wrapper static library (compiled by g++, isolates fmt) + # Hololink wrapper static library (compiled by g++, isolates fmt). + # Use the realtime source root so this path works when nested in CUDA-Q. add_library(hololink_wrapper_generic STATIC - ${CMAKE_SOURCE_DIR}/lib/daemon/bridge/hololink/hololink_wrapper.cpp) + ${CUDAQ_REALTIME_SOURCE_DIR}/lib/daemon/bridge/hololink/hololink_wrapper.cpp) target_include_directories(hololink_wrapper_generic PRIVATE diff --git a/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h b/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h index 1683dd64d22..22782a3c5ed 100644 --- a/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h +++ b/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h @@ -167,5 +167,12 @@ using DeviceCallServicePluginInfoFn = DeviceCallServicePluginInfo (*)(); // Default service discovery entry point. Service artifacts may also expose // suffixed variants with the same signature for tests or multi-service // deployments, e.g. cudaqGetDeviceCallServicePluginInfo_. +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo cudaqGetDeviceCallServicePluginInfo(); +#if defined(__clang__) +#pragma clang diagnostic pop +#endif diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index c9ca33e7d32..8b80a1b54ee 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -150,6 +150,7 @@ create_tests_with_backend(stim backends/StimTester.cpp) if (CUDA_FOUND AND CUDAQ_ENABLE_REALTIME) add_executable(test_device_call_dispatch + device_call/DeviceCallDispatchTester.cpp device_call/DeviceCallDispatchTester.cu) set_target_properties(test_device_call_dispatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON diff --git a/unittests/device_call/DeviceCallDispatchTester.cpp b/unittests/device_call/DeviceCallDispatchTester.cpp new file mode 100644 index 00000000000..695a32358d9 --- /dev/null +++ b/unittests/device_call/DeviceCallDispatchTester.cpp @@ -0,0 +1,401 @@ +/******************************************************************************* + * Copyright (c) 2026 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq_internal/device_call/DeviceCallError.h" +#include "cudaq_internal/device_call/DeviceCallService.h" +#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" +#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" +#include +#include + +#include +#include +#include + +namespace cudaq_internal::device_call { +void initializeDeviceCallRuntime(int argc, char **argv); +void finalizeDeviceCallRuntime(); + +namespace test { +bool createGraphAddThem(void **dMailbox, cudaGraph_t *graphOut, + cudaGraphExec_t *execOut); +void fillHostGraphAddEntry(cudaq_function_entry_t &entry, + cudaGraphExec_t graphExec); +int populateAddThemTable(cudaq_function_entry_t *entries, bool useOffset, + cudaStream_t stream); +} // namespace test +} // namespace cudaq_internal::device_call + +extern "C" std::int32_t __cudaq_device_call_acquire_realtime_frame( + std::uint32_t deviceId, std::uint32_t functionId, + std::uint64_t requestBytes, std::uint64_t responseCapacity, + void **frameHandle, void **requestPayload, void **responsePayload); +extern "C" std::int32_t +__cudaq_device_call_dispatch_realtime_frame(void *frameHandle, + std::uint64_t *responseBytes); +extern "C" void +__cudaq_device_call_safely_release_realtime_frame(void *frameHandle); + +namespace { + +using namespace cudaq_internal::device_call; + +constexpr std::uint32_t AddThemFunctionId = + cudaq::realtime::fnv1a_hash("addThem"); +constexpr std::uint32_t GraphAddThemFunctionId = + cudaq::realtime::fnv1a_hash("graphAddThem"); +constexpr std::int32_t DeviceCallSuccessStatus = + toAbiStatus(DeviceCallStatus::Success); +constexpr std::int32_t DeviceCallInvalidArgumentStatus = + toAbiStatus(DeviceCallStatus::InvalidArgument); +constexpr std::int32_t DeviceCallNotInitializedStatus = + toAbiStatus(DeviceCallStatus::NotInitialized); +constexpr std::int32_t DeviceCallResponseTooLargeStatus = + toAbiStatus(DeviceCallStatus::ResponseTooLarge); + +// A payload buffer must be non-null whenever its declared length is nonzero; +// a zero-length buffer is allowed to be null. +constexpr bool isValidBuffer(const void *buffer, std::uint64_t length) { + return length == 0 || buffer != nullptr; +} + +std::int32_t dispatchUsingFrameLease(std::uint32_t deviceId, + std::uint32_t functionId, + const void *request, + std::uint64_t requestLen, void *response, + std::uint64_t responseCapacity, + std::uint64_t *responseLen) { + if (!isValidBuffer(request, requestLen) || !responseLen || + !isValidBuffer(response, responseCapacity)) + return DeviceCallInvalidArgumentStatus; + + void *frame = nullptr; + void *requestPayload = nullptr; + void *responsePayload = nullptr; + std::int32_t status = __cudaq_device_call_acquire_realtime_frame( + deviceId, functionId, requestLen, responseCapacity, &frame, + &requestPayload, &responsePayload); + if (status != DeviceCallSuccessStatus) + return status; + if (!isValidBuffer(requestPayload, requestLen) || + !isValidBuffer(responsePayload, responseCapacity)) { + __cudaq_device_call_safely_release_realtime_frame(frame); + return DeviceCallInvalidArgumentStatus; + } + + if (requestLen > 0) + std::memcpy(requestPayload, request, requestLen); + + status = __cudaq_device_call_dispatch_realtime_frame(frame, responseLen); + if (status == DeviceCallSuccessStatus && *responseLen > responseCapacity) + status = DeviceCallResponseTooLargeStatus; + if (status == DeviceCallSuccessStatus && *responseLen > 0) + std::memcpy(response, responsePayload, *responseLen); + + __cudaq_device_call_safely_release_realtime_frame(frame); + return status; +} + +enum class TestGpuTable { AddThem, AddThemOffset }; + +TestGpuTable selectedGpuTable = TestGpuTable::AddThem; + +class TestRealtimeService : public DeviceCallService { +public: + int create(const void *, std::size_t) override { return 0; } + + int destroy() noexcept override { + teardownHostDispatch(); + return 0; + } + + std::uint32_t getFunctionCount() const override { return 1; } + + int populateTable(cudaq_function_entry_t *entries, std::uint32_t capacity, + cudaStream_t stream) override { + if (!entries || capacity < 1) + return 1; + return test::populateAddThemTable( + entries, selectedGpuTable == TestGpuTable::AddThemOffset, stream); + } + + cudaq_dispatch_launch_fn_t getDeviceDispatchLaunch() const override { + return cudaq_launch_dispatch_kernel_regular; + } + + int getHostDispatchTable(DeviceCallHostDispatchTable &table) override { + if (setupHostDispatch() != 0) + return 1; + table.entries = hostEntries.data(); + table.count = static_cast(hostEntries.size()); + table.deviceId = 0; + table.mailbox = h_mailbox; + return 0; + } + + int stop() noexcept override { + teardownHostDispatch(); + return 0; + } + +private: + int setupHostDispatch() { + if (h_mailbox && graphExec) + return 0; + + if (cudaHostAlloc(&h_mailbox, sizeof(void *), cudaHostAllocMapped) != + cudaSuccess) + return 1; + std::memset(h_mailbox, 0, sizeof(void *)); + if (cudaHostGetDevicePointer(reinterpret_cast(&d_mailbox), + h_mailbox, 0) != cudaSuccess) { + teardownHostDispatch(); + return 1; + } + if (!test::createGraphAddThem(d_mailbox, &graph, &graphExec)) { + teardownHostDispatch(); + return 1; + } + + test::fillHostGraphAddEntry(hostEntries[0], graphExec); + return 0; + } + + void teardownHostDispatch() noexcept { + if (graphExec) + cudaGraphExecDestroy(graphExec); + if (graph) + cudaGraphDestroy(graph); + if (h_mailbox) + cudaFreeHost(h_mailbox); + + graphExec = nullptr; + graph = nullptr; + h_mailbox = nullptr; + d_mailbox = nullptr; + hostEntries = {}; + } + + void **h_mailbox = nullptr; + void **d_mailbox = nullptr; + cudaGraph_t graph = nullptr; + cudaGraphExec_t graphExec = nullptr; + std::array hostEntries{}; +}; + +DeviceCallService *getTestRealtimeService() { + static TestRealtimeService service; + return &service; +} + +} // namespace + +extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo +cudaqGetDeviceCallServicePluginInfo() { + return {"test-device-call", &getTestRealtimeService}; +} + +namespace { + +void initializeGpuRuntime(TestGpuTable table = TestGpuTable::AddThem) { + selectedGpuTable = table; + char program[] = "test_device_call_dispatch"; + char *argv[] = {program}; + cudaq_internal::device_call::initializeDeviceCallRuntime(1, argv); +} + +void initializeHostRuntime() { + char program[] = "test_device_call_dispatch"; + char option[] = "--cudaq-device-call=host-dispatch"; + char *argv[] = {program, option}; + cudaq_internal::device_call::initializeDeviceCallRuntime(2, argv); +} + +void finalizeRuntime() { + cudaq_internal::device_call::finalizeDeviceCallRuntime(); +} + +class DeviceCallDispatchTest : public ::testing::Test { +protected: + void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); } + + void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); } +}; + +TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandler) { + std::array request{}; + auto *const args = request.data(); + args[0] = 19; + args[1] = 23; + + std::int32_t response = 0; + std::uint64_t responseLen = 0; + ASSERT_EQ(0, + dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), + request.size() * sizeof(request[0]), + &response, sizeof(response), &responseLen)); + EXPECT_EQ(sizeof(response), responseLen); + EXPECT_EQ(42, response); +} + +TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandlerThroughFrameLease) { + void *frame = nullptr; + void *requestPayload = nullptr; + void *responsePayload = nullptr; + ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( + 0, AddThemFunctionId, 2 * sizeof(std::int32_t), + sizeof(std::int32_t), &frame, &requestPayload, + &responsePayload)); + ASSERT_NE(nullptr, frame); + ASSERT_NE(nullptr, requestPayload); + ASSERT_NE(nullptr, responsePayload); + + auto *const args = static_cast(requestPayload); + args[0] = 19; + args[1] = 23; + + std::uint64_t responseLen = 0; + ASSERT_EQ(0, + __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); + EXPECT_EQ(sizeof(std::int32_t), responseLen); + EXPECT_EQ(42, *static_cast(responsePayload)); + + __cudaq_device_call_safely_release_realtime_frame(frame); +} + +TEST_F(DeviceCallDispatchTest, DispatchesVoidFireAndForgetThroughFrameLease) { + void *frame = nullptr; + void *requestPayload = nullptr; + void *responsePayload = nullptr; + ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( + 0, AddThemFunctionId, 2 * sizeof(std::int32_t), 0, &frame, + &requestPayload, &responsePayload)); + ASSERT_NE(nullptr, frame); + ASSERT_NE(nullptr, requestPayload); + EXPECT_EQ(nullptr, responsePayload); + + auto *args = static_cast(requestPayload); + args[0] = 19; + args[1] = 23; + + std::uint64_t responseLen = 123; + ASSERT_EQ(0, + __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); + EXPECT_EQ(0u, responseLen); + + __cudaq_device_call_safely_release_realtime_frame(frame); + + for (int i = 0; i < 2; ++i) { + frame = nullptr; + requestPayload = nullptr; + responsePayload = nullptr; + ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( + 0, AddThemFunctionId, 2 * sizeof(std::int32_t), + sizeof(std::int32_t), &frame, &requestPayload, + &responsePayload)); + args = static_cast(requestPayload); + args[0] = 19; + args[1] = 23; + + responseLen = 0; + ASSERT_EQ(0, + __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); + EXPECT_EQ(sizeof(std::int32_t), responseLen); + EXPECT_EQ(42, *static_cast(responsePayload)); + + __cudaq_device_call_safely_release_realtime_frame(frame); + } +} + +TEST_F(DeviceCallDispatchTest, ReinitializesThroughDiscoveredPlugin) { + ASSERT_NO_THROW(finalizeRuntime()); + ASSERT_NO_THROW(initializeGpuRuntime(TestGpuTable::AddThemOffset)); + + std::array request{19, 23}; + std::int32_t response = 0; + std::uint64_t responseLen = 0; + ASSERT_EQ(0, + dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), + request.size() * sizeof(request[0]), + &response, sizeof(response), &responseLen)); + EXPECT_EQ(sizeof(response), responseLen); + EXPECT_EQ(142, response); +} + +class HostGraphDispatchFrameTest : public ::testing::Test { +protected: + void SetUp() override { ASSERT_NO_THROW(initializeHostRuntime()); } + + void TearDown() override { + if (frame) + __cudaq_device_call_safely_release_realtime_frame(frame); + ASSERT_NO_THROW(finalizeRuntime()); + } + + void *frame = nullptr; +}; + +TEST_F(HostGraphDispatchFrameTest, DispatchesGraphLaunchThroughFrameLease) { + void *requestPayload = nullptr; + void *responsePayload = nullptr; + ASSERT_EQ(DeviceCallSuccessStatus, + __cudaq_device_call_acquire_realtime_frame( + 0, GraphAddThemFunctionId, 2 * sizeof(std::int32_t), + sizeof(std::int32_t), &frame, &requestPayload, + &responsePayload)); + ASSERT_NE(nullptr, frame); + ASSERT_NE(nullptr, requestPayload); + ASSERT_NE(nullptr, responsePayload); + + auto *const args = static_cast(requestPayload); + args[0] = 19; + args[1] = 23; + + std::uint64_t responseLen = 0; + ASSERT_EQ(DeviceCallSuccessStatus, + __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); + EXPECT_EQ(sizeof(std::int32_t), responseLen); + EXPECT_EQ(42, *static_cast(responsePayload)); + + __cudaq_device_call_safely_release_realtime_frame(frame); + frame = nullptr; +} + +class DeviceCallServicePluginTest : public ::testing::Test { +protected: + void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); } + + void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); } +}; + +TEST_F(DeviceCallServicePluginTest, DispatchesThroughDiscoveredPlugin) { + std::array request{19, 23}; + std::int32_t response = 0; + std::uint64_t responseLen = 0; + + ASSERT_EQ(0, + dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), + request.size() * sizeof(request[0]), + &response, sizeof(response), &responseLen)); + EXPECT_EQ(sizeof(response), responseLen); + EXPECT_EQ(42, response); +} + +TEST_F(DeviceCallServicePluginTest, FinalizeClearsPluginSession) { + ASSERT_NO_THROW(finalizeRuntime()); + + std::array request{19, 23}; + std::int32_t response = 0; + std::uint64_t responseLen = 0; + EXPECT_EQ(DeviceCallNotInitializedStatus, + dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), + request.size() * sizeof(request[0]), + &response, sizeof(response), &responseLen)); +} + +} // namespace diff --git a/unittests/device_call/DeviceCallDispatchTester.cu b/unittests/device_call/DeviceCallDispatchTester.cu index 28846c0402b..e07cb8fc39b 100644 --- a/unittests/device_call/DeviceCallDispatchTester.cu +++ b/unittests/device_call/DeviceCallDispatchTester.cu @@ -6,56 +6,19 @@ * the terms of the Apache License 2.0 which accompanies this distribution. * ******************************************************************************/ -#include "cudaq_internal/device_call/DeviceCallError.h" -#include "cudaq_internal/device_call/DeviceCallService.h" #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h" #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h" #include -#include -#include #include -#include -namespace cudaq_internal::device_call { -void initializeDeviceCallRuntime(int argc, char **argv); -void finalizeDeviceCallRuntime(); -} // namespace cudaq_internal::device_call -extern "C" std::int32_t __cudaq_device_call_acquire_realtime_frame( - std::uint32_t deviceId, std::uint32_t functionId, - std::uint64_t requestBytes, std::uint64_t responseCapacity, - void **frameHandle, void **requestPayload, void **responsePayload); -extern "C" std::int32_t -__cudaq_device_call_dispatch_realtime_frame(void *frameHandle, - std::uint64_t *responseBytes); -extern "C" void -__cudaq_device_call_safely_release_realtime_frame(void *frameHandle); - -namespace { - -using namespace cudaq_internal::device_call; +namespace cudaq_internal::device_call::test { constexpr std::uint32_t AddThemFunctionId = cudaq::realtime::fnv1a_hash("addThem"); constexpr std::uint32_t GraphAddThemFunctionId = cudaq::realtime::fnv1a_hash("graphAddThem"); -constexpr std::int32_t DeviceCallSuccessStatus = - toAbiStatus(DeviceCallStatus::Success); -constexpr std::int32_t DeviceCallInvalidArgumentStatus = - toAbiStatus(DeviceCallStatus::InvalidArgument); -constexpr std::int32_t DeviceCallNotInitializedStatus = - toAbiStatus(DeviceCallStatus::NotInitialized); -constexpr std::int32_t DeviceCallResponseTooLargeStatus = - toAbiStatus(DeviceCallStatus::ResponseTooLarge); - -#define ASSERT_CUDA_SUCCESS(expr) \ - do { \ - const cudaError_t err = (expr); \ - ASSERT_EQ(cudaSuccess, err) \ - << #expr << " failed at " << __FILE__ << ":" << __LINE__ << ": " \ - << cudaGetErrorString(err); \ - } while (false) __device__ int addThemHandler(const void *input, void *output, std::uint32_t argLen, std::uint32_t maxResultLen, @@ -127,13 +90,13 @@ __global__ void graphAddThemHandler(void **mailboxSlotPtr) { *ioContext->tx_flag = ioContext->tx_flag_value; } -bool createGraphAddThem(void **d_mailbox, cudaGraph_t *graphOut, +bool createGraphAddThem(void **dMailbox, cudaGraph_t *graphOut, cudaGraphExec_t *execOut) { cudaGraph_t graph = nullptr; if (cudaGraphCreate(&graph, 0) != cudaSuccess) return false; - void *kernelArgs[] = {&d_mailbox}; + void *kernelArgs[] = {&dMailbox}; const cudaKernelNodeParams params = [&] { cudaKernelNodeParams result{}; result.func = reinterpret_cast(graphAddThemHandler); @@ -161,44 +124,6 @@ bool createGraphAddThem(void **d_mailbox, cudaGraph_t *graphOut, return true; } -std::int32_t dispatchUsingFrameLease(std::uint32_t deviceId, - std::uint32_t functionId, - const void *request, - std::uint64_t requestLen, void *response, - std::uint64_t responseCapacity, - std::uint64_t *responseLen) { - if ((requestLen > 0 && !request) || !responseLen) - return DeviceCallInvalidArgumentStatus; - if (responseCapacity > 0 && !response) - return DeviceCallInvalidArgumentStatus; - - void *frame = nullptr; - void *requestPayload = nullptr; - void *responsePayload = nullptr; - std::int32_t status = __cudaq_device_call_acquire_realtime_frame( - deviceId, functionId, requestLen, responseCapacity, &frame, - &requestPayload, &responsePayload); - if (status != DeviceCallSuccessStatus) - return status; - if ((requestLen > 0 && !requestPayload) || - (responseCapacity > 0 && !responsePayload)) { - __cudaq_device_call_safely_release_realtime_frame(frame); - return DeviceCallInvalidArgumentStatus; - } - - if (requestLen > 0) - std::memcpy(requestPayload, request, requestLen); - - status = __cudaq_device_call_dispatch_realtime_frame(frame, responseLen); - if (status == DeviceCallSuccessStatus && *responseLen > responseCapacity) - status = DeviceCallResponseTooLargeStatus; - if (status == DeviceCallSuccessStatus && *responseLen > 0) - std::memcpy(response, responsePayload, *responseLen); - - __cudaq_device_call_safely_release_realtime_frame(frame); - return status; -} - __global__ void initDeviceCallTable(cudaq_function_entry_t *entries) { if (threadIdx.x != 0 || blockIdx.x != 0) return; @@ -267,304 +192,13 @@ void fillHostGraphAddEntry(cudaq_function_entry_t &entry, entry.schema.results[0].num_elements = 1; } -enum class TestGpuTable { AddThem, AddThemOffset }; - -TestGpuTable selectedGpuTable = TestGpuTable::AddThem; - -class TestRealtimeService : public DeviceCallService { -public: - int create(const void *, std::size_t) override { return 0; } - - int destroy() noexcept override { - teardownHostDispatch(); - return 0; - } - - std::uint32_t getFunctionCount() const override { return 1; } - - int populateTable(cudaq_function_entry_t *entries, std::uint32_t capacity, - cudaStream_t stream) override { - if (!entries || capacity < 1) - return 1; - if (selectedGpuTable == TestGpuTable::AddThemOffset) - initDeviceCallTableWithOffset<<<1, 1, 0, stream>>>(entries); - else - initDeviceCallTable<<<1, 1, 0, stream>>>(entries); - return cudaGetLastError() == cudaSuccess ? 0 : 1; - } - - cudaq_dispatch_launch_fn_t getDeviceDispatchLaunch() const override { - return cudaq_launch_dispatch_kernel_regular; - } - - int getHostDispatchTable(DeviceCallHostDispatchTable &table) override { - if (setupHostDispatch() != 0) - return 1; - table.entries = hostEntries.data(); - table.count = static_cast(hostEntries.size()); - table.deviceId = 0; - table.mailbox = h_mailbox; - return 0; - } - - int stop() noexcept override { - teardownHostDispatch(); - return 0; - } - -private: - int setupHostDispatch() { - if (h_mailbox && graphExec) - return 0; - - if (cudaHostAlloc(&h_mailbox, sizeof(void *), cudaHostAllocMapped) != - cudaSuccess) - return 1; - std::memset(h_mailbox, 0, sizeof(void *)); - if (cudaHostGetDevicePointer(reinterpret_cast(&d_mailbox), h_mailbox, - 0) != cudaSuccess) { - teardownHostDispatch(); - return 1; - } - if (!createGraphAddThem(d_mailbox, &graph, &graphExec)) { - teardownHostDispatch(); - return 1; - } - - fillHostGraphAddEntry(hostEntries[0], graphExec); - return 0; - } - - void teardownHostDispatch() noexcept { - if (graphExec) - cudaGraphExecDestroy(graphExec); - if (graph) - cudaGraphDestroy(graph); - if (h_mailbox) - cudaFreeHost(h_mailbox); - - graphExec = nullptr; - graph = nullptr; - h_mailbox = nullptr; - d_mailbox = nullptr; - hostEntries = {}; - } - - void **h_mailbox = nullptr; - void **d_mailbox = nullptr; - cudaGraph_t graph = nullptr; - cudaGraphExec_t graphExec = nullptr; - std::array hostEntries{}; -}; - -DeviceCallService *getTestRealtimeService() { - static TestRealtimeService service; - return &service; -} - -} // namespace - -extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo -cudaqGetDeviceCallServicePluginInfo() { - return {"test-device-call", &getTestRealtimeService}; -} - -namespace { - -void initializeGpuRuntime(TestGpuTable table = TestGpuTable::AddThem) { - selectedGpuTable = table; - char program[] = "test_device_call_dispatch"; - char *argv[] = {program}; - cudaq_internal::device_call::initializeDeviceCallRuntime(1, argv); -} - -void initializeHostRuntime() { - char program[] = "test_device_call_dispatch"; - char option[] = "--cudaq-device-call=host-dispatch"; - char *argv[] = {program, option}; - cudaq_internal::device_call::initializeDeviceCallRuntime(2, argv); -} - -void finalizeRuntime() { - cudaq_internal::device_call::finalizeDeviceCallRuntime(); -} - -class DeviceCallDispatchTest : public ::testing::Test { -protected: - void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); } - - void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); } -}; - -TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandler) { - std::array request{}; - auto *const args = request.data(); - args[0] = 19; - args[1] = 23; - - std::int32_t response = 0; - std::uint64_t responseLen = 0; - ASSERT_EQ(0, - dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), - request.size() * sizeof(request[0]), - &response, sizeof(response), &responseLen)); - EXPECT_EQ(sizeof(response), responseLen); - EXPECT_EQ(42, response); -} - -TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandlerThroughFrameLease) { - void *frame = nullptr; - void *requestPayload = nullptr; - void *responsePayload = nullptr; - ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( - 0, AddThemFunctionId, 2 * sizeof(std::int32_t), - sizeof(std::int32_t), &frame, &requestPayload, - &responsePayload)); - ASSERT_NE(nullptr, frame); - ASSERT_NE(nullptr, requestPayload); - ASSERT_NE(nullptr, responsePayload); - - auto *const args = static_cast(requestPayload); - args[0] = 19; - args[1] = 23; - - std::uint64_t responseLen = 0; - ASSERT_EQ(0, - __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); - EXPECT_EQ(sizeof(std::int32_t), responseLen); - EXPECT_EQ(42, *static_cast(responsePayload)); - - __cudaq_device_call_safely_release_realtime_frame(frame); -} - -TEST_F(DeviceCallDispatchTest, DispatchesVoidFireAndForgetThroughFrameLease) { - void *frame = nullptr; - void *requestPayload = nullptr; - void *responsePayload = nullptr; - ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( - 0, AddThemFunctionId, 2 * sizeof(std::int32_t), 0, &frame, - &requestPayload, &responsePayload)); - ASSERT_NE(nullptr, frame); - ASSERT_NE(nullptr, requestPayload); - EXPECT_EQ(nullptr, responsePayload); - - auto *args = static_cast(requestPayload); - args[0] = 19; - args[1] = 23; - - std::uint64_t responseLen = 123; - ASSERT_EQ(0, - __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); - EXPECT_EQ(0u, responseLen); - - __cudaq_device_call_safely_release_realtime_frame(frame); - - for (int i = 0; i < 2; ++i) { - frame = nullptr; - requestPayload = nullptr; - responsePayload = nullptr; - ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame( - 0, AddThemFunctionId, 2 * sizeof(std::int32_t), - sizeof(std::int32_t), &frame, &requestPayload, - &responsePayload)); - args = static_cast(requestPayload); - args[0] = 19; - args[1] = 23; - - responseLen = 0; - ASSERT_EQ(0, - __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); - EXPECT_EQ(sizeof(std::int32_t), responseLen); - EXPECT_EQ(42, *static_cast(responsePayload)); - - __cudaq_device_call_safely_release_realtime_frame(frame); - } -} - -TEST_F(DeviceCallDispatchTest, ReinitializesThroughDiscoveredPlugin) { - ASSERT_NO_THROW(finalizeRuntime()); - ASSERT_NO_THROW(initializeGpuRuntime(TestGpuTable::AddThemOffset)); - - std::array request{19, 23}; - std::int32_t response = 0; - std::uint64_t responseLen = 0; - ASSERT_EQ(0, - dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), - request.size() * sizeof(request[0]), - &response, sizeof(response), &responseLen)); - EXPECT_EQ(sizeof(response), responseLen); - EXPECT_EQ(142, response); -} - -class HostGraphDispatchFrameTest : public ::testing::Test { -protected: - void SetUp() override { ASSERT_NO_THROW(initializeHostRuntime()); } - - void TearDown() override { - if (frame) - __cudaq_device_call_safely_release_realtime_frame(frame); - ASSERT_NO_THROW(finalizeRuntime()); - } - - void *frame = nullptr; -}; - -TEST_F(HostGraphDispatchFrameTest, DispatchesGraphLaunchThroughFrameLease) { - void *requestPayload = nullptr; - void *responsePayload = nullptr; - ASSERT_EQ(DeviceCallSuccessStatus, - __cudaq_device_call_acquire_realtime_frame( - 0, GraphAddThemFunctionId, 2 * sizeof(std::int32_t), - sizeof(std::int32_t), &frame, &requestPayload, - &responsePayload)); - ASSERT_NE(nullptr, frame); - ASSERT_NE(nullptr, requestPayload); - ASSERT_NE(nullptr, responsePayload); - - auto *const args = static_cast(requestPayload); - args[0] = 19; - args[1] = 23; - - std::uint64_t responseLen = 0; - ASSERT_EQ(DeviceCallSuccessStatus, - __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen)); - EXPECT_EQ(sizeof(std::int32_t), responseLen); - EXPECT_EQ(42, *static_cast(responsePayload)); - - __cudaq_device_call_safely_release_realtime_frame(frame); - frame = nullptr; -} - -class DeviceCallServicePluginTest : public ::testing::Test { -protected: - void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); } - - void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); } -}; - -TEST_F(DeviceCallServicePluginTest, DispatchesThroughDiscoveredPlugin) { - std::array request{19, 23}; - std::int32_t response = 0; - std::uint64_t responseLen = 0; - - ASSERT_EQ(0, - dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), - request.size() * sizeof(request[0]), - &response, sizeof(response), &responseLen)); - EXPECT_EQ(sizeof(response), responseLen); - EXPECT_EQ(42, response); -} - -TEST_F(DeviceCallServicePluginTest, FinalizeClearsPluginSession) { - ASSERT_NO_THROW(finalizeRuntime()); - - std::array request{19, 23}; - std::int32_t response = 0; - std::uint64_t responseLen = 0; - EXPECT_EQ(DeviceCallNotInitializedStatus, - dispatchUsingFrameLease(0, AddThemFunctionId, request.data(), - request.size() * sizeof(request[0]), - &response, sizeof(response), &responseLen)); +int populateAddThemTable(cudaq_function_entry_t *entries, bool useOffset, + cudaStream_t stream) { + if (useOffset) + initDeviceCallTableWithOffset<<<1, 1, 0, stream>>>(entries); + else + initDeviceCallTable<<<1, 1, 0, stream>>>(entries); + return cudaGetLastError() == cudaSuccess ? 0 : 1; } -} // namespace +} // namespace cudaq_internal::device_call::test