diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1af795e12f4..20a99e2cb5b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -263,9 +263,9 @@ option(CUDAQ_ENABLE_SANITIZERS "Enable Address Sanitizer (ASan) and Undefined Be
 
 # Sub-projects that can be enabled independently, in the style of
 # LLVM_ENABLE_PROJECTS. Accepts a semicolon-separated list.
-set(CUDAQ_ALL_PROJECTS "cudaq;runtime;python")
+set(CUDAQ_ALL_PROJECTS "cudaq;runtime;python;realtime")
 set(CUDAQ_ENABLE_PROJECTS "" CACHE STRING
-    "Semicolon-separated list of CUDA-Q sub-projects to build. Valid values: ${CUDAQ_ALL_PROJECTS}. Defaults to 'cudaq;runtime' (plus 'python' if CUDAQ_ENABLE_PYTHON is set).")
+    "Semicolon-separated list of CUDA-Q sub-projects to build. Valid values: ${CUDAQ_ALL_PROJECTS}. Defaults to 'cudaq;runtime' (plus 'python' if CUDAQ_ENABLE_PYTHON is set). Include 'realtime' to build source-tree realtime; set CUDAQ_REALTIME_DIR to use an installed realtime package.")
 
 # Back-compat: if CUDAQ_ENABLE_PROJECTS wasn't given, derive it from the
 # legacy per-project flags so existing build scripts keep working, and warn
@@ -824,6 +824,13 @@ if (CUDAQ_REALTIME_DIR)
       "CUDAQ_REALTIME_DIR requires CUDA support, but CUDA was not found.")
   endif()
 
+  if ("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS)
+    message(WARNING
+      "Both CUDAQ_REALTIME_DIR and 'realtime' in CUDAQ_ENABLE_PROJECTS were provided. "
+      "Using CUDAQ_REALTIME_DIR=${CUDAQ_REALTIME_DIR} and skipping the source-tree realtime build.")
+    list(REMOVE_ITEM CUDAQ_ENABLE_PROJECTS "realtime")
+  endif()
+
   find_package(cudaq-realtime CONFIG REQUIRED
     PATHS "${CUDAQ_REALTIME_DIR}"
     NO_DEFAULT_PATH)
@@ -841,9 +848,17 @@ if (CUDAQ_REALTIME_DIR)
 
   set(CUDAQ_ENABLE_REALTIME TRUE)
   message(STATUS "CUDA-Q realtime integration enabled from: ${CUDAQ_REALTIME_DIR}")
+elseif ("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS)
+  if (NOT CUDA_FOUND)
+    message(FATAL_ERROR
+      "The \"realtime\" project requires CUDA support, but CUDA was not found.")
+  endif()
+
+  set(CUDAQ_ENABLE_REALTIME TRUE)
+  message(STATUS "CUDA-Q realtime integration enabled from the source tree.")
 else()
   message(STATUS
-    "CUDA-Q realtime integration disabled. Set CUDAQ_REALTIME_DIR to an installed cudaq-realtime prefix to enable it.")
+    "CUDA-Q realtime integration disabled. Include \"realtime\" in CUDAQ_ENABLE_PROJECTS to enable it.")
 endif()
 
 # Code coverage setup
@@ -868,6 +883,9 @@ endif()
 if("cudaq" IN_LIST CUDAQ_ENABLE_PROJECTS)
   add_subdirectory(cudaq)
 endif()
+if("realtime" IN_LIST CUDAQ_ENABLE_PROJECTS)
+  add_subdirectory(realtime)
+endif()
 if("runtime" IN_LIST CUDAQ_ENABLE_PROJECTS)
   add_subdirectory(runtime)
 endif()
diff --git a/cmake/modules/CUDAQConfig.cmake b/cmake/modules/CUDAQConfig.cmake
index 5891c164403..90240e8b164 100644
--- a/cmake/modules/CUDAQConfig.cmake
+++ b/cmake/modules/CUDAQConfig.cmake
@@ -42,7 +42,9 @@ if (CUDAQ_REALTIME_DIR)
 else()
   # Do not use find_dependency here: it inherits find_package(CUDAQ REQUIRED)
   # and would make realtime mandatory for CUDA-Q installs that do not use it.
-  find_package(cudaq-realtime CONFIG QUIET)
+  find_package(cudaq-realtime CONFIG QUIET
+    PATHS "${CUDAQ_CMAKE_DIR}/../cudaq-realtime"
+    NO_DEFAULT_PATH)
 endif()
 
 get_filename_component(PARENT_DIRECTORY ${CUDAQ_CMAKE_DIR} DIRECTORY)
diff --git a/cudaq/test/CMakeLists.txt b/cudaq/test/CMakeLists.txt
index 4708ed752e1..97be10a5402 100644
--- a/cudaq/test/CMakeLists.txt
+++ b/cudaq/test/CMakeLists.txt
@@ -82,14 +82,14 @@ if (CUDA_FOUND AND CUDAQ_ENABLE_REALTIME AND NOT CUDAQ_DISABLE_CPP_FRONTEND AND
         cudaq-device-call-runtime
         cudaq-device-call-runtime-headers
         cudaq::cudaq-realtime-dispatch
-        CUDA::cudart)
+        CUDA::cudart_static)
     target_compile_options(${target} PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
     set_target_properties(${target}
       PROPERTIES
         CUDA_SEPARABLE_COMPILATION ON
         CUDA_RESOLVE_DEVICE_SYMBOLS ON
-        CUDA_RUNTIME_LIBRARY Shared
+        CUDA_RUNTIME_LIBRARY Static
         CUDA_STANDARD 20
         CXX_STANDARD 20
         LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
diff --git a/cudaq/test/lit.site.cfg.py.in b/cudaq/test/lit.site.cfg.py.in
index a68f44d40f5..a4601412c8b 100644
--- a/cudaq/test/lit.site.cfg.py.in
+++ b/cudaq/test/lit.site.cfg.py.in
@@ -44,7 +44,9 @@ config.cuda_found = "@CUDA_FOUND@"
 if cmake_boolvar_to_bool(config.cuda_found):
   config.available_features.add('nvcc')
 
+config.cuda_gpu_available = os.system("nvidia-smi >/dev/null 2>&1") == 0
 if (cmake_boolvar_to_bool(config.cuda_found) and
+    config.cuda_gpu_available and
     config.cudaq_device_call_realtime_libs and
     all(os.path.exists(lib)
         for lib in config.cudaq_device_call_realtime_libs)):
diff --git a/realtime/CMakeLists.txt b/realtime/CMakeLists.txt
index ef771c93271..7b6e3242005 100644
--- a/realtime/CMakeLists.txt
+++ b/realtime/CMakeLists.txt
@@ -10,21 +10,25 @@ cmake_minimum_required(VERSION 3.22 FATAL_ERROR)
 
 include(FetchContent)
 
-# Set a default build type if none was specified. Must set this before
-# project().
-set(CMAKE_BUILD_TYPE "Release" CACHE STRING
-    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel")
+string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}" CUDAQ_REALTIME_STANDALONE_BUILD)
 
-# Set a default install prefix if none was specified.
-set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING
-    "Install path prefix, prepended onto install directories")
+if(CUDAQ_REALTIME_STANDALONE_BUILD)
+  # Set a default build type if none was specified. Must set this before
+  # project().
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+      "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel")
+
+  # Set a default install prefix if none was specified.
+  set(CMAKE_INSTALL_PREFIX "$ENV{HOME}/.cudaq_realtime" CACHE STRING
+      "Install path prefix, prepended onto install directories")
+endif()
 
 # Project setup
 # ==============================================================================
 
-# Check if core is built as a standalone project.
-project(cudaq-realtime)
-set(CUDAQ_REALTIME_STANDALONE_BUILD TRUE)
+if(CUDAQ_REALTIME_STANDALONE_BUILD)
+  project(cudaq-realtime)
+endif()
 
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
@@ -45,71 +49,107 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 # Options
 # ==============================================================================
 
+set(_cudaq_realtime_build_tests_default ON)
+if(NOT CUDAQ_REALTIME_STANDALONE_BUILD)
+  set(_cudaq_realtime_build_tests_default OFF)
+endif()
+
 option(CUDAQ_REALTIME_BUILD_TESTS
-       "Generate build targets for the CUDAQ real-time unit tests" ON)
+       "Generate build targets for the CUDAQ real-time unit tests"
+       ${_cudaq_realtime_build_tests_default})
 option(CUDAQ_REALTIME_BUILD_EXAMPLES
        "Generate build targets for the CUDAQ real-time example programs" ON)
 option(CUDAQ_REALTIME_ENABLE_HOLOLINK_TOOLS
        "Build Hololink bridge/emulator/playback tools (requires hololink)."
        OFF)
 
+set(_host_compiler_opts_list "")
+if(CUDAQ_REALTIME_STANDALONE_BUILD)
+  list(APPEND _host_compiler_opts_list "-fPIC")
+endif()
+
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64" AND
+   CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
+    # nvcc cannot parse the GCC BF16 intrinsic headers exposed through
+    # <immintrin.h> on the CI toolchains. This only applies from GCC 13:
+    # GCC 13 switched the AVX512BF16 intrinsics to the `__bf16` builtin type
+    # (GCC 12 still used the parseable `__bfloat16` typedef) and introduced
+    # <avxneconvertintrin.h>, which does not exist on GCC 12.
+    list(APPEND _host_compiler_opts_list
+      "-D_AVX512BF16INTRIN_H_INCLUDED"
+      "-D_AVX512BF16VLINTRIN_H_INCLUDED"
+      "-D_AVXNECONVERTINTRIN_H_INCLUDED")
+  endif()
+
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
+    # -mno-amx-tile alone is insufficient: the AMX intrinsic headers use
+    # #pragma GCC target("amx-*") which re-enables the builtins that nvcc
+    # cannot parse. Pre-define the include guards to skip those headers.
+    list(APPEND _host_compiler_opts_list
+      "-mno-amx-tile"
+      "-D_AMXTILEINTRIN_H_INCLUDED"
+      "-D_AMXBF16INTRIN_H_INCLUDED"
+      "-D_AMXINT8INTRIN_H_INCLUDED"
+      "-D_AMXFP16INTRIN_H_INCLUDED"
+      "-D_AMXCOMPLEXINTRIN_H_INCLUDED")
+  endif()
+endif()
+
+list(JOIN _host_compiler_opts_list "," _host_compiler_opts)
+
 # Check for CUDA Support (ref: cuda-quantum/CMakeLists.txt)
 # ==============================================================================
-include(CheckLanguage)
-check_language(CUDA)
-set(CUDA_FOUND FALSE)
-# Generate -gencode arch=compute_XX,code=sm_XX for list of supported
-# arch values.
-# List should be sorted in increasing order.
-function(CUDA_get_gencode_args out_args_string arch_values)
-  # allow the user to pass the list like a normal variable
-  set(arch_list ${arch_values} ${ARGN})
-  set(out "")
-  foreach(arch IN LISTS arch_list)
-    set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}")
-  endforeach(arch)
-
-  # Repeat the last one as to ensure the generation of PTX for most
-  # recent virtual architecture for forward compatibility
-  list(GET arch_list -1 last_arch)
-  set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}")
-  set(${out_args_string} ${out} PARENT_SCOPE)
-endfunction()
-
-if(CMAKE_CUDA_COMPILER)
-  if (NOT CUDA_TARGET_ARCHS)
-    if (CUDAToolkit_VERSION VERSION_LESS 13.0)
-      # Ampere, Hopper
-      set(CUDA_TARGET_ARCHS  "80;90")
-    else()
-      # Ampere, Hopper, Blackwell
-      set(CUDA_TARGET_ARCHS  "80;90;100")
+if(CUDAQ_REALTIME_STANDALONE_BUILD)
+  include(CheckLanguage)
+  check_language(CUDA)
+  set(CUDA_FOUND FALSE)
+  # Generate -gencode arch=compute_XX,code=sm_XX for list of supported
+  # arch values.
+  # List should be sorted in increasing order.
+  function(CUDA_get_gencode_args out_args_string arch_values)
+    # allow the user to pass the list like a normal variable
+    set(arch_list ${arch_values} ${ARGN})
+    set(out "")
+    foreach(arch IN LISTS arch_list)
+      set(out "${out} -gencode arch=compute_${arch},code=sm_${arch}")
+    endforeach(arch)
+
+    # Repeat the last one as to ensure the generation of PTX for most
+    # recent virtual architecture for forward compatibility
+    list(GET arch_list -1 last_arch)
+    set(out "${out} -gencode arch=compute_${last_arch},code=compute_${last_arch}")
+    set(${out_args_string} ${out} PARENT_SCOPE)
+  endfunction()
+
+  if(CMAKE_CUDA_COMPILER)
+    find_package(CUDAToolkit REQUIRED)
+    if (NOT CUDA_TARGET_ARCHS)
+      if (CUDAToolkit_VERSION VERSION_LESS 13.0)
+        # Ampere, Hopper
+        set(CUDA_TARGET_ARCHS  "80;90")
+      else()
+        # Ampere, Hopper, Blackwell
+        set(CUDA_TARGET_ARCHS  "80;90;100")
+      endif()
     endif()
+    CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS})
+    # Keep realtime CUDA sources on C++17. nvcc's C++20 frontend can hit an
+    # internal compiler error in libstdc++ headers when compiling the standalone
+    # realtime tests, including test_host_dispatcher.cu, on the CI toolchain.
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options ${_host_compiler_opts}")
+
+    enable_language(CUDA)
+    set(CUDA_FOUND TRUE)
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+    message(STATUS "Cuda language found.")
   endif()
-  CUDA_get_gencode_args(CUDA_gencode_flags ${CUDA_TARGET_ARCHS})
-  set(_host_compiler_opts "-fPIC")
-  if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
-    # -mno-amx-tile alone is insufficient with GCC 13+: the AMX intrinsic
-    # headers use #pragma GCC target("amx-tile") which re-enables the
-    # builtins that nvcc cannot parse.  Pre-define the include guards to
-    # prevent these headers from being processed at all.
-    string(APPEND _host_compiler_opts ",-mno-amx-tile")
-    foreach(_amx_guard _AMXTILEINTRIN_H_INCLUDED
-                       _AMXBF16INTRIN_H_INCLUDED
-                       _AMXINT8INTRIN_H_INCLUDED
-                       _AMXFP16INTRIN_H_INCLUDED
-                       _AMXCOMPLEXINTRIN_H_INCLUDED)
-      string(APPEND _host_compiler_opts ",-D${_amx_guard}")
-    endforeach()
-  endif()
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -shared -std=c++17 ${CUDA_gencode_flags} --compiler-options ${_host_compiler_opts}")
-
-  enable_language(CUDA)
-  set(CUDA_FOUND TRUE)
-  set(CMAKE_CUDA_STANDARD 17)
-  set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+elseif(CUDA_FOUND)
   find_package(CUDAToolkit REQUIRED)
-  message(STATUS "Cuda language found.")
+  if(_host_compiler_opts)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin=${CMAKE_CXX_COMPILER} --compiler-options ${_host_compiler_opts}")
+  endif()
 endif()
 
 # External Dependencies 
@@ -117,9 +157,12 @@ endif()
 
 find_package(Threads REQUIRED)
 
-# Enable static linking of the C++ standard library to avoid dependency issues when distributing the library.
-SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
-SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
+if(CUDAQ_REALTIME_STANDALONE_BUILD)
+  # Standalone realtime owns its distribution linker policy. Integrated CUDA-Q
+  # builds inherit the top-level CUDA-Q runtime/dependency link settings.
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
+  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
+endif()
 
 add_subdirectory(lib)
 
diff --git a/realtime/lib/cpu_transport/CMakeLists.txt b/realtime/lib/cpu_transport/CMakeLists.txt
index d321a894ba7..2b9a25a6834 100644
--- a/realtime/lib/cpu_transport/CMakeLists.txt
+++ b/realtime/lib/cpu_transport/CMakeLists.txt
@@ -33,6 +33,9 @@ add_library(cudaq-realtime-cpu-transport STATIC
   roce_transceiver.cpp  # class CpuRoceTransceiver: ibv setup, RX/TX threads, ring lifecycle
   roce_wrapper.cpp      # extern-C shim used by bridge tools and CpuRoceChannel
 )
+# Source-tree CUDA-Q integration uses the same cudaq:: target names
+# that installed exports provide.
+add_library(cudaq::cudaq-realtime-cpu-transport ALIAS cudaq-realtime-cpu-transport)
 
 target_include_directories(cudaq-realtime-cpu-transport
   PUBLIC
diff --git a/realtime/lib/daemon/CMakeLists.txt b/realtime/lib/daemon/CMakeLists.txt
index 5df1470dd91..b54d848ee7f 100644
--- a/realtime/lib/daemon/CMakeLists.txt
+++ b/realtime/lib/daemon/CMakeLists.txt
@@ -20,6 +20,9 @@ if(CUDA_FOUND)
   )
 
   add_library(cudaq-realtime SHARED ${CUDAQ_REALTIME_SOURCES})
+  # Source-tree CUDA-Q integration uses the same cudaq:: target names
+  # that installed exports provide.
+  add_library(cudaq::cudaq-realtime ALIAS cudaq-realtime)
   
   target_include_directories(cudaq-realtime
     PUBLIC
@@ -54,6 +57,9 @@ if(CUDA_FOUND)
   add_library(cudaq-realtime-dispatch STATIC
     dispatcher/dispatch_kernel.cu
   )
+  # Source-tree CUDA-Q integration uses the same cudaq:: target names
+  # that installed exports provide.
+  add_library(cudaq::cudaq-realtime-dispatch ALIAS cudaq-realtime-dispatch)
   set_target_properties(cudaq-realtime-dispatch
     PROPERTIES
     LINK_FLAGS_RELEASE "-Wl,--exclude-libs=ALL")
@@ -97,6 +103,9 @@ if(CUDA_FOUND)
     dispatcher/host_dispatcher.cu
     dispatcher/host_dispatcher_capi.cu
   )
+  # Source-tree CUDA-Q integration uses the same cudaq:: target names
+  # that installed exports provide.
+  add_library(cudaq::cudaq-realtime-host-dispatch ALIAS cudaq-realtime-host-dispatch)
 
   target_include_directories(cudaq-realtime-host-dispatch
     PUBLIC
diff --git a/realtime/unittests/utils/CMakeLists.txt b/realtime/unittests/utils/CMakeLists.txt
index 1cd14db74f6..08f73b09166 100644
--- a/realtime/unittests/utils/CMakeLists.txt
+++ b/realtime/unittests/utils/CMakeLists.txt
@@ -165,9 +165,10 @@ if (GPU_ROCE_TRANSCEIVER_LIB AND
   message(STATUS "Building hololink_bridge (generic increment)")
   message(STATUS "  GPU RoCE Transceiver: ${GPU_ROCE_TRANSCEIVER_LIB}")
 
-  # Hololink wrapper static library (compiled by g++, isolates fmt)
+  # Hololink wrapper static library (compiled by g++, isolates fmt).
+  # Use the realtime source root so this path works when nested in CUDA-Q.
   add_library(hololink_wrapper_generic STATIC
-    ${CMAKE_SOURCE_DIR}/lib/daemon/bridge/hololink/hololink_wrapper.cpp)
+    ${CUDAQ_REALTIME_SOURCE_DIR}/lib/daemon/bridge/hololink/hololink_wrapper.cpp)
 
   target_include_directories(hololink_wrapper_generic
     PRIVATE
diff --git a/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h b/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h
index 1683dd64d22..22782a3c5ed 100644
--- a/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h
+++ b/runtime/internal/device_call/include/cudaq_internal/device_call/DeviceCallService.h
@@ -167,5 +167,12 @@ using DeviceCallServicePluginInfoFn = DeviceCallServicePluginInfo (*)();
 // Default service discovery entry point. Service artifacts may also expose
 // suffixed variants with the same signature for tests or multi-service
 // deployments, e.g. cudaqGetDeviceCallServicePluginInfo_<name>.
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
+#endif
 extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo
 cudaqGetDeviceCallServicePluginInfo();
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index c9ca33e7d32..8b80a1b54ee 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -150,6 +150,7 @@ create_tests_with_backend(stim backends/StimTester.cpp)
 
 if (CUDA_FOUND AND CUDAQ_ENABLE_REALTIME)
   add_executable(test_device_call_dispatch
+    device_call/DeviceCallDispatchTester.cpp
     device_call/DeviceCallDispatchTester.cu)
   set_target_properties(test_device_call_dispatch PROPERTIES
     CUDA_SEPARABLE_COMPILATION ON
diff --git a/unittests/device_call/DeviceCallDispatchTester.cpp b/unittests/device_call/DeviceCallDispatchTester.cpp
new file mode 100644
index 00000000000..695a32358d9
--- /dev/null
+++ b/unittests/device_call/DeviceCallDispatchTester.cpp
@@ -0,0 +1,401 @@
+/*******************************************************************************
+ * Copyright (c) 2026 NVIDIA Corporation & Affiliates.                         *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under    *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include "cudaq_internal/device_call/DeviceCallError.h"
+#include "cudaq_internal/device_call/DeviceCallService.h"
+#include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
+#include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+
+namespace cudaq_internal::device_call {
+void initializeDeviceCallRuntime(int argc, char **argv);
+void finalizeDeviceCallRuntime();
+
+namespace test {
+bool createGraphAddThem(void **dMailbox, cudaGraph_t *graphOut,
+                        cudaGraphExec_t *execOut);
+void fillHostGraphAddEntry(cudaq_function_entry_t &entry,
+                           cudaGraphExec_t graphExec);
+int populateAddThemTable(cudaq_function_entry_t *entries, bool useOffset,
+                         cudaStream_t stream);
+} // namespace test
+} // namespace cudaq_internal::device_call
+
+extern "C" std::int32_t __cudaq_device_call_acquire_realtime_frame(
+    std::uint32_t deviceId, std::uint32_t functionId,
+    std::uint64_t requestBytes, std::uint64_t responseCapacity,
+    void **frameHandle, void **requestPayload, void **responsePayload);
+extern "C" std::int32_t
+__cudaq_device_call_dispatch_realtime_frame(void *frameHandle,
+                                            std::uint64_t *responseBytes);
+extern "C" void
+__cudaq_device_call_safely_release_realtime_frame(void *frameHandle);
+
+namespace {
+
+using namespace cudaq_internal::device_call;
+
+constexpr std::uint32_t AddThemFunctionId =
+    cudaq::realtime::fnv1a_hash("addThem");
+constexpr std::uint32_t GraphAddThemFunctionId =
+    cudaq::realtime::fnv1a_hash("graphAddThem");
+constexpr std::int32_t DeviceCallSuccessStatus =
+    toAbiStatus(DeviceCallStatus::Success);
+constexpr std::int32_t DeviceCallInvalidArgumentStatus =
+    toAbiStatus(DeviceCallStatus::InvalidArgument);
+constexpr std::int32_t DeviceCallNotInitializedStatus =
+    toAbiStatus(DeviceCallStatus::NotInitialized);
+constexpr std::int32_t DeviceCallResponseTooLargeStatus =
+    toAbiStatus(DeviceCallStatus::ResponseTooLarge);
+
+// A payload buffer must be non-null whenever its declared length is nonzero;
+// a zero-length buffer is allowed to be null.
+constexpr bool isValidBuffer(const void *buffer, std::uint64_t length) {
+  return length == 0 || buffer != nullptr;
+}
+
+std::int32_t dispatchUsingFrameLease(std::uint32_t deviceId,
+                                     std::uint32_t functionId,
+                                     const void *request,
+                                     std::uint64_t requestLen, void *response,
+                                     std::uint64_t responseCapacity,
+                                     std::uint64_t *responseLen) {
+  if (!isValidBuffer(request, requestLen) || !responseLen ||
+      !isValidBuffer(response, responseCapacity))
+    return DeviceCallInvalidArgumentStatus;
+
+  void *frame = nullptr;
+  void *requestPayload = nullptr;
+  void *responsePayload = nullptr;
+  std::int32_t status = __cudaq_device_call_acquire_realtime_frame(
+      deviceId, functionId, requestLen, responseCapacity, &frame,
+      &requestPayload, &responsePayload);
+  if (status != DeviceCallSuccessStatus)
+    return status;
+  if (!isValidBuffer(requestPayload, requestLen) ||
+      !isValidBuffer(responsePayload, responseCapacity)) {
+    __cudaq_device_call_safely_release_realtime_frame(frame);
+    return DeviceCallInvalidArgumentStatus;
+  }
+
+  if (requestLen > 0)
+    std::memcpy(requestPayload, request, requestLen);
+
+  status = __cudaq_device_call_dispatch_realtime_frame(frame, responseLen);
+  if (status == DeviceCallSuccessStatus && *responseLen > responseCapacity)
+    status = DeviceCallResponseTooLargeStatus;
+  if (status == DeviceCallSuccessStatus && *responseLen > 0)
+    std::memcpy(response, responsePayload, *responseLen);
+
+  __cudaq_device_call_safely_release_realtime_frame(frame);
+  return status;
+}
+
+enum class TestGpuTable { AddThem, AddThemOffset };
+
+TestGpuTable selectedGpuTable = TestGpuTable::AddThem;
+
+class TestRealtimeService : public DeviceCallService {
+public:
+  int create(const void *, std::size_t) override { return 0; }
+
+  int destroy() noexcept override {
+    teardownHostDispatch();
+    return 0;
+  }
+
+  std::uint32_t getFunctionCount() const override { return 1; }
+
+  int populateTable(cudaq_function_entry_t *entries, std::uint32_t capacity,
+                    cudaStream_t stream) override {
+    if (!entries || capacity < 1)
+      return 1;
+    return test::populateAddThemTable(
+        entries, selectedGpuTable == TestGpuTable::AddThemOffset, stream);
+  }
+
+  cudaq_dispatch_launch_fn_t getDeviceDispatchLaunch() const override {
+    return cudaq_launch_dispatch_kernel_regular;
+  }
+
+  int getHostDispatchTable(DeviceCallHostDispatchTable &table) override {
+    if (setupHostDispatch() != 0)
+      return 1;
+    table.entries = hostEntries.data();
+    table.count = static_cast<std::uint32_t>(hostEntries.size());
+    table.deviceId = 0;
+    table.mailbox = h_mailbox;
+    return 0;
+  }
+
+  int stop() noexcept override {
+    teardownHostDispatch();
+    return 0;
+  }
+
+private:
+  int setupHostDispatch() {
+    if (h_mailbox && graphExec)
+      return 0;
+
+    if (cudaHostAlloc(&h_mailbox, sizeof(void *), cudaHostAllocMapped) !=
+        cudaSuccess)
+      return 1;
+    std::memset(h_mailbox, 0, sizeof(void *));
+    if (cudaHostGetDevicePointer(reinterpret_cast<void **>(&d_mailbox),
+                                 h_mailbox, 0) != cudaSuccess) {
+      teardownHostDispatch();
+      return 1;
+    }
+    if (!test::createGraphAddThem(d_mailbox, &graph, &graphExec)) {
+      teardownHostDispatch();
+      return 1;
+    }
+
+    test::fillHostGraphAddEntry(hostEntries[0], graphExec);
+    return 0;
+  }
+
+  void teardownHostDispatch() noexcept {
+    if (graphExec)
+      cudaGraphExecDestroy(graphExec);
+    if (graph)
+      cudaGraphDestroy(graph);
+    if (h_mailbox)
+      cudaFreeHost(h_mailbox);
+
+    graphExec = nullptr;
+    graph = nullptr;
+    h_mailbox = nullptr;
+    d_mailbox = nullptr;
+    hostEntries = {};
+  }
+
+  void **h_mailbox = nullptr;
+  void **d_mailbox = nullptr;
+  cudaGraph_t graph = nullptr;
+  cudaGraphExec_t graphExec = nullptr;
+  std::array<cudaq_function_entry_t, 1> hostEntries{};
+};
+
+DeviceCallService *getTestRealtimeService() {
+  static TestRealtimeService service;
+  return &service;
+}
+
+} // namespace
+
+extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo
+cudaqGetDeviceCallServicePluginInfo() {
+  return {"test-device-call", &getTestRealtimeService};
+}
+
+namespace {
+
+void initializeGpuRuntime(TestGpuTable table = TestGpuTable::AddThem) {
+  selectedGpuTable = table;
+  char program[] = "test_device_call_dispatch";
+  char *argv[] = {program};
+  cudaq_internal::device_call::initializeDeviceCallRuntime(1, argv);
+}
+
+void initializeHostRuntime() {
+  char program[] = "test_device_call_dispatch";
+  char option[] = "--cudaq-device-call=host-dispatch";
+  char *argv[] = {program, option};
+  cudaq_internal::device_call::initializeDeviceCallRuntime(2, argv);
+}
+
+void finalizeRuntime() {
+  cudaq_internal::device_call::finalizeDeviceCallRuntime();
+}
+
+class DeviceCallDispatchTest : public ::testing::Test {
+protected:
+  void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); }
+
+  void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); }
+};
+
+TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandler) {
+  std::array<std::int32_t, 2> request{};
+  auto *const args = request.data();
+  args[0] = 19;
+  args[1] = 23;
+
+  std::int32_t response = 0;
+  std::uint64_t responseLen = 0;
+  ASSERT_EQ(0,
+            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
+                                    request.size() * sizeof(request[0]),
+                                    &response, sizeof(response), &responseLen));
+  EXPECT_EQ(sizeof(response), responseLen);
+  EXPECT_EQ(42, response);
+}
+
+TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandlerThroughFrameLease) {
+  void *frame = nullptr;
+  void *requestPayload = nullptr;
+  void *responsePayload = nullptr;
+  ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
+                   0, AddThemFunctionId, 2 * sizeof(std::int32_t),
+                   sizeof(std::int32_t), &frame, &requestPayload,
+                   &responsePayload));
+  ASSERT_NE(nullptr, frame);
+  ASSERT_NE(nullptr, requestPayload);
+  ASSERT_NE(nullptr, responsePayload);
+
+  auto *const args = static_cast<std::int32_t *>(requestPayload);
+  args[0] = 19;
+  args[1] = 23;
+
+  std::uint64_t responseLen = 0;
+  ASSERT_EQ(0,
+            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
+  EXPECT_EQ(sizeof(std::int32_t), responseLen);
+  EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
+
+  __cudaq_device_call_safely_release_realtime_frame(frame);
+}
+
+TEST_F(DeviceCallDispatchTest, DispatchesVoidFireAndForgetThroughFrameLease) {
+  void *frame = nullptr;
+  void *requestPayload = nullptr;
+  void *responsePayload = nullptr;
+  ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
+                   0, AddThemFunctionId, 2 * sizeof(std::int32_t), 0, &frame,
+                   &requestPayload, &responsePayload));
+  ASSERT_NE(nullptr, frame);
+  ASSERT_NE(nullptr, requestPayload);
+  EXPECT_EQ(nullptr, responsePayload);
+
+  auto *args = static_cast<std::int32_t *>(requestPayload);
+  args[0] = 19;
+  args[1] = 23;
+
+  std::uint64_t responseLen = 123;
+  ASSERT_EQ(0,
+            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
+  EXPECT_EQ(0u, responseLen);
+
+  __cudaq_device_call_safely_release_realtime_frame(frame);
+
+  for (int i = 0; i < 2; ++i) {
+    frame = nullptr;
+    requestPayload = nullptr;
+    responsePayload = nullptr;
+    ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
+                     0, AddThemFunctionId, 2 * sizeof(std::int32_t),
+                     sizeof(std::int32_t), &frame, &requestPayload,
+                     &responsePayload));
+    args = static_cast<std::int32_t *>(requestPayload);
+    args[0] = 19;
+    args[1] = 23;
+
+    responseLen = 0;
+    ASSERT_EQ(0,
+              __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
+    EXPECT_EQ(sizeof(std::int32_t), responseLen);
+    EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
+
+    __cudaq_device_call_safely_release_realtime_frame(frame);
+  }
+}
+
+TEST_F(DeviceCallDispatchTest, ReinitializesThroughDiscoveredPlugin) {
+  ASSERT_NO_THROW(finalizeRuntime());
+  ASSERT_NO_THROW(initializeGpuRuntime(TestGpuTable::AddThemOffset));
+
+  std::array<std::int32_t, 2> request{19, 23};
+  std::int32_t response = 0;
+  std::uint64_t responseLen = 0;
+  ASSERT_EQ(0,
+            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
+                                    request.size() * sizeof(request[0]),
+                                    &response, sizeof(response), &responseLen));
+  EXPECT_EQ(sizeof(response), responseLen);
+  EXPECT_EQ(142, response);
+}
+
+class HostGraphDispatchFrameTest : public ::testing::Test {
+protected:
+  void SetUp() override { ASSERT_NO_THROW(initializeHostRuntime()); }
+
+  void TearDown() override {
+    if (frame)
+      __cudaq_device_call_safely_release_realtime_frame(frame);
+    ASSERT_NO_THROW(finalizeRuntime());
+  }
+
+  void *frame = nullptr;
+};
+
+TEST_F(HostGraphDispatchFrameTest, DispatchesGraphLaunchThroughFrameLease) {
+  void *requestPayload = nullptr;
+  void *responsePayload = nullptr;
+  ASSERT_EQ(DeviceCallSuccessStatus,
+            __cudaq_device_call_acquire_realtime_frame(
+                0, GraphAddThemFunctionId, 2 * sizeof(std::int32_t),
+                sizeof(std::int32_t), &frame, &requestPayload,
+                &responsePayload));
+  ASSERT_NE(nullptr, frame);
+  ASSERT_NE(nullptr, requestPayload);
+  ASSERT_NE(nullptr, responsePayload);
+
+  auto *const args = static_cast<std::int32_t *>(requestPayload);
+  args[0] = 19;
+  args[1] = 23;
+
+  std::uint64_t responseLen = 0;
+  ASSERT_EQ(DeviceCallSuccessStatus,
+            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
+  EXPECT_EQ(sizeof(std::int32_t), responseLen);
+  EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
+
+  __cudaq_device_call_safely_release_realtime_frame(frame);
+  frame = nullptr;
+}
+
+class DeviceCallServicePluginTest : public ::testing::Test {
+protected:
+  void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); }
+
+  void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); }
+};
+
+TEST_F(DeviceCallServicePluginTest, DispatchesThroughDiscoveredPlugin) {
+  std::array<std::int32_t, 2> request{19, 23};
+  std::int32_t response = 0;
+  std::uint64_t responseLen = 0;
+
+  ASSERT_EQ(0,
+            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
+                                    request.size() * sizeof(request[0]),
+                                    &response, sizeof(response), &responseLen));
+  EXPECT_EQ(sizeof(response), responseLen);
+  EXPECT_EQ(42, response);
+}
+
+TEST_F(DeviceCallServicePluginTest, FinalizeClearsPluginSession) {
+  ASSERT_NO_THROW(finalizeRuntime());
+
+  std::array<std::int32_t, 2> request{19, 23};
+  std::int32_t response = 0;
+  std::uint64_t responseLen = 0;
+  EXPECT_EQ(DeviceCallNotInitializedStatus,
+            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
+                                    request.size() * sizeof(request[0]),
+                                    &response, sizeof(response), &responseLen));
+}
+
+} // namespace
diff --git a/unittests/device_call/DeviceCallDispatchTester.cu b/unittests/device_call/DeviceCallDispatchTester.cu
index 28846c0402b..e07cb8fc39b 100644
--- a/unittests/device_call/DeviceCallDispatchTester.cu
+++ b/unittests/device_call/DeviceCallDispatchTester.cu
@@ -6,56 +6,19 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  ******************************************************************************/
 
-#include "cudaq_internal/device_call/DeviceCallError.h"
-#include "cudaq_internal/device_call/DeviceCallService.h"
 #include "cudaq/realtime/daemon/dispatcher/cudaq_realtime.h"
 #include "cudaq/realtime/daemon/dispatcher/dispatch_kernel_launch.h"
 
 #include <cuda_runtime.h>
-#include <gtest/gtest.h>
 
-#include <array>
 #include <cstdint>
-#include <cstring>
 
-namespace cudaq_internal::device_call {
-void initializeDeviceCallRuntime(int argc, char **argv);
-void finalizeDeviceCallRuntime();
-} // namespace cudaq_internal::device_call
-extern "C" std::int32_t __cudaq_device_call_acquire_realtime_frame(
-    std::uint32_t deviceId, std::uint32_t functionId,
-    std::uint64_t requestBytes, std::uint64_t responseCapacity,
-    void **frameHandle, void **requestPayload, void **responsePayload);
-extern "C" std::int32_t
-__cudaq_device_call_dispatch_realtime_frame(void *frameHandle,
-                                            std::uint64_t *responseBytes);
-extern "C" void
-__cudaq_device_call_safely_release_realtime_frame(void *frameHandle);
-
-namespace {
-
-using namespace cudaq_internal::device_call;
+namespace cudaq_internal::device_call::test {
 
 constexpr std::uint32_t AddThemFunctionId =
     cudaq::realtime::fnv1a_hash("addThem");
 constexpr std::uint32_t GraphAddThemFunctionId =
     cudaq::realtime::fnv1a_hash("graphAddThem");
-constexpr std::int32_t DeviceCallSuccessStatus =
-    toAbiStatus(DeviceCallStatus::Success);
-constexpr std::int32_t DeviceCallInvalidArgumentStatus =
-    toAbiStatus(DeviceCallStatus::InvalidArgument);
-constexpr std::int32_t DeviceCallNotInitializedStatus =
-    toAbiStatus(DeviceCallStatus::NotInitialized);
-constexpr std::int32_t DeviceCallResponseTooLargeStatus =
-    toAbiStatus(DeviceCallStatus::ResponseTooLarge);
-
-#define ASSERT_CUDA_SUCCESS(expr)                                              \
-  do {                                                                         \
-    const cudaError_t err = (expr);                                            \
-    ASSERT_EQ(cudaSuccess, err)                                                \
-        << #expr << " failed at " << __FILE__ << ":" << __LINE__ << ": "       \
-        << cudaGetErrorString(err);                                            \
-  } while (false)
 
 __device__ int addThemHandler(const void *input, void *output,
                               std::uint32_t argLen, std::uint32_t maxResultLen,
@@ -127,13 +90,13 @@ __global__ void graphAddThemHandler(void **mailboxSlotPtr) {
     *ioContext->tx_flag = ioContext->tx_flag_value;
 }
 
-bool createGraphAddThem(void **d_mailbox, cudaGraph_t *graphOut,
+bool createGraphAddThem(void **dMailbox, cudaGraph_t *graphOut,
                         cudaGraphExec_t *execOut) {
   cudaGraph_t graph = nullptr;
   if (cudaGraphCreate(&graph, 0) != cudaSuccess)
     return false;
 
-  void *kernelArgs[] = {&d_mailbox};
+  void *kernelArgs[] = {&dMailbox};
   const cudaKernelNodeParams params = [&] {
     cudaKernelNodeParams result{};
     result.func = reinterpret_cast<void *>(graphAddThemHandler);
@@ -161,44 +124,6 @@ bool createGraphAddThem(void **d_mailbox, cudaGraph_t *graphOut,
   return true;
 }
 
-std::int32_t dispatchUsingFrameLease(std::uint32_t deviceId,
-                                     std::uint32_t functionId,
-                                     const void *request,
-                                     std::uint64_t requestLen, void *response,
-                                     std::uint64_t responseCapacity,
-                                     std::uint64_t *responseLen) {
-  if ((requestLen > 0 && !request) || !responseLen)
-    return DeviceCallInvalidArgumentStatus;
-  if (responseCapacity > 0 && !response)
-    return DeviceCallInvalidArgumentStatus;
-
-  void *frame = nullptr;
-  void *requestPayload = nullptr;
-  void *responsePayload = nullptr;
-  std::int32_t status = __cudaq_device_call_acquire_realtime_frame(
-      deviceId, functionId, requestLen, responseCapacity, &frame,
-      &requestPayload, &responsePayload);
-  if (status != DeviceCallSuccessStatus)
-    return status;
-  if ((requestLen > 0 && !requestPayload) ||
-      (responseCapacity > 0 && !responsePayload)) {
-    __cudaq_device_call_safely_release_realtime_frame(frame);
-    return DeviceCallInvalidArgumentStatus;
-  }
-
-  if (requestLen > 0)
-    std::memcpy(requestPayload, request, requestLen);
-
-  status = __cudaq_device_call_dispatch_realtime_frame(frame, responseLen);
-  if (status == DeviceCallSuccessStatus && *responseLen > responseCapacity)
-    status = DeviceCallResponseTooLargeStatus;
-  if (status == DeviceCallSuccessStatus && *responseLen > 0)
-    std::memcpy(response, responsePayload, *responseLen);
-
-  __cudaq_device_call_safely_release_realtime_frame(frame);
-  return status;
-}
-
 __global__ void initDeviceCallTable(cudaq_function_entry_t *entries) {
   if (threadIdx.x != 0 || blockIdx.x != 0)
     return;
@@ -267,304 +192,13 @@ void fillHostGraphAddEntry(cudaq_function_entry_t &entry,
   entry.schema.results[0].num_elements = 1;
 }
 
-enum class TestGpuTable { AddThem, AddThemOffset };
-
-TestGpuTable selectedGpuTable = TestGpuTable::AddThem;
-
-class TestRealtimeService : public DeviceCallService {
-public:
-  int create(const void *, std::size_t) override { return 0; }
-
-  int destroy() noexcept override {
-    teardownHostDispatch();
-    return 0;
-  }
-
-  std::uint32_t getFunctionCount() const override { return 1; }
-
-  int populateTable(cudaq_function_entry_t *entries, std::uint32_t capacity,
-                    cudaStream_t stream) override {
-    if (!entries || capacity < 1)
-      return 1;
-    if (selectedGpuTable == TestGpuTable::AddThemOffset)
-      initDeviceCallTableWithOffset<<<1, 1, 0, stream>>>(entries);
-    else
-      initDeviceCallTable<<<1, 1, 0, stream>>>(entries);
-    return cudaGetLastError() == cudaSuccess ? 0 : 1;
-  }
-
-  cudaq_dispatch_launch_fn_t getDeviceDispatchLaunch() const override {
-    return cudaq_launch_dispatch_kernel_regular;
-  }
-
-  int getHostDispatchTable(DeviceCallHostDispatchTable &table) override {
-    if (setupHostDispatch() != 0)
-      return 1;
-    table.entries = hostEntries.data();
-    table.count = static_cast<std::uint32_t>(hostEntries.size());
-    table.deviceId = 0;
-    table.mailbox = h_mailbox;
-    return 0;
-  }
-
-  int stop() noexcept override {
-    teardownHostDispatch();
-    return 0;
-  }
-
-private:
-  int setupHostDispatch() {
-    if (h_mailbox && graphExec)
-      return 0;
-
-    if (cudaHostAlloc(&h_mailbox, sizeof(void *), cudaHostAllocMapped) !=
-        cudaSuccess)
-      return 1;
-    std::memset(h_mailbox, 0, sizeof(void *));
-    if (cudaHostGetDevicePointer(reinterpret_cast<void **>(&d_mailbox), h_mailbox,
-                                 0) != cudaSuccess) {
-      teardownHostDispatch();
-      return 1;
-    }
-    if (!createGraphAddThem(d_mailbox, &graph, &graphExec)) {
-      teardownHostDispatch();
-      return 1;
-    }
-
-    fillHostGraphAddEntry(hostEntries[0], graphExec);
-    return 0;
-  }
-
-  void teardownHostDispatch() noexcept {
-    if (graphExec)
-      cudaGraphExecDestroy(graphExec);
-    if (graph)
-      cudaGraphDestroy(graph);
-    if (h_mailbox)
-      cudaFreeHost(h_mailbox);
-
-    graphExec = nullptr;
-    graph = nullptr;
-    h_mailbox = nullptr;
-    d_mailbox = nullptr;
-    hostEntries = {};
-  }
-
-  void **h_mailbox = nullptr;
-  void **d_mailbox = nullptr;
-  cudaGraph_t graph = nullptr;
-  cudaGraphExec_t graphExec = nullptr;
-  std::array<cudaq_function_entry_t, 1> hostEntries{};
-};
-
-DeviceCallService *getTestRealtimeService() {
-  static TestRealtimeService service;
-  return &service;
-}
-
-} // namespace
-
-extern "C" cudaq_internal::device_call::DeviceCallServicePluginInfo
-cudaqGetDeviceCallServicePluginInfo() {
-  return {"test-device-call", &getTestRealtimeService};
-}
-
-namespace {
-
-void initializeGpuRuntime(TestGpuTable table = TestGpuTable::AddThem) {
-  selectedGpuTable = table;
-  char program[] = "test_device_call_dispatch";
-  char *argv[] = {program};
-  cudaq_internal::device_call::initializeDeviceCallRuntime(1, argv);
-}
-
-void initializeHostRuntime() {
-  char program[] = "test_device_call_dispatch";
-  char option[] = "--cudaq-device-call=host-dispatch";
-  char *argv[] = {program, option};
-  cudaq_internal::device_call::initializeDeviceCallRuntime(2, argv);
-}
-
-void finalizeRuntime() {
-  cudaq_internal::device_call::finalizeDeviceCallRuntime();
-}
-
-class DeviceCallDispatchTest : public ::testing::Test {
-protected:
-  void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); }
-
-  void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); }
-};
-
-TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandler) {
-  std::array<std::int32_t, 2> request{};
-  auto *const args = request.data();
-  args[0] = 19;
-  args[1] = 23;
-
-  std::int32_t response = 0;
-  std::uint64_t responseLen = 0;
-  ASSERT_EQ(0,
-            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
-                                    request.size() * sizeof(request[0]),
-                                    &response, sizeof(response), &responseLen));
-  EXPECT_EQ(sizeof(response), responseLen);
-  EXPECT_EQ(42, response);
-}
-
-TEST_F(DeviceCallDispatchTest, DispatchesI32AddHandlerThroughFrameLease) {
-  void *frame = nullptr;
-  void *requestPayload = nullptr;
-  void *responsePayload = nullptr;
-  ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
-                   0, AddThemFunctionId, 2 * sizeof(std::int32_t),
-                   sizeof(std::int32_t), &frame, &requestPayload,
-                   &responsePayload));
-  ASSERT_NE(nullptr, frame);
-  ASSERT_NE(nullptr, requestPayload);
-  ASSERT_NE(nullptr, responsePayload);
-
-  auto *const args = static_cast<std::int32_t *>(requestPayload);
-  args[0] = 19;
-  args[1] = 23;
-
-  std::uint64_t responseLen = 0;
-  ASSERT_EQ(0,
-            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
-  EXPECT_EQ(sizeof(std::int32_t), responseLen);
-  EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
-
-  __cudaq_device_call_safely_release_realtime_frame(frame);
-}
-
-TEST_F(DeviceCallDispatchTest, DispatchesVoidFireAndForgetThroughFrameLease) {
-  void *frame = nullptr;
-  void *requestPayload = nullptr;
-  void *responsePayload = nullptr;
-  ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
-                   0, AddThemFunctionId, 2 * sizeof(std::int32_t), 0, &frame,
-                   &requestPayload, &responsePayload));
-  ASSERT_NE(nullptr, frame);
-  ASSERT_NE(nullptr, requestPayload);
-  EXPECT_EQ(nullptr, responsePayload);
-
-  auto *args = static_cast<std::int32_t *>(requestPayload);
-  args[0] = 19;
-  args[1] = 23;
-
-  std::uint64_t responseLen = 123;
-  ASSERT_EQ(0,
-            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
-  EXPECT_EQ(0u, responseLen);
-
-  __cudaq_device_call_safely_release_realtime_frame(frame);
-
-  for (int i = 0; i < 2; ++i) {
-    frame = nullptr;
-    requestPayload = nullptr;
-    responsePayload = nullptr;
-    ASSERT_EQ(0, __cudaq_device_call_acquire_realtime_frame(
-                     0, AddThemFunctionId, 2 * sizeof(std::int32_t),
-                     sizeof(std::int32_t), &frame, &requestPayload,
-                     &responsePayload));
-    args = static_cast<std::int32_t *>(requestPayload);
-    args[0] = 19;
-    args[1] = 23;
-
-    responseLen = 0;
-    ASSERT_EQ(0,
-              __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
-    EXPECT_EQ(sizeof(std::int32_t), responseLen);
-    EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
-
-    __cudaq_device_call_safely_release_realtime_frame(frame);
-  }
-}
-
-TEST_F(DeviceCallDispatchTest, ReinitializesThroughDiscoveredPlugin) {
-  ASSERT_NO_THROW(finalizeRuntime());
-  ASSERT_NO_THROW(initializeGpuRuntime(TestGpuTable::AddThemOffset));
-
-  std::array<std::int32_t, 2> request{19, 23};
-  std::int32_t response = 0;
-  std::uint64_t responseLen = 0;
-  ASSERT_EQ(0,
-            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
-                                    request.size() * sizeof(request[0]),
-                                    &response, sizeof(response), &responseLen));
-  EXPECT_EQ(sizeof(response), responseLen);
-  EXPECT_EQ(142, response);
-}
-
-class HostGraphDispatchFrameTest : public ::testing::Test {
-protected:
-  void SetUp() override { ASSERT_NO_THROW(initializeHostRuntime()); }
-
-  void TearDown() override {
-    if (frame)
-      __cudaq_device_call_safely_release_realtime_frame(frame);
-    ASSERT_NO_THROW(finalizeRuntime());
-  }
-
-  void *frame = nullptr;
-};
-
-TEST_F(HostGraphDispatchFrameTest, DispatchesGraphLaunchThroughFrameLease) {
-  void *requestPayload = nullptr;
-  void *responsePayload = nullptr;
-  ASSERT_EQ(DeviceCallSuccessStatus,
-            __cudaq_device_call_acquire_realtime_frame(
-                0, GraphAddThemFunctionId, 2 * sizeof(std::int32_t),
-                sizeof(std::int32_t), &frame, &requestPayload,
-                &responsePayload));
-  ASSERT_NE(nullptr, frame);
-  ASSERT_NE(nullptr, requestPayload);
-  ASSERT_NE(nullptr, responsePayload);
-
-  auto *const args = static_cast<std::int32_t *>(requestPayload);
-  args[0] = 19;
-  args[1] = 23;
-
-  std::uint64_t responseLen = 0;
-  ASSERT_EQ(DeviceCallSuccessStatus,
-            __cudaq_device_call_dispatch_realtime_frame(frame, &responseLen));
-  EXPECT_EQ(sizeof(std::int32_t), responseLen);
-  EXPECT_EQ(42, *static_cast<std::int32_t *>(responsePayload));
-
-  __cudaq_device_call_safely_release_realtime_frame(frame);
-  frame = nullptr;
-}
-
-class DeviceCallServicePluginTest : public ::testing::Test {
-protected:
-  void SetUp() override { ASSERT_NO_THROW(initializeGpuRuntime()); }
-
-  void TearDown() override { ASSERT_NO_THROW(finalizeRuntime()); }
-};
-
-TEST_F(DeviceCallServicePluginTest, DispatchesThroughDiscoveredPlugin) {
-  std::array<std::int32_t, 2> request{19, 23};
-  std::int32_t response = 0;
-  std::uint64_t responseLen = 0;
-
-  ASSERT_EQ(0,
-            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
-                                    request.size() * sizeof(request[0]),
-                                    &response, sizeof(response), &responseLen));
-  EXPECT_EQ(sizeof(response), responseLen);
-  EXPECT_EQ(42, response);
-}
-
-TEST_F(DeviceCallServicePluginTest, FinalizeClearsPluginSession) {
-  ASSERT_NO_THROW(finalizeRuntime());
-
-  std::array<std::int32_t, 2> request{19, 23};
-  std::int32_t response = 0;
-  std::uint64_t responseLen = 0;
-  EXPECT_EQ(DeviceCallNotInitializedStatus,
-            dispatchUsingFrameLease(0, AddThemFunctionId, request.data(),
-                                    request.size() * sizeof(request[0]),
-                                    &response, sizeof(response), &responseLen));
+int populateAddThemTable(cudaq_function_entry_t *entries, bool useOffset,
+                         cudaStream_t stream) {
+  if (useOffset)
+    initDeviceCallTableWithOffset<<<1, 1, 0, stream>>>(entries);
+  else
+    initDeviceCallTable<<<1, 1, 0, stream>>>(entries);
+  return cudaGetLastError() == cudaSuccess ? 0 : 1;
 }
 
-} // namespace
+} // namespace cudaq_internal::device_call::test