Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions realtime/include/cudaq/realtime/daemon/dispatcher/cudaq_realtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@

#include "cudaq/realtime/daemon/dispatcher/rpc_wire_format.h"

// Visibility marker for entry points that consumers reach via
// dlsym(RTLD_DEFAULT,
// ...) at runtime. libcudaq-realtime-dispatch.a is built with hidden
Comment on lines +18 to +19

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nits: line break (clang-format fallout, perhaps)

// visibility
// + -Wl,--exclude-libs=ALL, so by default its symbols stay hidden inside the
// final binary even when the archive is absorbed. Marking individual symbols
// with default visibility opts them back into the binary's dynamic symbol table
// (when --export-dynamic is in effect on the linker command line for the exe),
// so a separately-loaded .so can resolve them by name without any explicit
// setter / constructor-shim plumbing on the consumer side.
#define CUDAQ_REALTIME_DISPATCH_API __attribute__((visibility("default")))

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -116,6 +128,21 @@ typedef struct {
// external GPU kernel (e.g. Hololink TX) polls the
// same tx_flags array; the sentinel would be
// misinterpreted as a valid address.
uint32_t shared_ring_mode; // when non-zero, the dispatcher cooperates with
// OTHER dispatchers on the SAME ring buffer.
// Slots whose function_id is not in this
// dispatcher's function table (or is in the
// table but does not match this dispatcher's
// expected dispatch_mode) are SKIPPED without
// clearing rx_flags -- the local cursor
// advances, leaving the slot for another
// dispatcher to pick up. When zero (default),
// legacy behavior: unknown / wrong-mode slots
// are DROPPED (rx_flags cleared). Both
// dispatchers sharing a ring must set this to
// non-zero; the partitioning invariant is that
// each function_id appears in AT MOST ONE
// dispatcher's function table.
} cudaq_dispatcher_config_t;

// GPU ring buffer pointers. For device backend use device pointers only.
Expand Down Expand Up @@ -158,6 +185,15 @@ typedef struct {
uint8_t dispatch_mode; // cudaq_dispatch_mode_t value
uint8_t reserved[3]; // padding
cudaq_handler_schema_t schema; // function signature schema
// Optional sub-routing key for CUDAQ_DISPATCH_GRAPH_LAUNCH entries. When
// multiple GRAPH_LAUNCH entries share the same `function_id` (the multi-
// instance pattern used by e.g. the QEC realtime decoder suite, where
// the same `enqueue_syndromes` function name fronts N distinct captured
// graphs -- one per decoder), the host monitor disambiguates them by
// `routing_key`, matching it against the request payload's first 8
// bytes (arg0). Ignored when dispatch_mode != CUDAQ_DISPATCH_GRAPH_LAUNCH.
// See proposals/cudaq_realtime_host_api.bs#host-path-graph-routing-key.
uint64_t routing_key;
} cudaq_function_entry_t;

// Function table for device-side dispatch
Expand All @@ -174,8 +210,13 @@ typedef void (*cudaq_dispatch_launch_fn_t)(
volatile int *shutdown_flag, uint64_t *stats, size_t num_slots,
uint32_t num_blocks, uint32_t threads_per_block, cudaStream_t stream);

// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a)
void cudaq_launch_dispatch_kernel_regular(
// Default dispatch kernel launch helpers (from libcudaq-realtime-dispatch.a).
// Marked CUDAQ_REALTIME_DISPATCH_API so the symbol stays in the dynamic table
// after the archive is absorbed into a binary; consumer .so's that dlsym() it
// at runtime (e.g. cuda-qx's libcudaq-qec-realtime-decoding.so) can then
// resolve it without any explicit setter/constructor-shim plumbing on the
// consumer side.
CUDAQ_REALTIME_DISPATCH_API void cudaq_launch_dispatch_kernel_regular(
volatile uint64_t *rx_flags, volatile uint64_t *tx_flags, uint8_t *rx_data,
uint8_t *tx_data, size_t rx_stride_sz, size_t tx_stride_sz,
cudaq_function_entry_t *function_table, size_t func_count,
Expand Down Expand Up @@ -387,6 +428,25 @@ cudaError_t
cudaq_dispatch_kernel_cooperative_query_occupancy(int *out_blocks,
uint32_t threads_per_block);

// Push the shared_ring_mode flag into the DEVICE_LOOP kernel's __constant__
// memory. Must be called BEFORE cudaq_dispatcher_start() launches the
// device kernel; otherwise the kernel will start with shared_ring_mode=0.
//
// IMPORTANT: cudaq_dispatcher_start() does NOT call this for you. The
// __constant__ symbol lives in libcudaq-realtime-dispatch.a, which is
// linked directly into consumers (not into libcudaq-realtime.so), so the
// dispatcher manager cannot reach the symbol from inside the shared
// library. Consumers that set config.shared_ring_mode = 1 must also call
// cudaq_dispatch_kernel_set_shared_ring_mode(1) before starting the
// dispatcher. The HOST_LOOP path reads config.shared_ring_mode directly
// and does NOT require this call.
//
// CUDAQ_REALTIME_DISPATCH_API: see cudaq_launch_dispatch_kernel_regular for
// the rationale -- consumers (e.g. cuda-qx's libcudaq-qec-realtime-decoding.so)
// resolve this entry point via dlsym(RTLD_DEFAULT, ...) at runtime.
CUDAQ_REALTIME_DISPATCH_API cudaError_t
cudaq_dispatch_kernel_set_shared_ring_mode(uint32_t enabled);

#ifdef __cplusplus
}
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ typedef struct {
void *pre_launch_data;
void (*post_launch_fn)(void *user_data, void *slot_dev, cudaStream_t stream);
void *post_launch_data;
/// Optional sub-routing key for `function_id` collisions across workers.
/// When several workers share the same `function_id` but back different
/// captured graphs, the monitor uses (function_id, routing_key) to
/// disambiguate. The runtime routing key comes from the request
/// payload's first 8 bytes (arg0); a worker matches only if both
/// function_id and routing_key match. Set to 0 when sub-routing isn't
/// needed (the historical function_id-only match).
/// See proposals/cudaq_realtime_host_api.bs#host-path-graph-routing-key.
uint64_t routing_key;

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: do we need to initialize this routing_key in cudaq_host_dispatcher_start_thread similar to other fields?

} cudaq_host_dispatch_worker_t;

typedef struct {
Expand Down
14 changes: 14 additions & 0 deletions realtime/lib/daemon/dispatcher/cudaq_realtime_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,20 @@ cudaq_status_t cudaq_dispatcher_start(cudaq_dispatcher_t *dispatcher) {
if (cudaStreamCreate(&dispatcher->stream) != cudaSuccess)
return CUDAQ_ERR_CUDA;

// NOTE on config.shared_ring_mode for DEVICE_LOOP:
//
// The device dispatch kernel reads shared_ring_mode from a __constant__
// symbol that lives in libcudaq-realtime-dispatch.a (the static lib).
// libcudaq-realtime.so does NOT link the static lib (architecturally
// separate: consumers link the static lib themselves), so we cannot
// call cudaq_dispatch_kernel_set_shared_ring_mode() from here.
//
// Callers that want shared_ring_mode for DEVICE_LOOP must invoke
// cudaq_dispatch_kernel_set_shared_ring_mode(1) themselves BEFORE
// cudaq_dispatcher_start(). The HOST_LOOP path reads
// config.shared_ring_mode directly from this struct (it has no
// __constant__ indirection) -- nothing needed here.

if (dispatcher->config.kernel_type == CUDAQ_KERNEL_UNIFIED) {
dispatcher->unified_launch_fn(
dispatcher->transport_ctx, dispatcher->table.entries,
Expand Down
Loading
Loading