diff --git a/.gitignore b/.gitignore
index 5c17336a..e90cb2ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,6 @@ driver/kcompat/.scratch/
 
 # Python test coverage
 .coverage
+
+# Project-local scratch space
+/tmp/
diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst
index 6fbc1faa..25045cc5 100644
--- a/docs/reference/kernel-abi/index.rst
+++ b/docs/reference/kernel-abi/index.rst
@@ -336,8 +336,8 @@ Memory transfers via QDMA: ``/dev/slash_qdma_ctl<N>``
 The QDMA device manages DMA queue pairs for bulk data movement between host memory and the card's
 on-board memory (HBM or DDR). Each queue pair is allocated with a mode (currently only MM) and a
 direction mask, then started before use. An anon-inode fd obtained from the queue pair serves as
-the I/O channel: ``write()`` performs H2C transfers, ``read()`` performs C2H transfers, and the
-file position encodes the device-side physical address.
+the transfer channel: DMA buffers are created once (``SLASH_QDMA_IOCTL_BUF_CREATE``), and transfer
+ioctls name the buffer fd, buffer offset, device-side physical address, length, and direction.
 
 - **Device file name:** ``/dev/slash_qdma_ctl<N>`` (e.g. ``/dev/slash_qdma_ctl0``)
 - **Sysfs name:** ``slash_qdma_ctl_<PCI-BDF>`` (e.g. ``/sys/class/misc/slash_qdma_ctl_0000:61:00.1``)
@@ -353,9 +353,9 @@ Usage
 -----
 
 In order to transfer data via QDMA, a queue pair must be added, started, and an I/O fd needs
-to be created. The I/O fd treats the file position as the device-side physical address:
-``write()`` performs an H2C (host-to-card) transfer, and ``read()`` performs a C2H (card-to-host)
-transfer. Full lifecycle:
+to be created. The I/O fd is ioctl-only for data movement: userspace creates a kernel-owned DMA
+buffer, then issues transfer ioctls that name the buffer fd, buffer offset, device-side address,
+length, and direction. Full lifecycle:
 
 .. code-block:: c
 
@@ -381,32 +381,58 @@ transfer. Full lifecycle:
     };
     int io_fd = ioctl(qdma_fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &fd_req);
 
-    /* Step 4: H2C transfer to device address 0x4000000000 */
-    pwrite(io_fd, host_buf, nbytes, 0x4000000000LL);
+    /* Step 4: Create a kernel-owned DMA buffer and mmap it for CPU access.
+     * The buffer fd is returned by the ioctl; the kernel allocated the pages,
+     * built the SGL, and DMA-mapped everything once. */
+    struct slash_qdma_buf_create bc = { .size = sizeof(bc), .length = nbytes };
+    int buf_fd = ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &bc);
+    void *host_buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          buf_fd, 0);
+
+    /* Step 5: H2C transfer to device address 0x4000000000.  The transfer
+     * carries an array of per-qpair sub-transfers; a single-channel fd uses
+     * one sub-transfer with qpair_index 0. */
+    struct slash_qdma_transfer xfer = {
+        .size = sizeof(xfer),
+        .count = 1,
+        .xfers[0] = {
+            .qpair_index = 0,
+            .direction = SLASH_QDMA_XFER_H2C,
+            .buf_fd = buf_fd,
+            .buf_offset = 0,
+            .dev_addr = 0x4000000000LL,
+            .length = nbytes,
+        },
+    };
+    ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer);
 
-    /* Step 5: C2H transfer from device address 0x4000000000 */
-    pread(io_fd, host_buf, nbytes, 0x4000000000LL);
+    /* Step 6: C2H transfer from device address 0x4000000000 */
+    xfer.xfers[0].direction = SLASH_QDMA_XFER_C2H;
+    ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer);
 
-    /* Step 6: Teardown */
+    /* Step 7: Teardown — closing the buffer fd (after munmap) releases it. */
+    munmap(host_buf, nbytes);
+    close(buf_fd);
     close(io_fd);
     op.op = 1;  ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op);  /* STOP */
     op.op = 2;  ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op);  /* DEL */
 
-The file position can also be set explicitly with ``lseek`` before a plain ``read()``/``write()``:
-
-.. code-block:: c
-
-    lseek(io_fd, 0x1000, SEEK_SET);
-    write(io_fd, src_buf, nbytes);
-
-``lseek`` supports all flags ``SEEK_SET``, ``SEEK_CUR``, and ``SEEK_END``, and both ``pread`` and
-``pwrite`` are supported. However, the fd does **not** support ``mmap``, ``poll``/``select``, or
-``splice``.
+The qpair fd does **not** support ``read``, ``write``, ``pread``, ``pwrite``, ``mmap``,
+``poll``/``select``, or ``splice`` for data movement.  Buffer fds returned by
+``SLASH_QDMA_IOCTL_BUF_CREATE`` **are** mappable with ``mmap`` (full length,
+offset 0).
 
 All transfers are synchronous and block until the transfer completes or times out. The timeout is
 **10 seconds**; after expiry the call returns ``-ETIME``. Partial transfers are possible; the
 return value is the number of bytes transferred, and the file position is advanced accordingly.
 
+The userspace buffer address and ``count`` must be page-aligned: the address
+must be 4 KiB-aligned and ``count`` must be a non-zero multiple of 4 KiB. The
+transfer is backed by 4 KiB base pages, one descriptor per page. Transparent
+hugepages are not accepted, so callers using anonymous mappings should apply
+``MADV_NOHUGEPAGE`` before faulting pages when they need deterministic
+base-page transfers.
+
 Multiple fds can be obtained for the same qpair via multiple ``QPAIR_GET_FD`` calls, including
 from different processes. Concurrent ``read()``/``write()`` calls on the same qpair (from any
 fd or thread) are serialized by the kernel and execute one at a time; for parallel I/O, allocate
@@ -425,7 +451,7 @@ The following errno values can be returned by ``read()`` and ``write()`` on the
    * - ``-ENODEV``
      - Device shutting down, or the required direction is not enabled for this qpair
    * - ``-EINVAL``
-     - Zero-length transfer (``count`` results in 0 pages)
+     - Zero-length, unaligned, or non-page-multiple transfer
    * - ``-ENOMEM``
      - SGL allocation failure
    * - ``-EFAULT``
@@ -667,38 +693,49 @@ removed.
 ``SLASH_QDMA_IOCTL_QPAIR_GET_FD``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Creates a new file descriptor for data transfer on an existing queue pair. The returned fd supports
-``read``, ``write``, ``pread``, ``pwrite``, and ``lseek``; it does **not** support ``mmap``,
-``poll``/``select``, or ``splice``. Multiple fds can be obtained for the same qpair via multiple
-calls. The fd is returned as the ``ioctl()`` return value.
+Creates a new file descriptor for data transfer.  The fd is a **collection of one or two queue
+pairs** (typically one per AXI-MM/NoC channel): a transfer issued on it selects a bound queue pair
+by index, so one transfer ioctl can fan across both channels.  The returned fd is ioctl-only for
+data movement: it supports the buffer-create and transfer ioctls, but not ``read``,
+``write``, ``pread``, ``pwrite``, ``mmap``, ``poll``/``select``, or ``splice`` (an optional
+``io_uring`` ``uring_cmd`` async transfer path is available on capable kernels).  Multiple fds can
+be obtained for the same qpair(s) via multiple calls.  The fd is returned as the ``ioctl()`` return
+value.
 
 **Interface:**
 
 .. code-block:: c
 
+    #define SLASH_QDMA_FD_MAX_QPAIRS 2u
+
     #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request)
 
     struct slash_qdma_qpair_fd_request {
-        __u32 size;   /* [in/out] ABI version */
-        __u32 qid;    /* [in]     Queue pair ID (must exist and be non-empty) */
-        __u32 flags;  /* [in]     fd flags: only O_CLOEXEC is honoured */
+        __u32 size;        /* [in/out] ABI version */
+        __u32 qid;         /* [in]     Legacy single qpair ID; used when qpair_count == 0 */
+        __u32 flags;       /* [in]     fd flags: only O_CLOEXEC is honoured */
+        __u32 qpair_count; /* [in]     Number of qpair_ids (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use qid */
+        __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /* [in] qpair IDs; index == qpair_index */
     };
 
-**Direction:** ``_IOWR`` — userspace writes ``qid`` and ``flags``; the kernel returns the new fd
-as the ``ioctl()`` return value (not as a struct field).
+**Direction:** ``_IOWR`` — userspace writes the qpair selection and ``flags``; the kernel returns
+the new fd as the ``ioctl()`` return value (not as a struct field).
 
 **Preconditions:**
 
-- ``size`` must cover at least ``flags`` (the trailing input field) — otherwise ``-EINVAL``
-- ``qid`` must refer to an existing, non-empty queue pair
+- ``size`` must cover at least ``flags`` (the trailing input field of the legacy form) — otherwise ``-EINVAL``
+- The selected queue pairs must exist and be non-empty (``qpair_count == 0`` selects the single ``qid``)
+- ``qpair_count`` must not exceed ``SLASH_QDMA_FD_MAX_QPAIRS``
 - ``flags & ~O_CLOEXEC == 0`` (any other bits cause ``-EINVAL``)
-- The queue pair should be in the started state for I/O to work
+- The queue pairs should be in the started state for I/O to work
+- Each bound qpair keeps the per-qpair configuration (``mm_channel``, ring sizes, directions) it was
+  given at ``QPAIR_ADD`` time, so the two channels can be configured independently
 
 **Postconditions:**
 
 - The return value is a non-negative fd number on success.
-- The fd holds a reference on both the qpair entry and the device; neither can be freed while
-  this fd is open.
+- The fd holds a reference on the qpair entry, the device, and the client context that owns
+  registered buffers; none of them can be freed while this fd is open.
 
 **Return values:**
 
@@ -710,6 +747,146 @@ as the ``ioctl()`` return value (not as a struct field).
 - ``-ENOMEM`` — allocation failure
 - Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()``
 
+``SLASH_QDMA_IOCTL_BUF_CREATE``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Creates a kernel-owned DMA buffer and returns a mappable fd for it. The ioctl may be issued on
+either the QDMA control fd or a qpair fd of the same device. The kernel allocates ``length`` bytes
+as a set of 4 KiB base pages (not physically contiguous), builds the transfer scatter-gather list,
+and DMA-maps every page **once** — so the steady-state transfer path only slices the prebuilt SGL,
+syncs the touched pages, and submits. Userspace maps the returned fd with ``mmap`` to obtain a CPU
+pointer and passes the fd in ``struct slash_qdma_subxfer`` to move data. The buffer is bound to the
+fd's QDMA device; transfers must use a qpair fd of that same device.
+
+**Interface:**
+
+.. code-block:: c
+
+    #define SLASH_QDMA_IOCTL_BUF_CREATE _IOWR('v', 0x54, struct slash_qdma_buf_create)
+
+    struct slash_qdma_buf_create {
+        __u32 size;          /* [in/out] ABI version */
+        __u32 flags;         /* [in]  Only O_CLOEXEC is honoured */
+        __u64 length;        /* [in]  Buffer length in bytes (page multiple) */
+        __u32 granule;       /* [out] Bytes per SGL descriptor (host page size) */
+        __u32 transfer_hint; /* [out] enum slash_qdma_transfer_hint */
+    };
+
+**Direction:** ``_IOWR`` — issued on the control fd or a qpair fd. Userspace writes ``flags`` and
+``length``; the kernel writes back ``granule`` and ``transfer_hint`` and returns the new buffer fd
+as the ``ioctl()`` return value (same convention as the BAR/queue-pair fd ioctls).
+
+The returned fd:
+
+- is ``mmap``-able (full length, offset 0, ``MAP_SHARED``) for CPU access to the buffer;
+- releases the buffer when it (and any mapping) is closed — there is no explicit unregister ioctl;
+- keeps its pages (and DMA mapping) alive as long as either the fd or any mapping exists.
+
+``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be
+best for this buffer on the current hardware. Current SLASH hardware returns
+``SLASH_QDMA_TRANSFER_HINT_V80``; userspace may ignore this value. Known values are:
+
+.. code-block:: c
+
+    enum slash_qdma_transfer_hint {
+        SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1,
+        SLASH_QDMA_TRANSFER_HINT_V80          = 2,
+    };
+
+``SLASH_QDMA_TRANSFER_HINT_V80`` asks userspace to apply the V80 placement-aware channel policy:
+spread a transfer across both AXI-MM channels so each NoC ingress master (NMU) drives an
+independent memory endpoint (NSU). The marker is opaque; the client computes the actual split from
+the buffer's device address (DDR ranges are halved across the two channels, while HBM ranges are
+routed by the 16 GiB half-memory boundary). ``SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR`` keeps all
+traffic on a single queue.
+
+**Preconditions:**
+
+- ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL``
+- ``flags`` must contain only ``O_CLOEXEC``
+- ``length`` must be a non-zero multiple of the page size
+
+**Postconditions:**
+
+- the ``ioctl()`` return value is the new buffer fd (``>= 0``)
+- ``granule`` is the per-descriptor page size (4 KiB); ``transfer_hint`` is an advisory topology hint
+- the pages stay allocated and DMA-mapped until the fd and all mappings are closed and no transfer
+  is in flight
+
+**Return values:**
+
+- ``>= 0`` — the new buffer fd (success)
+- ``-EFAULT`` — copy failure
+- ``-EINVAL`` — ``size`` too small, unsupported ``flags`` bits, or misaligned/zero ``length``
+- ``-ENOMEM`` — page allocation or DMA-mapping failure
+- ``-ENODEV`` — device shutting down
+- Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()``
+
+The ``'v'`` ``0x55`` ioctl number is reserved (it was the removed
+``SLASH_QDMA_IOCTL_BUF_UNREGISTER``; kernel buffers are now released by closing the fd).
+
+``SLASH_QDMA_QPAIR_IOCTL_TRANSFER``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Performs a DMA transfer batch using kernel buffers. Unlike ``read``/``write``/``pread``/``pwrite``,
+this ioctl is issued on a **queue-pair I/O fd** (from ``SLASH_QDMA_IOCTL_QPAIR_GET_FD``), not the
+control device. The transfer carries an array of per-qpair sub-transfers; sub-transfers that target
+distinct queue pairs are submitted **concurrently** (all but the last asynchronously, the last
+blocking, then awaited), so a single ioctl can drive both NoC channels in parallel. No pages are
+allocated or DMA-mapped on this path — that work was amortised at ``BUF_CREATE`` time — so each
+sub-transfer syncs and submits the cached, pre-DMA-mapped SGL slice directly.
+
+**Interface:**
+
+.. code-block:: c
+
+    #define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer)
+
+    struct slash_qdma_subxfer {
+        __u32 qpair_index; /* [in] Index into the fd's bound qpairs */
+        __u32 direction;   /* [in] 1=H2C (write), 2=C2H (read) */
+        __s32 buf_fd;      /* [in] Kernel buffer fd from BUF_CREATE */
+        __u32 pad0;        /* padding */
+        __u64 buf_offset;  /* [in] Byte offset within the buffer */
+        __u64 dev_addr;    /* [in] Device-side (endpoint) address */
+        __u64 length;      /* [in] Number of bytes to transfer */
+    };
+
+    struct slash_qdma_transfer {
+        __u32 size;   /* [in/out] ABI version */
+        __u32 count;  /* [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS) */
+        struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS];
+    };
+
+**Direction:** ``_IOWR`` — userspace writes all input fields; the total number of bytes transferred
+across all sub-transfers is returned as the ``ioctl()`` return value (not as a struct field).
+
+**Preconditions:**
+
+- ``size`` must cover at least ``count`` (the trailing header field) — otherwise ``-EINVAL``
+- ``count`` must be in ``[1, SLASH_QDMA_FD_MAX_QPAIRS]``
+- each sub-transfer's ``qpair_index`` must be less than the number of qpairs the fd owns
+- each ``direction`` must be 1 (H2C) or 2 (C2H) and must be enabled on the selected queue pair
+- each ``buf_fd`` must be a buffer fd (from ``BUF_CREATE``) bound to the same device as this qpair fd
+- each ``buf_offset`` and ``length`` must be aligned to the buffer's page granule, ``length`` non-zero
+  and ``<= UINT_MAX``, and ``buf_offset + length`` must not exceed the buffer length
+
+**Return values:**
+
+- ``>= 0`` — total number of bytes transferred (success)
+- ``-EFAULT`` — copy failure
+- ``-EBADF`` — a ``buf_fd`` is not a valid open fd
+- ``-EINVAL`` — ``size``/``count`` invalid, bad ``qpair_index``/``direction``, a ``buf_fd`` that is not
+  a SLASH buffer or belongs to another device, or an out-of-range / misaligned slice
+- ``-ENODEV`` — device shutting down or the requested direction is not enabled on the qpair
+- Other negative errno from libqdma's ``qdma_request_submit()`` (the first sub-transfer error wins)
+
+An optional asynchronous form of this transfer is exposed via ``io_uring`` ``uring_cmd`` (opcode
+``SLASH_QDMA_URING_CMD_TRANSFER``), available only on kernels built with ``CONFIG_IO_URING`` and
+``uring_cmd`` support. The SQE inline command carries a single ``__u64`` userspace pointer to a
+``struct slash_qdma_transfer``; the completion CQE ``res`` holds the total bytes transferred or a
+negative errno. This lets many buffer transfers be kept in flight from a single thread.
+
 Device resets and hotplugging: ``/dev/slash_hotplug``
 =====================================================
 
diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst
index 563a81b5..2fd1ccb9 100644
--- a/docs/reference/smi/commands.rst
+++ b/docs/reference/smi/commands.rst
@@ -151,11 +151,27 @@ validate
 --------
 
 Run memory integrity and bandwidth tests against a board's HBM and DDR
-subsystems.
+subsystems. For each memory path, bandwidth is reported as single-direction
+C2H read, single-direction H2C write, and simultaneous bidirectional
+throughput (read, write, and total). After the per-memory phases, a final
+parallel phase drives HBM and DDR simultaneously with ``2 * N`` buffers for
+single-direction tests and ``4 * N`` threads for bidirectional tests; this
+phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given.
 
 .. code-block:: text
 
-   v80-smi validate -d <BDF> [-j|--threads <N>]
+   v80-smi validate -d <BDF> [-j|--threads <N>] [-R|--no-reset] [--mm-channel <spec>] [--buffer-size <size>] [--offset <size>] [--starting-offset <size>] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation <auto|paired>] [--channel-region-stride <size>] [--ring-size-index <0-15>] [--bandwidth-iterations <N>] [--bandwidth-duration <seconds>]
+
+Requirements by mode:
+
+* Default mode uses VRTD buffers, requires a running VRTD daemon, and resets
+  the board unless ``--no-reset`` is given.
+* ``--raw-transfer-test`` bypasses VRTD for transfers and requires the SLASH
+  QDMA driver device node for the board. It skips reset.
+* ``--use-qdma-driver`` bypasses both VRTD and SLASH for transfers and requires
+  the stock ``qdma-pf`` driver to be bound to the board's QDMA PF. This backend
+  is built only when ``SMI_ENABLE_QDMA_DRIVER_BACKEND`` is enabled at CMake
+  configure time.
 
 .. option:: -d, --device <BDF>
 
@@ -164,6 +180,140 @@ subsystems.
 .. option:: -j, --threads <N>
 
    Number of parallel buffers/threads for the validation test (1–64, default 8).
+   Bidirectional phases use ``2 * N`` logical positions in each enabled memory
+   space.
+
+.. option:: --buffer-size <size>
+
+   Size of each test buffer. Values may be bare bytes or use ``k``/``K`` or
+   ``m``/``M`` suffixes. The default and maximum are ``512M``. Values must be
+   4 KiB-aligned.
+
+.. option:: --offset <size>
+
+   Distance between logical buffer positions. The default is ``512M``. Values
+   may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes, must be
+   4 KiB-aligned, and must be at least ``--buffer-size`` so buffers do not
+   overlap.
+
+.. option:: --starting-offset <size>
+
+   Offset from each memory-space base for logical position 0. The default is
+   ``0``. Values may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes
+   and must be 4 KiB-aligned.
+
+Buffers are placed at ``memory_base + starting_offset + position * offset``.
+Single-direction phases use positions ``0..N-1``. Bidirectional phases use
+positions ``0..2N-1`` with reads on even positions and writes on odd positions.
+The full range must remain inside the 64 x 512 MB DDR/HBM address space. If any
+placement option is specified in default VRTD mode, ``validate`` uses raw VRTD
+buffers so the exact addresses are honored; this requires raw memory access
+permission.
+
+The largest phase maps up to ``4 * N * buffer-size`` of host buffers when both
+HBM and DDR are enabled, or ``2 * N * buffer-size`` with ``--ddr-only`` or
+``--hbm-only``; the command fails early if that exceeds currently available
+host memory.
+
+.. option:: -R, --no-reset
+
+   Skip the device reset step before running memory tests.
+
+.. option:: --mm-channel <spec>
+
+   AXI-MM / NoC channel selection for each buffer's QDMA queue pair, in every
+   mode. ``spec`` is either a single value applied to all buffers, or a
+   comma-separated list giving one channel per logical buffer position
+   (exactly ``2 x --threads`` entries; there is no repeating/wrap, and any
+   other length is an error):
+
+   * ``auto`` (the default) lets the driver stripe queues across both channels
+     by ``qid & 1``.
+   * ``0`` / ``1`` pin the queue to that AXI-MM channel (and hence NoC channel).
+   * e.g. with ``-j 1`` the list ``0,1`` puts buffer position 0 on channel 0 and
+     position 1 on channel 1. Bidirectional phases use positions ``0..2N-1``;
+     single-direction phases use the first ``N`` entries.
+
+   This is independent of ``--channel-allocation`` (which controls the device
+   address): ``--mm-channel`` controls the host-side NoC ingress (NMU) per
+   queue. With ``--use-qdma-driver`` the selection maps to the stock driver's
+   per-queue MM-channel attribute.
+
+.. option:: --raw-transfer-test
+
+   Use libslash raw QDMA transfers instead of VRTD buffers. This mode implies
+   ``--no-reset`` and requires the SLASH QDMA driver device to be present.
+
+.. option:: --use-qdma-driver
+
+   Run the raw transfer test over the off-the-shelf Xilinx QDMA driver
+   (``/dev/qdma<idx>-MM-<qid>``) instead of SLASH. smi provisions the queues
+   itself: it raises the function's ``qmax`` via sysfs if needed, creates and
+   starts bidirectional AXI-MM queue pairs over generic netlink (the same
+   ``xnl_pf`` interface ``dma-ctl`` uses), then transfers over the per-queue
+   char devices. Queue pairs are spread round-robin across the function's MM
+   engine channels (``channel = qid % mm_channel_max``); the CPM5 QDMA on the
+   V80 exposes two, so the test exercises both. This mode implies
+   ``--no-reset`` and is mutually exclusive with ``--raw-transfer-test``. It
+   requires the stock ``qdma-pf`` driver to be bound to the board's PF (it
+   cannot be bound at the same time as the SLASH driver), and typically
+   requires root to raise ``qmax`` and open the queue devices.
+
+.. option:: --ddr-only
+
+   Run only the DDR memory tests and skip the HBM phase. Mutually exclusive
+   with ``--hbm-only``.
+
+.. option:: --hbm-only
+
+   Run only the HBM memory tests and skip the DDR phase. Mutually exclusive
+   with ``--ddr-only``.
+
+.. option:: --channel-allocation <auto|paired>
+
+   Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``) control
+   over how QDMA MM/NoC channels map onto device memory. On CPM5 the host-side
+   NoC ingress port (NMU) is chosen per queue by the SW-context
+   mm-channel/host_id (SLASH uses ``qid & 1``), while the memory-side NoC egress
+   endpoint (NSU / pseudo-channel) is chosen by the device address. Default
+   ``auto`` keeps the historical behaviour: channel ``qid & 1`` with linear
+   addressing, so both NMUs can converge on a single NSU and bandwidth caps at
+   one path. ``paired`` couples the two: even positions land in memory region 0
+   on channel 0, odd positions in region 1 on channel 1 (one
+   ``--channel-region-stride`` apart), giving two independent NMU->NSU paths.
+   This mirrors the off-the-shelf ``dma-perf`` ``offset_ch0``/``offset_ch1``
+   knobs and is the placement that lets both NoC ports contribute bandwidth.
+
+.. option:: --channel-region-stride <size>
+
+   In ``--channel-allocation paired`` mode, the byte distance between the two
+   per-channel memory regions (the NSU / pseudo-channel stride). Default ``16G``
+   (== half the per-memory address space, matching the dma-perf HBM
+   ``offset_ch1 - offset_ch0`` spacing). Must be a non-zero multiple of 4 KiB.
+   Accepts bare bytes or ``k``/``K``, ``m``/``M``, ``g``/``G`` suffixes.
+
+.. option:: --ring-size-index <0-15>
+
+   Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``).
+   Override the QDMA descriptor-ring size index used when creating SLASH raw
+   queue pairs or starting stock-driver queues. When omitted, each backend keeps
+   its existing default. Useful A/B values for 4 KiB descriptor throughput are
+   ``0``, ``11``, ``13``, and ``15``.
+
+.. option:: --bandwidth-iterations <N>
+
+   Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). Repeat
+   each whole-buffer transfer in every bandwidth phase ``N`` times and report
+   bandwidth over the sustained loop. The default is ``1``, which preserves the
+   historical one-shot measurement.
+
+.. option:: --bandwidth-duration <seconds>
+
+   Raw-transfer-only duration mode. When non-zero, each bandwidth phase repeats
+   whole-buffer transfers until the requested wall-clock duration has elapsed
+   and counts only completed transfers. This is useful for comparing SLASH's raw
+   path against long-running tools such as ``dma-perf``. A value of ``0`` uses
+   ``--bandwidth-iterations`` instead.
 
 debug
 -----
diff --git a/driver/Makefile b/driver/Makefile
index 98a56815..ac28900e 100644
--- a/driver/Makefile
+++ b/driver/Makefile
@@ -42,8 +42,19 @@ else
   LIBQDMA_PATH := $(LIBQDMA_FALLBACK)
 endif
 
+# SLASH carries a few local modifications to the pinned QDMA submodule's
+# libqdma sources (see $(LIBQDMA_PATCH_DIR)/). The submodule itself stays
+# pristine; the patches are applied to whichever libqdma tree is being built
+# (the DKMS-local ./libqdma or the in-tree submodule) by the libqdma-patches
+# target before the module is compiled. See that target for details.
+LIBQDMA_PATCH_DIR := patches
+
 SLASH_QDMA_OP_DEBUG ?= 0
 
+# Per-transfer timing instrumentation. Set to 1 to emit one dmesg line per
+# DMA transfer breaking down the kernel phases. Default off (zero overhead).
+SLASH_QDMA_TIMING ?= 0
+
 # Kcompat feature flags. Defaults are "n"; the all: recipe runs
 # driver/kcompat/probe.sh against $(KDIR) to detect the actual values
 # and passes them into the kbuild recursion. Each pair (modern API +
@@ -51,6 +62,8 @@ SLASH_QDMA_OP_DEBUG ?= 0
 # absent, the legacy form is the unconditional fallback in slash_compat.h.
 SLASH_HAVE_VM_FLAGS_SET ?= n
 SLASH_HAVE_MODULE_IMPORT_NS_TOKEN ?= n
+SLASH_HAVE_URING_CMD ?= n
+SLASH_HAVE_URING_SQE_CMD ?= n
 
 # Set GCOV=1 to instrument the module for kernel gcov coverage.
 # Not set by default — never enable this in production builds.
@@ -72,6 +85,7 @@ ccflags-y += \
 	\
 	-DTANDEM_BOOT_SUPPORTED=1 \
 	-DSLASH_QDMA_OP_DEBUG=$(SLASH_QDMA_OP_DEBUG) \
+	-DSLASH_QDMA_TIMING=$(SLASH_QDMA_TIMING) \
 	-DSLASH_VERSION_STR=\"$(SLASH_VERSION)\"
 
 ifeq ($(SLASH_HAVE_VM_FLAGS_SET),y)
@@ -82,6 +96,25 @@ ifeq ($(SLASH_HAVE_MODULE_IMPORT_NS_TOKEN),y)
 ccflags-y += -DSLASH_HAVE_MODULE_IMPORT_NS_TOKEN
 endif
 
+# Optional io_uring uring_cmd async transfer path. Probed by kcompat; absent on
+# kernels without CONFIG_IO_URING or uring_cmd support (e.g. RHEL 9, Ubuntu
+# 22.04 GA), where the synchronous transfer ioctl remains the only path.
+ifeq ($(SLASH_HAVE_URING_CMD),y)
+ccflags-y += -DSLASH_HAVE_URING_CMD
+endif
+
+# Selects the io_uring SQE payload accessor: io_uring_sqe_cmd(cmd->sqe) when
+# present (newer kernels + distro backports), else cmd->cmd. Only meaningful
+# when SLASH_HAVE_URING_CMD is also set.
+ifeq ($(SLASH_HAVE_URING_SQE_CMD),y)
+ccflags-y += -DSLASH_HAVE_URING_SQE_CMD
+endif
+
+# Force-include the compat header into every TU (including the pinned libqdma
+# submodule sources we don't modify) so kernel-API shims such as from_timer()
+# reach third-party code too. Safe on all kernels: the shims are guarded.
+ccflags-y += -include $(src)/slash_compat.h
+
 
 LIBQDMA_OBJS := \
 	$(LIBQDMA_PATH)/qdma_mbox.o \
@@ -120,18 +153,80 @@ $(MODULE)-objs += $(LIBQDMA_OBJS) $(QDMA_ACCESS_OBJS)
 
 KCOMPAT := "$(SHELL)" "$(PWD)/kcompat/probe.sh"
 
-all:
+all: libqdma-patches
 	@flags="$$($(KCOMPAT) "$(KDIR)" | tr '\n' ' ')"; \
 	echo "slash: kcompat: $$flags"; \
 	$(MAKE) -C "$(KDIR)" M="$(PWD)" $$flags modules
 
+# Apply SLASH's local libqdma patches ($(LIBQDMA_PATCH_DIR)/*.patch) to the
+# libqdma source tree in use, in filename order, right before building.
+#
+# The pinned submodule is never edited by commits: patches live in-tree and are
+# stamped onto the working copy here. Application is idempotent: each patch is
+# first tested for being already applied (reverse dry-run) and skipped if so,
+# so repeated `make` runs, incremental builds, and DKMS rebuilds are all safe.
+# A patch that neither applies cleanly nor is already present aborts the build.
+#
+# $(PWD) is the driver dir for both `make` (in-tree) and DKMS (MAKE[0] runs
+# `make -C driver ...`); ./libqdma is the DKMS-packaged copy, otherwise fall
+# back to the in-tree submodule path. Uses patch(1) so it is independent of
+# whether the libqdma tree lives inside a git checkout.
+libqdma-patches:
+	@set -e; \
+	patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \
+	set -- "$$patch_dir"/*.patch; \
+	if [ ! -e "$$1" ]; then exit 0; fi; \
+	if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \
+	else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \
+	if [ ! -d "$$lq" ]; then \
+		echo "slash: ERROR libqdma sources not found at $$lq" >&2; \
+		echo "slash:       run 'git submodule update --init --recursive' first" >&2; \
+		exit 1; \
+	fi; \
+	command -v patch >/dev/null 2>&1 || { \
+		echo "slash: ERROR patch(1) not found; it is required to apply libqdma patches" >&2; \
+		exit 1; }; \
+	for p in "$$@"; do \
+		name="$$(basename "$$p")"; \
+		if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \
+			echo "slash: libqdma patch already applied, skipping: $$name"; \
+		elif patch -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \
+			echo "slash: applying libqdma patch: $$name"; \
+			patch -p1 -d "$$lq" -f -s -i "$$p"; \
+		else \
+			echo "slash: ERROR libqdma patch does not apply cleanly: $$name" >&2; \
+			echo "slash:       (libqdma tree at $$lq is neither pristine nor already patched)" >&2; \
+			exit 1; \
+		fi; \
+	done
+
+# Best-effort revert of the libqdma patches, newest first (reverse filename
+# order), restoring the libqdma tree in use to pristine. Useful when editing
+# the patches themselves; patches not currently applied are simply skipped.
+unpatch-libqdma:
+	@set -e; \
+	patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \
+	set -- "$$patch_dir"/*.patch; \
+	if [ ! -e "$$1" ]; then exit 0; fi; \
+	if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \
+	else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \
+	[ -d "$$lq" ] || exit 0; \
+	for p in $$(printf '%s\n' "$$@" | tac); do \
+		name="$$(basename "$$p")"; \
+		if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \
+			echo "slash: reverting libqdma patch: $$name"; \
+			patch -R -p1 -d "$$lq" -f -s -i "$$p"; \
+		fi; \
+	done
+
 clean:
-	$(MAKE) -C "$(KDIR)" M="$(PWD)" clean
+	-$(MAKE) -C "$(KDIR)" M="$(PWD)" clean
 	rm -rf "$(PWD)/kcompat/.scratch"
+	$(MAKE) unpatch-libqdma
 
 install: all
 	sudo install -d -m 755 /lib/modules/$(shell uname -r)/extra
 	sudo install -m 644 $(MODULE).ko /lib/modules/$(shell uname -r)/extra
 	sudo depmod -a
 
-.PHONY: all clean install
+.PHONY: all clean install libqdma-patches unpatch-libqdma
diff --git a/driver/README.md b/driver/README.md
index 65cd911a..7576dafb 100644
--- a/driver/README.md
+++ b/driver/README.md
@@ -1,10 +1,58 @@
 # SLASH kernel module
 
+## Module parameters
+
+Exposed under `/sys/module/slash/parameters/` (all writable at runtime; see
+`modinfo slash.ko`):
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `qdma_num_threads` | uint | 8 | Number of libqdma worker threads. |
+| `qdma_debugfs_path` | charp | disabled | debugfs mount path for libqdma. |
+
+### A/B testing NoC channel bandwidth
+
+The AXI-MM / NoC channel is chosen per queue pair when it is added (the
+`mm_channel` field of the qpair-add ioctl, `enum slash_qdma_mm_channel`):
+`auto` stripes queues across both channels by `qid & 1`, while `0` / `1` pin a
+queue to a single channel. Every queue creator carries this setting, so it can
+be driven per buffer to check whether both PCIe NMUs (NoC channels) actually
+contribute bandwidth. With `v80-smi validate`:
+
+```sh
+# All queues on NoC channel 0 (NMU S00)
+sudo v80-smi validate -d <BDF> --raw-transfer-test --no-reset --mm-channel 0
+
+# All queues on NoC channel 1 (NMU S01)
+sudo v80-smi validate -d <BDF> --raw-transfer-test --no-reset --mm-channel 1
+
+# Split across both channels (qid & 1)
+sudo v80-smi validate -d <BDF> --raw-transfer-test --no-reset --mm-channel auto
+
+# Explicit per-buffer split (even positions -> channel 0, odd -> channel 1)
+sudo v80-smi validate -d <BDF> --raw-transfer-test --no-reset --mm-channel 0,1
+```
+
+Debug builds with `SLASH_QDMA_OP_DEBUG=1` log each queue's selected
+`mm_channel` when it is added. If the split run is no faster than a single
+forced channel, traffic is not being spread across both NMUs. The per-queue
+setting affects every queue created through this driver (both the VRTD buffer
+path and `--raw-transfer-test`); the off-the-shelf Xilinx QDMA driver path
+(`--use-qdma-driver`) honors `--mm-channel` through its own channel attribute.
+
 ## Testing
 
 The test suite requires a physical V80 to be present and the module to be
 loaded into a running kernel.
 
+## Local libqdma patches
+
+SLASH carries small patches for the pinned `libqdma` submodule under
+`driver/patches/`. The driver `Makefile` applies them before building, and
+`make clean` attempts to revert them so the submodule working copy returns to
+its pristine pinned state. DKMS packages include the same patch directory and
+depend on `patch(1)`.
+
 ### Prerequisites
 
 - A kernel built with `CONFIG_GCOV_KERNEL=y` (only needed for coverage runs).
diff --git a/driver/kcompat/uring_cmd.c b/driver/kcompat/uring_cmd.c
new file mode 100644
index 00000000..21e9ef93
--- /dev/null
+++ b/driver/kcompat/uring_cmd.c
@@ -0,0 +1,78 @@
+/**
+ * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify it under the terms of the
+ * GNU General Public License as published by the Free Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with this program; if
+ * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/*
+ * Probe for the io_uring uring_cmd *infrastructure* in the exact shape
+ * slash_qdma.c uses, excluding the SQE payload accessor (that axis changed
+ * independently and is probed separately by uring_sqe_cmd.c):
+ *   - struct file_operations has a .uring_cmd member,
+ *   - struct io_uring_cmd exposes ->pdu, ->file, and ->cmd_op,
+ *   - io_uring_cmd_complete_in_task() takes a (cmd, issue_flags) callback,
+ *   - io_uring_cmd_done() takes (cmd, ret, res2, issue_flags).
+ *
+ * This requires CONFIG_IO_URING and a kernel >= 5.19 with the settled
+ * (>= 6.4) signatures; anywhere it fails to build, SLASH_HAVE_URING_CMD=n and
+ * the optional async transfer path is compiled out.  The payload pointer is
+ * read via the SLASH_HAVE_URING_SQE_CMD-selected accessor (see slash_qdma.c):
+ * io_uring_sqe_cmd(cmd->sqe) on newer kernels, cmd->cmd on older ones.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/io_uring.h>
+#if __has_include(<linux/io_uring/cmd.h>)
+#include <linux/io_uring/cmd.h>
+#endif
+
+static void conftest_tw(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+    io_uring_cmd_done(cmd, 0, 0, issue_flags); /* probes the (cmd, ret, res2, issue_flags) form */
+}
+
+static int conftest_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+    void *p = cmd->pdu;
+    struct file *f = cmd->file;
+    u32 op = cmd->cmd_op;
+    /* The reads above only prove the members exist; discard the values. */
+    (void)p;
+    (void)f;
+    (void)op;
+    /* Referencing IO_URING_F_NONBLOCK makes the build fail if it is absent. */
+    if (issue_flags & IO_URING_F_NONBLOCK)
+        return -EAGAIN;
+    /* conftest_tw must match the (cmd, issue_flags) task-work callback type. */
+    io_uring_cmd_complete_in_task(cmd, conftest_tw);
+    return -EIOCBQUEUED;
+}
+
+static const struct file_operations conftest_fops = {
+    .owner = THIS_MODULE,
+    .uring_cmd = conftest_uring_cmd, /* build fails if file_operations has no .uring_cmd */
+};
+
+static int __init conftest_init(void)
+{
+    (void)conftest_fops; /* reference the table so the probe code is not unused */
+    return 0;
+}
+
+static void __exit conftest_exit(void)
+{
+}
+
+MODULE_LICENSE("GPL");
+module_init(conftest_init);
+module_exit(conftest_exit);
diff --git a/driver/kcompat/uring_sqe_cmd.c b/driver/kcompat/uring_sqe_cmd.c
new file mode 100644
index 00000000..62020b30
--- /dev/null
+++ b/driver/kcompat/uring_sqe_cmd.c
@@ -0,0 +1,59 @@
+/**
+ * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * This program is free software; you can redistribute it and/or modify it under the terms of the
+ * GNU General Public License as published by the Free Software Foundation; version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with this program; if
+ * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/*
+ * Probe for the *newer* io_uring uring_cmd SQE payload accessor.
+ *
+ * Upstream removed `struct io_uring_cmd::cmd` (a const void * pointing at the
+ * inline SQE command payload) and replaced it with `->sqe` plus the
+ * io_uring_sqe_cmd() accessor.  This change was backported into distro kernels
+ * (e.g. Ubuntu 6.8), so a LINUX_VERSION_CODE check is unreliable — probe the
+ * accessor directly instead.
+ *
+ *   - SLASH_HAVE_URING_SQE_CMD=y  -> use io_uring_sqe_cmd(cmd->sqe)
+ *   - SLASH_HAVE_URING_SQE_CMD=n  -> fall back to cmd->cmd (older kernels)
+ *
+ * This probe only governs the payload accessor; the rest of the uring_cmd
+ * infrastructure is probed by uring_cmd.c (SLASH_HAVE_URING_CMD).
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/io_uring.h>
+#if __has_include(<linux/io_uring/cmd.h>)
+#include <linux/io_uring/cmd.h>
+#endif
+
+static int conftest_uring_sqe_cmd(struct io_uring_cmd *cmd)
+{
+    const void *payload = io_uring_sqe_cmd(cmd->sqe);
+    /* Builds only when both cmd->sqe and io_uring_sqe_cmd() are available. */
+    (void)payload;
+    return 0;
+}
+
+static int __init conftest_init(void)
+{
+    (void)conftest_uring_sqe_cmd; /* reference the probe so it is not unused */
+    return 0;
+}
+
+static void __exit conftest_exit(void)
+{
+}
+
+MODULE_LICENSE("GPL");
+module_init(conftest_init);
+module_exit(conftest_exit);
diff --git a/driver/libslash/README.md b/driver/libslash/README.md
index 9e04813a..5dc4c07d 100644
--- a/driver/libslash/README.md
+++ b/driver/libslash/README.md
@@ -108,10 +108,23 @@ uint32_t qid = req.qid;
 
 slash_qdma_qpair_start(qdma, qid);
 
-/* Get an fd for data transfer — read() = C2H, write() = H2C */
+/* Get an ioctl-only qpair fd for buffer transfers. */
 int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC);
-write(fd, buf, len);   /* H2C */
-read(fd, buf, len);    /* C2H */
+
+/* Create a kernel-owned DMA buffer (length must be a whole number of pages)
+ * and mmap it for CPU access via buf.addr.  Current SLASH hardware reports
+ * SLASH_QDMA_TRANSFER_HINT_V80 in buf.transfer_hint. */
+struct slash_qdma_buffer buf;
+slash_qdma_qpair_buffer_create(fd, len, &buf);
+/* ... fill buf.addr from the CPU for an H2C transfer ... */
+
+/* H2C: host -> device at dev_addr */
+slash_qdma_qpair_transfer(fd, buf.fd, /*buf_offset=*/0, dev_addr, len,
+                          SLASH_QDMA_XFER_H2C);
+/* C2H: device -> host */
+slash_qdma_qpair_transfer(fd, buf.fd, 0, dev_addr, len, SLASH_QDMA_XFER_C2H);
+
+slash_qdma_buffer_destroy(&buf);
 close(fd);
 
 slash_qdma_qpair_stop(qdma, qid);
diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h
index 8d726544..6f097288 100644
--- a/driver/libslash/include/slash/qdma.h
+++ b/driver/libslash/include/slash/qdma.h
@@ -31,10 +31,18 @@
  *   6. slash_qdma_qpair_del()   — destroy
  *   7. slash_qdma_close()       — close the device
  *
- * The fd from qpair_get_fd() supports read() for C2H (card-to-host)
- * and write() for H2C (host-to-card) DMA transfers.  Positional I/O
- * via lseek()/pread()/pwrite() is also supported.  splice(), mmap(),
- * and poll() are not available.
+ * The fd from qpair_get_fd() is ioctl-only for data movement: create kernel
+ * buffers with slash_qdma_buffer_create() (or slash_qdma_qpair_buffer_create()
+ * through a queue-pair fd), then move them with slash_qdma_qpair_transfer() /
+ * slash_qdma_qpair_transfer_batch().  read(), write(), and poll() are not
+ * available for SLASH transfers.
+ *
+ * Kernel buffers:
+ *   For high-throughput transfers, the kernel allocates a DMA buffer once
+ *   (pages + SGL + DMA mapping built at creation), returns a mappable fd, and
+ *   userspace mmaps it for CPU access.  Transfers reference the buffer by its
+ *   fd instead of re-pinning per call.  Closing the buffer fd (and unmapping)
+ *   releases it.
  *
  * Error conventions: int-returning functions return -1 with errno set.
  * Pointer-returning functions return NULL with errno set.
@@ -46,6 +54,7 @@
 #include "uapi/slash_interface.h"
 
 #include <stdint.h>
+#include <sys/types.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -144,13 +153,126 @@ int slash_qdma_qpair_del(struct slash_qdma *qdma, uint32_t qid);
  * @param flags Only O_CLOEXEC is accepted; the kernel returns -EINVAL for
  *              any other bits.
  *
- * The returned fd supports read() (C2H) and write() (H2C).  Positional
- * I/O via lseek()/pread()/pwrite() is also available.
+ * The returned fd supports the transfer and buffer-creation ioctls.  It does
+ * not support read/write data movement; use slash_qdma_qpair_transfer().
  *
  * @return Non-negative fd on success, -1 on failure.
  */
 int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags);
 
+/**
+ * @brief Obtain a transfer fd bound to one or more queue pairs.
+ *
+ * Like slash_qdma_qpair_get_fd(), but the returned fd is a collection of up to
+ * SLASH_QDMA_FD_MAX_QPAIRS queue pairs.  A transfer issued on the fd selects a
+ * bound queue pair by its index in @qids, so a single transfer can fan across
+ * both AXI-MM/NoC channels.  Each bound queue pair keeps whatever per-qpair
+ * settings (mm_channel, ring sizes, directions) it was given at add time.
+ *
+ * @param qdma        Open QDMA handle.
+ * @param qids        Array of @qpair_count queue pair IDs (must be started).
+ * @param qpair_count Number of entries in @qids (1..SLASH_QDMA_FD_MAX_QPAIRS).
+ * @param flags       Only O_CLOEXEC is accepted.
+ *
+ * @return Non-negative fd on success, -1 on failure (errno set).
+ */
+int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids,
+                                  uint32_t qpair_count, int flags);
+
+/**
+ * @brief A kernel-owned DMA buffer and its CPU mapping.
+ *
+ * Created by slash_qdma_buffer_create() / slash_qdma_qpair_buffer_create() and
+ * released by slash_qdma_buffer_destroy().  @addr is an mmap of the kernel
+ * buffer fd; write/read it from the CPU and move it with the transfer helpers,
+ * passing @fd as the sub-transfer's buf_fd.
+ */
+struct slash_qdma_buffer {
+    int fd;                                 /**< Buffer fd (close via destroy). */
+    void *addr;                             /**< CPU mapping of the buffer. */
+    uint64_t length;                        /**< Buffer length in bytes. */
+    uint32_t granule;                       /**< Bytes per DMA descriptor (page). */
+    enum slash_qdma_transfer_hint transfer_hint; /**< Advisory channel policy. */
+};
+
+/**
+ * @brief Create a kernel-owned DMA buffer and mmap it.
+ *
+ * Allocates @length bytes of kernel memory (DMA-mapped once), returns a buffer
+ * fd, and mmaps it into @buf_out->addr for CPU access.  The buffer is bound to
+ * @qdma's device; transfers must use a queue-pair fd of the same device.
+ *
+ * @param qdma    Open QDMA handle.
+ * @param length  Buffer length in bytes (non-zero multiple of the page size).
+ * @param buf_out [out] Receives the created buffer (fd, mapping, metadata).
+ *
+ * @return 0 on success, -1 on failure (errno set).
+ */
+int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length,
+                             struct slash_qdma_buffer *buf_out);
+
+/**
+ * @brief Create a kernel-owned DMA buffer through a queue-pair fd.
+ *
+ * Same semantics as slash_qdma_buffer_create(), but issues the create ioctl on
+ * @p qpair_fd.  This is the preferred form for clients that received only qpair
+ * fds via SCM_RIGHTS (for example libvrtd clients).
+ *
+ * @return 0 on success, -1 on failure (errno set).
+ */
+int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length,
+                                   struct slash_qdma_buffer *buf_out);
+
+/**
+ * @brief Release a buffer created with slash_qdma_buffer_create() or
+ *        slash_qdma_qpair_buffer_create().
+ *
+ * Unmaps @buf->addr and closes @buf->fd.  Safe to call on a zeroed/partial
+ * buffer (fields are reset).
+ *
+ * @return 0 on success, -1 on failure (errno set).
+ */
+int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf);
+
+/**
+ * @brief Perform a DMA transfer using a single buffer fd.
+ *
+ * Convenience wrapper around slash_qdma_qpair_transfer_batch() for a single
+ * sub-transfer on qpair_index 0.
+ *
+ * @param qpair_fd   Queue-pair I/O fd from slash_qdma_qpair_get_fd().
+ * @param buf_fd     Buffer fd (from slash_qdma_buffer_create()).
+ * @param buf_offset Byte offset within the buffer.
+ * @param dev_addr   Device-side (endpoint) address.
+ * @param length     Number of bytes to transfer.
+ * @param direction  One of enum slash_qdma_transfer_dir (H2C or C2H).
+ *
+ * @return Number of bytes transferred (>= 0) on success, -1 on failure
+ *         (errno set).
+ */
+ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd,
+                                  uint64_t buf_offset, uint64_t dev_addr,
+                                  uint64_t length, uint32_t direction);
+
+/**
+ * @brief Perform a batch of buffer DMA sub-transfers in one call.
+ *
+ * Issues a single transfer ioctl carrying @count sub-transfers.  The kernel
+ * runs sub-transfers that target distinct queue pairs concurrently, so one
+ * call can drive both NoC channels in parallel.  Each sub-transfer names a
+ * bound queue pair by index (see slash_qdma_qpair_get_fd_multi()) and a buffer
+ * by its buf_fd.
+ *
+ * @param qpair_fd Transfer fd from slash_qdma_qpair_get_fd[_multi]().
+ * @param xfers    Array of @count sub-transfer descriptors.
+ * @param count    Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS).
+ *
+ * @return Total bytes transferred (>= 0) on success, -1 on failure (errno set).
+ */
+ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd,
+                                        const struct slash_qdma_subxfer *xfers,
+                                        uint32_t count);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif /* __cplusplus */
diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h
index bbe6908d..1b1d85cb 100644
--- a/driver/libslash/include/slash/uapi/slash_interface.h
+++ b/driver/libslash/include/slash/uapi/slash_interface.h
@@ -153,6 +153,19 @@ struct slash_qdma_info {
     __u32 caps;          /**< [out] Capability bitmask. */
 };
 
+/**
+ * @brief AXI-MM / NoC channel selection for a queue pair.
+ *
+ * Selects which CPM5 AXI-MM channel a queue pair uses.  libqdma mirrors the
+ * channel into the SW-context host_id, which selects the programmed Host
+ * Profile and hence the NoC channel.
+ */
+enum slash_qdma_mm_channel {
+    SLASH_QDMA_MM_CHANNEL_AUTO = 0, /**< Stripe across channels by (qid & 1). */
+    SLASH_QDMA_MM_CHANNEL_0    = 1, /**< Pin to AXI-MM/NoC channel 0. */
+    SLASH_QDMA_MM_CHANNEL_1    = 2, /**< Pin to AXI-MM/NoC channel 1. */
+};
+
 /**
  * @brief Add (allocate) a new QDMA queue pair.
  *
@@ -176,6 +189,7 @@ struct slash_qdma_qpair_add {
     /* Userspace to kernel */
     __u32 mode;          /**< [in]  Queue operating mode. */
     __u32 dir_mask;      /**< [in]  Direction bitmask — which directions to enable. */
+    __u32 mm_channel;    /**< [in]  AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). */
 
     __u32 h2c_ring_sz;   /**< [in]  Host-to-card descriptor ring size. */
     __u32 c2h_ring_sz;   /**< [in]  Card-to-host descriptor ring size. */
@@ -208,24 +222,137 @@ struct slash_qdma_qpair_op {
     __u32 op;   /**< [in] One of the SLASH_QDMA_QUEUE_OP_* constants. */
 };
 
+/**
+ * @brief Maximum number of queue pairs a single transfer fd may own.
+ *
+ * A transfer fd is a collection of up to this many queue pairs (the intended
+ * use is one per AXI-MM/NoC channel).  A single transfer ioctl issued on the
+ * fd may fan a buffer transfer across all of them, running up to this many
+ * hardware DMAs in parallel.  Each bound qpair keeps whatever settings it was
+ * given at SLASH_QDMA_IOCTL_QPAIR_ADD time (mm_channel, ring sizes, etc.), so
+ * the two channels can be configured independently.
+ */
+#define SLASH_QDMA_FD_MAX_QPAIRS 2u
+
 /**
  * @brief Obtain a file descriptor for queue I/O.
  *
- * The returned fd can be used for read/write (or mmap) to transfer data
- * through the queue pair.
+ * The returned fd is a collection of one or two queue pairs.  It can be used
+ * for kernel-buffer ioctls to transfer data through those queue pairs.
  *
- * The fd is returned as the ioctl return value (same convention as
- * the BAR fd ioctl).  A single fd is returned per queue pair;
- * read() on the fd performs C2H transfers and write() performs H2C
- * transfers, using whichever directions were enabled in \@dir_mask
- * when the queue pair was added.
+ * The fd is returned as the ioctl return value (same convention as the BAR fd
+ * ioctl).  Data movement is issued via SLASH_QDMA_QPAIR_IOCTL_TRANSFER, whose
+ * sub-transfers select a bound queue pair by index and a direction (which must
+ * have been enabled in \@dir_mask when that queue pair was added).
+ *
+ * Set \@qpair_count to the number of queue pairs to bind and list their IDs in
+ * \@qpair_ids; the array index becomes the qpair_index used by
+ * struct slash_qdma_subxfer.  For backward compatibility \@qpair_count == 0
+ * binds the single queue pair named by \@qid.
  */
 struct slash_qdma_qpair_fd_request {
     __u32 size;  /**< Struct size for ABI versioning. */
 
     /* Userspace to kernel */
-    __u32 qid;   /**< [in] Queue pair ID. */
+    __u32 qid;   /**< [in] Legacy single queue pair ID; used only when
+                  *        @qpair_count == 0. */
     __u32 flags; /**< [in] File descriptor flags.  Only O_CLOEXEC is honoured. */
+    __u32 qpair_count; /**< [in] Number of valid entries in @qpair_ids
+                        *        (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use @qid. */
+    __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Queue pair IDs bound to
+                        *  this fd; the array index is the qpair_index. */
+};
+
+/**
+ * @brief Transfer direction for a kernel-buffer DMA transfer.
+ */
+enum slash_qdma_transfer_dir {
+    SLASH_QDMA_XFER_H2C = 1, /**< Host-to-Card (write to device). */
+    SLASH_QDMA_XFER_C2H = 2, /**< Card-to-Host (read from device). */
+};
+
+/**
+ * @brief Advisory transfer topology for a kernel-owned QDMA buffer.
+ *
+ * The kernel returns this hint when a buffer is created so userspace can
+ * choose a suitable transfer strategy without hard-coding hardware-specific
+ * scheduling policy.  The hint is advisory: transfers are still valid with any
+ * queue pair whose direction and ownership checks pass.
+ */
+enum slash_qdma_transfer_hint {
+    SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, /**< Prefer a single qpair (all traffic on one channel). */
+    SLASH_QDMA_TRANSFER_HINT_V80          = 2, /**< Apply the V80 placement-aware channel policy. */
+};
+
+/**
+ * @brief Create a kernel-owned DMA buffer and return a mappable fd.
+ *
+ * The kernel allocates @length bytes of host memory as a set of 4 KiB base
+ * pages (not physically contiguous), builds the transfer scatter-gather list,
+ * and DMA-maps every page once.  All of this expensive setup happens here, at
+ * creation time, so the steady-state transfer path only slices the prebuilt
+ * SGL, syncs the relevant pages, and submits.
+ *
+ * The new buffer is returned as an fd (via the ioctl return value, same
+ * convention as the BAR/queue-pair fd ioctls).  Userspace maps it with
+ * mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0) to obtain
+ * a CPU pointer, and passes @buf_fd in struct slash_qdma_subxfer to move data.
+ * The pages stay alive as long as either the fd or any mapping exists, and the
+ * DMA mapping is torn down once both are gone and no transfer is in flight.
+ *
+ * The buffer is bound to the QDMA device of the fd it is created on (control
+ * fd or queue-pair fd); transfers must use a queue-pair fd of the same device.
+ *
+ * \@length must be a non-zero multiple of the host page size.  The kernel
+ * returns the page @granule (bytes per descriptor) and a @transfer_hint;
+ * current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80.
+ */
+struct slash_qdma_buf_create {
+    __u32 size;          /**< Struct size for ABI versioning. */
+
+    /* Userspace to kernel */
+    __u32 flags;         /**< [in]  File descriptor flags.  Only O_CLOEXEC is honoured. */
+    __u64 length;        /**< [in]  Buffer length in bytes (page multiple). */
+
+    /* Kernel to userspace */
+    __u32 granule;       /**< [out] Bytes per SGL descriptor (host page size). */
+    __u32 transfer_hint; /**< [out] enum slash_qdma_transfer_hint. */
+};
+
+/**
+ * @brief One per-queue-pair sub-transfer within a transfer batch.
+ *
+ * Moves \@length bytes between the kernel buffer named by \@buf_fd at
+ * \@buf_offset and the device endpoint address \@dev_addr, on the queue pair
+ * selected by \@qpair_index (an index into the fd's bound qpairs).
+ * \@buf_offset and \@length must be aligned to the buffer's 4 KiB page granule,
+ * and \@buf_offset + \@length must not exceed the buffer length.  \@direction
+ * must be one of enum slash_qdma_transfer_dir and must be enabled on the
+ * selected queue pair.
+ */
+struct slash_qdma_subxfer {
+    __u32 qpair_index; /**< [in] Index into the fd's bound qpairs. */
+    __u32 direction;   /**< [in] enum slash_qdma_transfer_dir (H2C or C2H). */
+    __s32 buf_fd;      /**< [in] Kernel buffer fd from SLASH_QDMA_IOCTL_BUF_CREATE. */
+    __u32 pad0;        /**< Padding for natural alignment. */
+    __u64 buf_offset;  /**< [in] Byte offset within the buffer. */
+    __u64 dev_addr;    /**< [in] Device-side (endpoint) address. */
+    __u64 length;      /**< [in] Number of bytes to transfer. */
+};
+
+/**
+ * @brief Perform one or more buffer DMA sub-transfers in one call.
+ *
+ * Issued on a queue-pair I/O fd (from SLASH_QDMA_IOCTL_QPAIR_GET_FD).  The
+ * kernel submits all \@count sub-transfers and waits for completion, running
+ * those that target distinct queue pairs concurrently (so a single syscall can
+ * drive both NoC channels in parallel).  The total number of bytes transferred
+ * across all sub-transfers is returned as the ioctl return value.
+ */
+struct slash_qdma_transfer {
+    __u32 size;        /**< Struct size for ABI versioning. */
+    __u32 count;       /**< [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS). */
+    struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Sub-transfers. */
 };
 
 /** Query QDMA subsystem capabilities. */
@@ -240,4 +367,30 @@ struct slash_qdma_qpair_fd_request {
 /** Obtain an I/O file descriptor for a queue pair. */
 #define SLASH_QDMA_IOCTL_QPAIR_GET_FD  _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request)
 
+/**
+ * Create a kernel-owned DMA buffer (allocate pages + build SGL + DMA-map once);
+ * returns a mappable buffer fd as the ioctl return value.  May be issued on the
+ * control device or a queue-pair I/O fd.
+ */
+#define SLASH_QDMA_IOCTL_BUF_CREATE    _IOWR('v', 0x54, struct slash_qdma_buf_create)
+
+/* 'v' 0x55 is reserved (previously SLASH_QDMA_IOCTL_BUF_UNREGISTER, removed:
+ * kernel buffers are released by closing their fd). */
+
+/**
+ * Perform a buffer DMA transfer.  Issued on a queue-pair I/O fd (not the
+ * control device); returns the number of bytes transferred.
+ */
+#define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer)
+
+/**
+ * io_uring command opcode (SQE cmd_op) for an asynchronous buffer transfer
+ * batch, issued on a queue-pair I/O fd via IORING_OP_URING_CMD.  The SQE inline
+ * command carries a single __u64: the userspace pointer to a struct
+ * slash_qdma_transfer.  The completion CQE res holds the total bytes
+ * transferred (>= 0) or a negative errno.  This path is optional and only
+ * available on kernels with io_uring uring_cmd support.
+ */
+#define SLASH_QDMA_URING_CMD_TRANSFER 0x56u
+
 #endif
diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c
index 68c38b6d..efe8c3d3 100644
--- a/driver/libslash/src/qdma.c
+++ b/driver/libslash/src/qdma.c
@@ -40,6 +40,179 @@
 #include <stdio.h>
 
 #include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+
+/* Bounce-copy chunk used by the @mock transfer fallback. */
+#define QDMA_XFER_BOUNCE_CHUNK (1u << 20)
+
+/*
+ * Map buf->fd (a kernel buffer fd or the @mock memfd) read/write into this
+ * process and store the pointer in buf->addr.  The mapping is MAP_SHARED over
+ * buf->length bytes from offset 0.  Returns 0, or -1 with errno from mmap().
+ */
+static int qdma_buffer_mmap(struct slash_qdma_buffer *buf)
+{
+    const int prot = PROT_READ | PROT_WRITE;
+    void *mapping;
+
+    mapping = mmap(NULL, (size_t)buf->length, prot, MAP_SHARED, buf->fd, 0);
+    if (mapping != MAP_FAILED) {
+        buf->addr = mapping;
+    }
+    return (mapping == MAP_FAILED) ? -1 : 0;
+}
+
+/*
+ * Back a buffer with an anonymous memfd of @length bytes, mapped shared.  This
+ * is the @mock path, and the fallback when BUF_CREATE is not implemented.
+ */
+static int qdma_buffer_create_memfd(uint64_t length,
+                                    struct slash_qdma_buffer *buf_out)
+{
+    int err;
+    int memfd = memfd_create("slash_qdma_buf", MFD_CLOEXEC);
+
+    if (memfd < 0) {
+        return -1;
+    }
+
+    if (ftruncate(memfd, (off_t)length) != 0) {
+        goto fail_close;
+    }
+
+    /* Publish the descriptor before mapping: qdma_buffer_mmap() reads it. */
+    buf_out->addr = NULL;
+    buf_out->transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80;
+    buf_out->granule = 4096;
+    buf_out->length = length;
+    buf_out->fd = memfd;
+
+    if (qdma_buffer_mmap(buf_out) == 0) {
+        return 0;
+    }
+    buf_out->fd = -1;
+
+fail_close:
+    /* close() may clobber errno; report the original failure instead. */
+    err = errno;
+    (void)close(memfd);
+    errno = err;
+    return -1;
+}
+
+/*
+ * Issue SLASH_QDMA_IOCTL_BUF_CREATE on @ioctl_fd (a control fd or queue-pair
+ * fd) and mmap the buffer fd it returns.  An ENOTTY failure means the ioctl is
+ * not implemented (the @mock path), so a memfd-backed buffer is used instead.
+ */
+static int qdma_buffer_create_on_fd(int ioctl_fd, uint64_t length,
+                                    struct slash_qdma_buffer *buf_out)
+{
+    struct slash_qdma_buf_create args;
+    int buf_fd;
+    int err;
+
+    memset(&args, 0, sizeof(args));
+    args.size = sizeof(args);
+    args.flags = O_CLOEXEC;
+    args.length = length;
+
+    buf_fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &args);
+    if (buf_fd < 0 && errno == ENOTTY) {
+        return qdma_buffer_create_memfd(length, buf_out);
+    }
+    if (buf_fd < 0) {
+        return -1;
+    }
+
+    buf_out->fd = buf_fd;
+    buf_out->addr = NULL;
+    buf_out->length = length;
+    buf_out->granule = (args.granule != 0) ? args.granule : 4096;
+    buf_out->transfer_hint = (enum slash_qdma_transfer_hint)args.transfer_hint;
+
+    if (qdma_buffer_mmap(buf_out) == 0) {
+        return 0;
+    }
+    /* Mapping failed: release the fd, keeping mmap()'s errno for the caller. */
+    err = errno;
+    (void)close(buf_fd);
+    buf_out->fd = -1;
+    errno = err;
+    return -1;
+}
+
+/*
+ * @mock transfer fallback: bounce one sub-transfer between the host buffer fd
+ * and the queue-pair memfd that stands in for device memory.  Only used when
+ * the transfer ioctl returns ENOTTY.  Returns the bytes moved, or -1 with errno
+ * set (EINVAL bad argument, EOVERFLOW range wraps, EIO short read or write).
+ */
+static ssize_t qdma_fallback_subxfer(int qpair_fd,
+                                     const struct slash_qdma_subxfer *x)
+{
+    const int h2c = (x->direction == SLASH_QDMA_XFER_H2C);
+    const int src_fd = h2c ? x->buf_fd : qpair_fd;
+    const int dst_fd = h2c ? qpair_fd : x->buf_fd;
+    const uint64_t src_base = h2c ? x->buf_offset : x->dev_addr;
+    const uint64_t dst_base = h2c ? x->dev_addr : x->buf_offset;
+    uint8_t *tmp;
+    uint64_t done = 0;
+    int err = 0;
+
+    if (x->buf_fd < 0 || (!h2c && x->direction != SLASH_QDMA_XFER_C2H)) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (x->length > UINT64_MAX - x->dev_addr ||
+        x->length > UINT64_MAX - x->buf_offset) {
+        errno = EOVERFLOW;
+        return -1;
+    }
+
+    /*
+     * C2H: grow (never shrink, which would discard H2C data) the device memfd
+     * so never-written regions read back as zeros instead of a short read.
+     */
+    if (!h2c) {
+        struct stat st;
+        off_t want = (off_t)(x->dev_addr + x->length);
+
+        if (fstat(qpair_fd, &st) == 0 && st.st_size < want) {
+            (void)ftruncate(qpair_fd, want);
+        }
+    }
+
+    tmp = malloc(QDMA_XFER_BOUNCE_CHUNK);
+    if (tmp == NULL) {
+        return -1;
+    }
+
+    while (done < x->length) {
+        uint64_t remaining = x->length - done;
+        size_t chunk = remaining < QDMA_XFER_BOUNCE_CHUNK
+                           ? (size_t)remaining : QDMA_XFER_BOUNCE_CHUNK;
+        ssize_t r = pread(src_fd, tmp, chunk, (off_t)(src_base + done));
+        ssize_t w;
+        if (r <= 0) {
+            err = (r < 0) ? errno : EIO; /* EOF before @length bytes */
+            break;
+        }
+        w = pwrite(dst_fd, tmp, (size_t)r, (off_t)(dst_base + done));
+        if (w != r) {
+            err = (w < 0) ? errno : EIO; /* short write */
+            break;
+        }
+        done += (uint64_t)r;
+    }
+    free(tmp);
+    if (err != 0) {
+        errno = err; /* restored after free(), which may clobber errno */
+        return -1;
+    }
+    return (ssize_t)done;
+}
 
 struct slash_qdma *slash_qdma_open(const char *path)
 {
@@ -146,6 +319,7 @@ int slash_qdma_qpair_add(struct slash_qdma *qdma,
     tmp.size        = sizeof(tmp);
     tmp.mode        = req->mode;
     tmp.dir_mask    = req->dir_mask;
+    tmp.mm_channel  = req->mm_channel;
     tmp.h2c_ring_sz = req->h2c_ring_sz;
     tmp.c2h_ring_sz = req->c2h_ring_sz;
     tmp.cmpt_ring_sz = req->cmpt_ring_sz;
@@ -248,3 +422,156 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags)
     return fd;
 }
 
+int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids,
+                                  uint32_t qpair_count, int flags)
+{
+    struct slash_qdma_qpair_fd_request req;
+    uint32_t i;
+    int fd;
+
+    if (qdma == NULL || qids == NULL ||
+        qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (qdma->priv) {
+        return slash_qdma_mock_qpair_get_fd_multi(qdma, qids, qpair_count,
+                                                  flags);
+    }
+
+    memset(&req, 0, sizeof(req));
+    req.size        = sizeof(req);
+    req.flags       = flags;
+    req.qid         = qids[0];
+    req.qpair_count = qpair_count;
+    for (i = 0; i < qpair_count; ++i) {
+        req.qpair_ids[i] = qids[i];
+    }
+
+    fd = ioctl(qdma->fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &req);
+    if (fd < 0) {
+        return -1;
+    }
+
+    return fd;
+}
+
+int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length,
+                             struct slash_qdma_buffer *buf_out)
+{
+    if (qdma == NULL || buf_out == NULL || length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    /* @mock has no character device: back the buffer with a memfd directly. */
+    if (qdma->priv) {
+        return qdma_buffer_create_memfd(length, buf_out);
+    }
+
+    return qdma_buffer_create_on_fd(qdma->fd, length, buf_out);
+}
+
+int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length,
+                                   struct slash_qdma_buffer *buf_out)
+{
+    if (qpair_fd < 0 || buf_out == NULL || length == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    return qdma_buffer_create_on_fd(qpair_fd, length, buf_out);
+}
+
+int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf)
+{
+    int ret = 0;
+
+    if (buf == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    if (buf->addr != NULL && buf->addr != MAP_FAILED && buf->length != 0) {
+        if (munmap(buf->addr, (size_t)buf->length) != 0) {
+            ret = -1;
+        }
+    }
+    buf->addr = NULL;
+
+    if (buf->fd >= 0) {
+        if (close(buf->fd) != 0) {
+            ret = -1;
+        }
+        buf->fd = -1;
+    }
+
+    return ret;
+}
+
+ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd,
+                                        const struct slash_qdma_subxfer *xfers,
+                                        uint32_t count)
+{
+    struct slash_qdma_transfer req;
+    uint32_t i;
+    int ret;
+
+    if (qpair_fd < 0 || xfers == NULL ||
+        count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    memset(&req, 0, sizeof(req));
+    req.size  = sizeof(req);
+    req.count = count;
+    for (i = 0; i < count; ++i) {
+        if (xfers[i].direction != SLASH_QDMA_XFER_H2C &&
+            xfers[i].direction != SLASH_QDMA_XFER_C2H) {
+            errno = EINVAL;
+            return -1;
+        }
+        req.xfers[i] = xfers[i];
+    }
+
+    ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req);
+    if (ret < 0) {
+        if (errno == ENOTTY) {
+            /* @mock path: bounce each sub-transfer through the memfds. */
+            uint64_t total = 0;
+
+            for (i = 0; i < count; ++i) {
+                ssize_t n = qdma_fallback_subxfer(qpair_fd, &xfers[i]);
+
+                if (n < 0) {
+                    return -1;
+                }
+                total += (uint64_t)n;
+            }
+            return (ssize_t)total;
+        }
+        return -1;
+    }
+
+    return (ssize_t)ret;
+}
+
+ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd,
+                                  uint64_t buf_offset, uint64_t dev_addr,
+                                  uint64_t length, uint32_t direction)
+{
+    struct slash_qdma_subxfer xfer;
+
+    memset(&xfer, 0, sizeof(xfer));
+    xfer.qpair_index = 0;
+    xfer.direction   = direction;
+    xfer.buf_fd      = buf_fd;
+    xfer.buf_offset  = buf_offset;
+    xfer.dev_addr    = dev_addr;
+    xfer.length      = length;
+
+    return slash_qdma_qpair_transfer_batch(qpair_fd, &xfer, 1);
+}
+
diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c
index 92a24c6c..d72762bb 100644
--- a/driver/libslash/src/qdma_mock.c
+++ b/driver/libslash/src/qdma_mock.c
@@ -257,3 +257,44 @@ int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flag
 
     return new_fd;
 }
+
+int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma,
+                                       const uint32_t *qids,
+                                       uint32_t qpair_count, int flags)
+{
+    struct slash_qdma_mock *ctx;
+    uint32_t i;
+    int new_fd;
+    (void) flags; /* unused; NOTE(review): dup() below clears FD_CLOEXEC */
+
+    if (qdma == NULL || qids == NULL ||
+        qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    ctx = mock_ctx(qdma);
+
+    for (i = 0; i < qpair_count; ++i) {
+        if (qids[i] >= QDMA_MOCK_MAX_QUEUES ||
+            !ctx->queues[qids[i]].in_use || !ctx->queues[qids[i]].started) {
+            errno = EINVAL;
+            return -1;
+        }
+    }
+
+    /*
+     * The mock backs the device address space with one memfd per queue pair.
+     * Both NoC channels address the same device memory, so a multi-qpair fd is
+     * emulated by a single backing store: dup the first queue pair's memfd and
+     * route every sub-transfer through it.  This keeps round-trips consistent
+     * regardless of which channel a sub-transfer used.
+     */
+    new_fd = dup(ctx->queues[qids[0]].fd);
+    if (new_fd < 0) {
+        return -1;
+    }
+
+    return new_fd;
+}
+
diff --git a/driver/libslash/src/qdma_mock.h b/driver/libslash/src/qdma_mock.h
index 36f3d596..cd7e54e6 100644
--- a/driver/libslash/src/qdma_mock.h
+++ b/driver/libslash/src/qdma_mock.h
@@ -25,6 +25,8 @@
 
 #include <stdint.h>
 
+#include <sys/types.h>
+
 struct slash_qdma *slash_qdma_mock_open(void);
 int slash_qdma_mock_close(struct slash_qdma *qdma);
 int slash_qdma_mock_info_read(struct slash_qdma *qdma, struct slash_qdma_info *info);
@@ -33,5 +35,8 @@ int slash_qdma_mock_qpair_start(struct slash_qdma *qdma, uint32_t qid);
 int slash_qdma_mock_qpair_stop(struct slash_qdma *qdma, uint32_t qid);
 int slash_qdma_mock_qpair_del(struct slash_qdma *qdma, uint32_t qid);
 int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags);
+int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma,
+                                       const uint32_t *qids,
+                                       uint32_t qpair_count, int flags);
 
 #endif /* LIBSLASH_QDMA_MOCK_H */
diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp
index 5b024111..9519302e 100644
--- a/driver/libslash/tests/qdma_test.cpp
+++ b/driver/libslash/tests/qdma_test.cpp
@@ -21,6 +21,7 @@
 #include <gtest/gtest.h>
 
 #include <cerrno>
+#include <cstdlib>
 #include <cstring>
 #include <unistd.h>
 
@@ -100,6 +101,36 @@ TEST(QdmaNullTest, QpaiGetFd) {
     EXPECT_EQ(errno, EINVAL);
 }
 
+TEST(QdmaNullTest, BufferCreate) {
+    struct slash_qdma_buffer buf{};
+    errno = 0;
+    EXPECT_EQ(slash_qdma_buffer_create(nullptr, 4096, &buf), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    struct slash_qdma fake{};
+    fake.fd = -1;
+    errno = 0;
+    EXPECT_EQ(slash_qdma_buffer_create(&fake, 4096, nullptr), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_buffer_create(-1, 4096, &buf), -1);
+    EXPECT_EQ(errno, EINVAL);
+}
+
+TEST(QdmaNullTest, BufferDestroy) {
+    errno = 0;
+    EXPECT_EQ(slash_qdma_buffer_destroy(nullptr), -1);
+    EXPECT_EQ(errno, EINVAL);
+}
+
+TEST(QdmaNullTest, Transfer) {
+    errno = 0;
+    /* Invalid qpair fd is rejected. */
+    EXPECT_EQ(slash_qdma_qpair_transfer(-1, 4, 0, 0, 4096, SLASH_QDMA_XFER_H2C), -1);
+    EXPECT_EQ(errno, EINVAL);
+}
+
 // ─── Real device tests (requires /dev/slash_qdma_ctl0) ───────────────────────
 
 class ParametrizedQdmaTest : public ::testing::TestWithParam<bool> {
@@ -158,26 +189,207 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) {
     int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0);
     ASSERT_GE(queue_fd, 0);
 
-    // Write a known pattern to DDR (H2C).
-    uint8_t src[XFER_SIZE];
+    // Kernel-owned buffers created through the queue-pair fd.
+    struct slash_qdma_buffer src_buf{};
+    struct slash_qdma_buffer dst_buf{};
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &src_buf), 0);
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &dst_buf), 0);
+    auto *src = static_cast<uint8_t *>(src_buf.addr);
+    auto *dst = static_cast<uint8_t *>(dst_buf.addr);
     for (size_t i = 0; i < XFER_SIZE; ++i) {
         src[i] = static_cast<uint8_t>(i & 0xFF);
     }
-    ssize_t written = pwrite(queue_fd, src, XFER_SIZE, static_cast<off_t>(DDR_BASE_ADDRESS));
+    std::memset(dst, 0, XFER_SIZE);
+
+    ssize_t written = slash_qdma_qpair_transfer(
+        queue_fd, src_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_H2C);
     EXPECT_EQ(written, static_cast<ssize_t>(XFER_SIZE));
 
     // Read back from DDR (C2H) and verify.
-    uint8_t dst[XFER_SIZE]{};
-    ssize_t read_bytes = pread(queue_fd, dst, XFER_SIZE, static_cast<off_t>(DDR_BASE_ADDRESS));
+    ssize_t read_bytes = slash_qdma_qpair_transfer(
+        queue_fd, dst_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_C2H);
     EXPECT_EQ(read_bytes, static_cast<ssize_t>(XFER_SIZE));
     EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0);
 
+    EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0);
+    EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0);
+
     EXPECT_EQ(close(queue_fd), 0);
 
     EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0);
     EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0);
 }
 
+TEST_P(ParametrizedQdmaTest, BufferCreateTransfer) {
+    static constexpr size_t XFER_SIZE = 4096;
+
+    struct slash_qdma_qpair_add req{};
+    req.mode     = 0;   /* QDMA_Q_MODE_MM */
+    req.dir_mask = 0x3; /* H2C | C2H */
+
+    ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0);
+    uint32_t qid = req.qid;
+    ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0);
+
+    int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0);
+    ASSERT_GE(queue_fd, 0);
+
+    // Kernel-owned buffers created through the control handle.
+    struct slash_qdma_buffer src_buf{};
+    struct slash_qdma_buffer dst_buf{};
+    ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &src_buf), 0);
+    ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &dst_buf), 0);
+    EXPECT_EQ(src_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80);
+    EXPECT_EQ(dst_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80);
+    auto *src = static_cast<uint8_t *>(src_buf.addr);
+    auto *dst = static_cast<uint8_t *>(dst_buf.addr);
+    for (size_t i = 0; i < XFER_SIZE; ++i) {
+        src[i] = static_cast<uint8_t>(i & 0xFF);
+    }
+    std::memset(dst, 0, XFER_SIZE);
+
+    // H2C: push the source buffer to the device.
+    ssize_t written = slash_qdma_qpair_transfer(queue_fd, src_buf.fd, 0,
+                                                DDR_BASE_ADDRESS, XFER_SIZE,
+                                                SLASH_QDMA_XFER_H2C);
+    EXPECT_EQ(written, static_cast<ssize_t>(XFER_SIZE));
+
+    // C2H: pull it back into the destination buffer and verify.
+    ssize_t read_bytes = slash_qdma_qpair_transfer(queue_fd, dst_buf.fd, 0,
+                                                   DDR_BASE_ADDRESS, XFER_SIZE,
+                                                   SLASH_QDMA_XFER_C2H);
+    EXPECT_EQ(read_bytes, static_cast<ssize_t>(XFER_SIZE));
+    EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0);
+
+    EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0);
+    EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0);
+
+    EXPECT_EQ(close(queue_fd), 0);
+    EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0);
+    EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0);
+}
+
+TEST_P(ParametrizedQdmaTest, MultiQpairBatchTransfer) {
+    // Two 4 KiB halves transferred concurrently across two queue pairs bound to
+    // a single fd, exercising the get-fd-multi + batch transfer API.
+    static constexpr size_t HALF = 4096;
+    static constexpr size_t XFER_SIZE = 2 * HALF;
+
+    uint32_t qids[2] = {0, 0};
+    for (int ch = 0; ch < 2; ++ch) {
+        struct slash_qdma_qpair_add req{};
+        req.mode       = 0;   /* QDMA_Q_MODE_MM */
+        req.dir_mask   = 0x3; /* H2C | C2H */
+        req.mm_channel = static_cast<uint32_t>(
+            ch == 0 ? SLASH_QDMA_MM_CHANNEL_0 : SLASH_QDMA_MM_CHANNEL_1);
+        ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0);
+        qids[ch] = req.qid;
+        ASSERT_EQ(slash_qdma_qpair_start(qdma_, qids[ch]), 0);
+    }
+
+    int fd = slash_qdma_qpair_get_fd_multi(qdma_, qids, 2, 0);
+    ASSERT_GE(fd, 0);
+
+    struct slash_qdma_buffer src_buf{};
+    struct slash_qdma_buffer dst_buf{};
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &src_buf), 0);
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &dst_buf), 0);
+    auto *src = static_cast<uint8_t *>(src_buf.addr);
+    auto *dst = static_cast<uint8_t *>(dst_buf.addr);
+    for (size_t i = 0; i < XFER_SIZE; ++i) {
+        src[i] = static_cast<uint8_t>((i * 7 + 1) & 0xFF);
+    }
+    std::memset(dst, 0, XFER_SIZE);
+
+    // H2C: lower half on qpair 0, upper half on qpair 1, in one ioctl.
+    struct slash_qdma_subxfer h2c[2]{};
+    h2c[0] = {0, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF};
+    h2c[1] = {1, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF};
+    EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, h2c, 2),
+              static_cast<ssize_t>(XFER_SIZE));
+
+    // C2H: read both halves back across both channels in one ioctl.
+    struct slash_qdma_subxfer c2h[2]{};
+    c2h[0] = {0, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF};
+    c2h[1] = {1, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF};
+    EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, c2h, 2),
+              static_cast<ssize_t>(XFER_SIZE));
+
+    EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0);
+
+    EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0);
+    EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0);
+
+    EXPECT_EQ(close(fd), 0);
+    for (int ch = 0; ch < 2; ++ch) {
+        EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qids[ch]), 0);
+        EXPECT_EQ(slash_qdma_qpair_del(qdma_, qids[ch]), 0);
+    }
+}
+
+TEST(QdmaNullTest, QpairGetFdMultiInvalid) {
+    uint32_t qids[2] = {0, 1};
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_get_fd_multi(nullptr, qids, 2, 0), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    struct slash_qdma fake{};
+    fake.fd = -1;
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 0, 0), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 3, 0), -1);
+    EXPECT_EQ(errno, EINVAL);
+}
+
+TEST(QdmaNullTest, TransferBatchInvalid) {
+    struct slash_qdma_subxfer x{};
+    x.direction = SLASH_QDMA_XFER_H2C;
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_transfer_batch(-1, &x, 1), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, nullptr, 1), -1);
+    EXPECT_EQ(errno, EINVAL);
+
+    errno = 0;
+    EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, &x, 0), -1);
+    EXPECT_EQ(errno, EINVAL);
+}
+
+TEST_P(ParametrizedQdmaTest, QueueFdReadWriteRejectedOnHardware) {
+    if (mock) {
+        GTEST_SKIP() << "mock qpair fds are memfds and still support read/write";
+    }
+
+    struct slash_qdma_qpair_add req{};
+    req.mode     = 0;
+    req.dir_mask = 0x3;
+
+    ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0);
+    uint32_t qid = req.qid;
+    ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0);
+
+    int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0);
+    ASSERT_GE(queue_fd, 0);
+
+    uint8_t byte = 0;
+    errno = 0;
+    EXPECT_EQ(write(queue_fd, &byte, sizeof(byte)), -1);
+    EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF);
+
+    errno = 0;
+    EXPECT_EQ(read(queue_fd, &byte, sizeof(byte)), -1);
+    EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF);
+
+    EXPECT_EQ(close(queue_fd), 0);
+    EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0);
+    EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0);
+}
+
 TEST_P(ParametrizedQdmaTest, CloseSucceeds) {
     EXPECT_EQ(slash_qdma_close(qdma_), 0);
     qdma_ = nullptr;
diff --git a/driver/patches/0003-libqdma-pr-fmt-guard.patch b/driver/patches/0003-libqdma-pr-fmt-guard.patch
new file mode 100644
index 00000000..d253070a
--- /dev/null
+++ b/driver/patches/0003-libqdma-pr-fmt-guard.patch
@@ -0,0 +1,43 @@
+SLASH local modification to the pinned QDMA submodule (libqdma).
+
+libqdma: make qdma_platform_env.h self-sufficient for pr_fmt
+
+SLASH force-includes driver/slash_compat.h into every TU (driver/Makefile) so
+kernel-API shims such as from_timer() reach the pinned libqdma sources. That
+header pulls in <linux/printk.h> early (via <linux/mm.h>) and then #undefs
+pr_fmt, so each libqdma .c that sets its own
+    #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+at the top compiles without a "pr_fmt redefined" warning.
+
+The qdma_access HAL files don't set their own pr_fmt -- they log via
+qdma_log_* -> pr_*, which expand pr_fmt at the call site -- so after that
+#undef they would reference an undefined pr_fmt (<linux/printk.h> is already
+include-guarded by the time they include it, so its guarded default can no
+longer re-arm). Re-arm the kernel default here, guarded by #ifndef so the
+libqdma sources that do set a custom pr_fmt before including this header keep
+it. Behaviour for the HAL files is unchanged (the kernel default is "fmt").
+
+Generated against qdma_drv @ e0168be (pinned submodule commit).
+Applied automatically by driver/Makefile (libqdma-patches target, patch -p1).
+diff --git a/qdma_platform_env.h b/qdma_platform_env.h
+index fa26c9a..c9e1082 100755
+--- a/qdma_platform_env.h
++++ b/qdma_platform_env.h
+@@ -25,6 +25,17 @@
+ #define QDMA_SNPRINTF_S(arg1, arg2, arg3, ...) \
+ 		snprintf(arg1, arg3, ##__VA_ARGS__)
+ 
++/*
++ * SLASH: re-arm the kernel-default pr_fmt for TUs that log via qdma_log_* ->
++ * pr_* but never set their own pr_fmt. SLASH force-includes a compat header
++ * that #undefs pr_fmt after <linux/printk.h> is already include-guarded, so the
++ * default can no longer re-arm on its own. Guarded with #ifndef so the libqdma
++ * sources that set a custom pr_fmt before including this header keep it.
++ */
++#ifndef pr_fmt
++#define pr_fmt(fmt) fmt
++#endif
++
+ #define qdma_log_info(x_, ...) pr_info(x_, ##__VA_ARGS__)
+ #define qdma_log_warning(x_, ...) pr_warn(x_, ##__VA_ARGS__)
+ #define qdma_log_error(x_, ...) pr_err(x_, ##__VA_ARGS__)
diff --git a/driver/slash_compat.h b/driver/slash_compat.h
index 5b3a50c2..e6719487 100644
--- a/driver/slash_compat.h
+++ b/driver/slash_compat.h
@@ -17,6 +17,7 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/timer.h>
 
 /*
  * Compat shims selected by the kcompat probes in driver/kcompat/.
@@ -53,4 +54,43 @@ static inline void slash_vm_flags_set(struct vm_area_struct *vma, vm_flags_t fla
 #define SLASH_MODULE_IMPORT_NS(ns) MODULE_IMPORT_NS(#ns)
 #endif
 
+/*
+ * from_timer() was renamed to timer_container_of() upstream in v6.16
+ * (commit 41cb08555c41). RHEL/CentOS 9.8 (kernel 5.14.0-687) backported the
+ * rename into its 5.14 baseline, while 9.7 (5.14.0-611) and earlier still
+ * ship from_timer(), so a LINUX_VERSION_CODE / RHEL_RELEASE_CODE check is
+ * unreliable across the 9.x rebuilds. Both names are typeof()-based macros,
+ * so detect them directly and prefer the kernel's own API:
+ *   1. kernel still defines from_timer()          -> use it as-is (no redefine)
+ *   2. kernel renamed it to timer_container_of()  -> delegate to that
+ *   3. neither exists                             -> hand-roll the historical body
+ * <linux/timer.h> is included above so the guard sees whichever name the
+ * kernel defines, regardless of -include ordering.
+ */
+#ifndef from_timer
+#  ifdef timer_container_of
+#    define from_timer(var, callback_timer, timer_fieldname) \
+	timer_container_of(var, callback_timer, timer_fieldname)
+#  else
+#    define from_timer(var, callback_timer, timer_fieldname) \
+	container_of(callback_timer, typeof(*var), timer_fieldname)
+#  endif
+#endif
+
+/*
+ * The kernel headers included above (<linux/mm.h> -> <linux/printk.h>) install
+ * the default `#define pr_fmt(fmt) fmt` under an #ifndef guard. Because this
+ * header is force-included (-include, see driver/Makefile) ahead of every TU,
+ * that default lands before each pinned libqdma source's own top-of-file
+ *     #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+ * turning it into a redefinition ("pr_fmt redefined" warning). Undefine it here
+ * so each TU starts from the clean "nobody has defined pr_fmt yet" state the
+ * idiom relies on; the file's own #define is then the first and only one.
+ *
+ * TUs that never set their own pr_fmt and log via the kernel default (the
+ * qdma_access HAL, whose qdma_log_* macros expand to pr_*) re-arm that default
+ * from qdma_platform_env.h; see driver/patches/0003-libqdma-pr-fmt-guard.patch.
+ */
+#undef pr_fmt
+
 #endif /* SLASH_COMPAT_H */
diff --git a/driver/slash_config.h b/driver/slash_config.h
index c06c5962..acebe253 100644
--- a/driver/slash_config.h
+++ b/driver/slash_config.h
@@ -23,9 +23,9 @@
  *
  * The SLASH design exposes two PCI physical functions per card:
  *
- *   - **PF1** (device 0x50B5) — QDMA function.  Hosts the Xilinx QDMA
- *     IP used for high-throughput DMA transfers between host memory and
- *     the FPGA fabric.
+ *   - **PF1** (device 0x50B5, or 0x50BD on AVED/V80P designs) — QDMA
+ *     function.  Hosts the Xilinx QDMA IP used for high-throughput DMA
+ *     transfers between host memory and the FPGA fabric.
  *
  *   - **PF2** (device 0x50B6) — Control function.  Exposes PCI BARs
  *     that the host can mmap for register-level MMIO access to the
@@ -52,6 +52,8 @@
 #define SLASH_QDMA_PCI_VENDOR_ID 0x10EE
 /** PCI device ID for the V80 SLASH QDMA function. */
 #define SLASH_QDMA_PCI_DEVICE_ID 0x50B5
+/** PCI device ID for the V80P/AVED QDMA function. */
+#define SLASH_AVED_QDMA_PCI_DEVICE_ID 0x50BD
 /** Physical function number for the QDMA DMA engine. */
 #define SLASH_QDMA_PF 1
 
diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c
index 6c64272b..3b63148e 100644
--- a/driver/slash_qdma.c
+++ b/driver/slash_qdma.c
@@ -23,8 +23,9 @@
  * to provide queue-pair-based DMA transfers between host memory and the
  * FPGA fabric.
  *
- * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5), while the
- * control device (slash_ctldev) binds to PF2 (device ID 0x50B6).
+ * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5, or 0x50BD on
+ * AVED/V80P designs), while the control device (slash_ctldev) binds to
+ * PF2 (device ID 0x50B6).
  *
  * Queue pair lifecycle:
  *   add -> start -> I/O (via anon_inode fd) -> stop -> del
@@ -50,10 +51,16 @@
 
 #include <asm/cacheflush.h>
 #include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/dma-mapping.h>
 #include <linux/err.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
 #include <linux/kref.h>
+#include <linux/ktime.h>
+#include <linux/limits.h>
 #include <linux/miscdevice.h>
 #include <linux/minmax.h>
 #include <linux/mutex.h>
@@ -65,6 +72,31 @@
 #include <linux/xarray.h>
 #include <linux/anon_inodes.h>
 
+#if defined(SLASH_HAVE_URING_CMD)
+#include <linux/io_uring.h>
+#if __has_include(<linux/io_uring/cmd.h>)
+#include <linux/io_uring/cmd.h>
+#endif
+
+/**
+ * slash_qdma_uring_cmd_payload() - Pointer to a uring_cmd's inline SQE payload.
+ * @cmd: The io_uring command.
+ *
+ * Abstracts the kernel API change that removed struct io_uring_cmd::cmd in
+ * favour of ->sqe + io_uring_sqe_cmd().  The accessor is selected at build
+ * time by the kcompat probe (SLASH_HAVE_URING_SQE_CMD); both forms return the
+ * same inline command payload pointer.
+ */
+static inline const void *slash_qdma_uring_cmd_payload(struct io_uring_cmd *cmd)
+{
+#if defined(SLASH_HAVE_URING_SQE_CMD)
+    return io_uring_sqe_cmd(cmd->sqe);
+#else
+    return cmd->cmd;
+#endif
+}
+#endif
+
 /*
  * Direction bitmask constants.
  *
@@ -100,6 +132,45 @@
     offsetofend(struct slash_qdma_qpair_op, op)
 #define SLASH_QDMA_QPAIR_GET_FD_MIN_SIZE \
     offsetofend(struct slash_qdma_qpair_fd_request, flags)
+#define SLASH_QDMA_BUF_CREATE_MIN_SIZE \
+    offsetofend(struct slash_qdma_buf_create, length)
+#define SLASH_QDMA_TRANSFER_MIN_SIZE \
+    offsetofend(struct slash_qdma_transfer, count)
+
+/*
+ * CPM5 Host Profile indirect-context programming.
+ *
+ * The Host Profile context tells the CPM5 QDMA how to route AXI4-MM
+ * traffic onto the Versal NoC.  It is programmed via the same indirect
+ * context command interface libqdma uses for queue contexts, but with
+ * the host-profile selector (0xA).  Register offsets and the command
+ * word layout mirror eqdma_cpm5_reg.h:
+ *
+ *   IND_CTXT_DATA  base 0x804 (8 x u32 context words)
+ *   IND_CTXT_MASK  base 0x824 (8 x u32 write masks)
+ *   IND_CTXT_CMD   0x844      (busy[0], sel[4:1], op[6:5], qid[18:7])
+ *
+ * We program two profiles so the per-queue SW-context host_id selects
+ * the NoC channel: Host ID 0 -> NoC Channel 0, Host ID 1 -> NoC Channel 1.
+ */
+#define SLASH_QDMA_HP_DATA_ADDR  0x804u
+#define SLASH_QDMA_HP_MASK_ADDR  0x824u
+#define SLASH_QDMA_HP_CMD_ADDR   0x844u
+#define SLASH_QDMA_HP_CMD_BUSY   BIT(0)
+#define SLASH_QDMA_HP_NUM_WORDS  8
+#define SLASH_QDMA_HP_SEL        0xAu   /* QDMA_CTXT_SELC_HOST_PROFILE */
+#define SLASH_QDMA_HP_OP_WR      0x1u   /* indirect context WR opcode */
+#define SLASH_QDMA_HP_OP_RD      0x2u   /* indirect context RD opcode */
+#define SLASH_QDMA_HP_SMID_BASE  0x100u /* bit 8 set; base AXI-MM master ID */
+#define SLASH_QDMA_HP_POLL_US    1000   /* busy-wait budget in microseconds */
+
+/*
+ * The qpair fd data path operates on spans of base pages (PAGE_SIZE each,
+ * 4 KiB on x86-64).  Each scatter-gather entry is exactly one base page; a
+ * whole transfer is submitted to libqdma as a single multi-descriptor request
+ * and libqdma refills the descriptor ring as needed, so the transfer size is
+ * not bounded by the ring depth.
+ */
 
 /**
  * SLASH_QDMA_QTYPE_COUNT - Number of queue types tracked per queue pair.
@@ -152,6 +223,18 @@
     } while (0)
 #endif
 
+/*
+ * Per-transfer timing instrumentation (compile-time flag).
+ *
+ * Retained for parity with the userspace SLASH_QDMA_TIMING knob.  With the
+ * kernel-owned buffer model all the expensive setup (page allocation, SGL
+ * build, DMA mapping) happens once at SLASH_QDMA_IOCTL_BUF_CREATE time, so the
+ * steady-state transfer cost is dominated by the libqdma submit/completion.
+ */
+#ifndef SLASH_QDMA_TIMING
+#define SLASH_QDMA_TIMING 0
+#endif
+
 /* Forward declaration; full definition follows. */
 struct slash_qdma_dev;
 
@@ -503,44 +586,57 @@ slash_qdma_qpair_remove(struct slash_qdma_dev *qdma_dev, u32 qid)
 
 /**
  * struct slash_qdma_qpair_file_ctx - Private data for an anon_inode qpair fd.
- * @qdma_dev: Back-pointer to the owning QDMA device (ref held).
- * @entry:    The queue pair entry this fd operates on (ref held).
- * @qid:      Queue pair ID, cached for debug logging.
+ * @qdma_dev:  Back-pointer to the owning QDMA device (ref held).
+ * @entries:   The queue pair entries this fd operates on (one ref each).
+ *             A transfer sub-transfer's qpair_index selects an entry here.
+ * @qids:      Queue pair IDs, cached for debug logging.
+ * @n_qpairs:  Number of valid entries in @entries / @qids
+ *             (1..SLASH_QDMA_FD_MAX_QPAIRS).
  *
  * Allocated in slash_qdma_ioctl_qpair_get_fd_w() and freed in
- * slash_qdma_qpair_release().  Both @qdma_dev and @entry have their
- * reference counts incremented when the ctx is created, and decremented
- * when the fd is closed.
+ * slash_qdma_qpair_release().  @qdma_dev and each entry have their reference
+ * counts incremented when the ctx is created, and decremented when the fd is
+ * closed.
  */
 struct slash_qdma_qpair_file_ctx {
     struct slash_qdma_dev *qdma_dev;
-    struct slash_qdma_qpair_entry *entry;
-    u32 qid;
+    struct slash_qdma_qpair_entry *entries[SLASH_QDMA_FD_MAX_QPAIRS];
+    u32 qids[SLASH_QDMA_FD_MAX_QPAIRS];
+    u32 n_qpairs;
 };
 
 /**
- * struct slash_qdma_io_cb - I/O control block for a single DMA transfer.
- * @buf:      User-space buffer address (source for H2C, destination for C2H).
- * @len:      Transfer length in bytes.
- * @pages_nr: Number of user pages pinned by get_user_pages_fast().
- * @sgl:      Scatter-gather list of qdma_sw_sg entries, one per pinned page.
- *            Allocated as a single contiguous block together with @pages.
- * @pages:    Array of struct page pointers for the pinned user pages.
- *            Points into the same allocation as @sgl (immediately after it).
- * @req:      The libqdma request structure submitted to qdma_request_submit().
- *
- * This is a stack-local structure (allocated in slash_qdma_qpair_read_write)
- * that bundles all per-transfer state.  The SGL and page array are heap-
- * allocated in slash_qdma_map_user_buf_to_sgl() and freed in
- * slash_qdma_iocb_release().
- */
-struct slash_qdma_io_cb {
-    void __user *buf;
-    size_t len;
+ * struct slash_qdma_buf - A kernel-owned, mmap-able DMA buffer.
+ * @ref:        Reference count.  The buffer fd holds one ref, each live VMA
+ *              (mmap) holds one ref, and each in-flight transfer holds a
+ *              temporary ref so a close cannot tear the buffer down under
+ *              active DMA or while userspace still has it mapped.
+ * @qdma_dev:   Device whose DMA mappings back this buffer (holds a device
+ *              reference for the lifetime of the buffer object).
+ * @length:     Buffer length in bytes (a multiple of @granule).
+ * @granule:    Bytes per SGL entry / page (PAGE_SIZE).  Uniform across all
+ *              entries, so transfer slices are computed by simple division.
+ * @pages_nr:   Number of base pages backing the buffer (length / granule).
+ * @pages:      Array of @pages_nr kernel pages (alloc_page()), not physically
+ *              contiguous.  Used both for the CPU mmap and the DMA SGL.
+ * @sgl:        Prebuilt scatter-gather list, one entry per page, each with its
+ *              dma_addr filled in once at creation so transfers submit with
+ *              req->dma_mapped = 1.
+ * @dma_mapped: True once @sgl entries have been DMA-mapped.
+ *
+ * All expensive setup (page allocation, SGL construction, DMA mapping) happens
+ * once at creation; the transfer fast path only slices @sgl, syncs the touched
+ * pages, and submits.
+ */
+struct slash_qdma_buf {
+    struct kref ref;
+    struct slash_qdma_dev *qdma_dev;
+    u64 length;
+    u32 granule;
     unsigned int pages_nr;
-    struct qdma_sw_sg *sgl;
     struct page **pages;
-    struct qdma_request req;
+    struct qdma_sw_sg *sgl;
+    bool dma_mapped;
 };
 
 /* ─────────────────────────────────────────────────────────────────────
@@ -586,32 +682,37 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev,
 static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc,
                                            struct slash_qdma_dev *qdma_dev,
                                            void __user *uarg);
+static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc,
+                                          struct slash_qdma_dev *qdma_dev,
+                                          void __user *uarg);
+static void slash_qdma_buf_release(struct kref *ref);
+static void slash_qdma_buf_put(struct slash_qdma_buf *buf);
+static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg);
 
-static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf,
-                                     size_t count, loff_t *ppos);
-static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf,
-                                      size_t count, loff_t *ppos);
 static int slash_qdma_qpair_release(struct inode *inode, struct file *file);
 static long slash_qdma_qpair_ioctl(struct file *file,
                                    unsigned int cmd, unsigned long arg);
+#if defined(SLASH_HAVE_URING_CMD)
+static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd,
+                                      unsigned int issue_flags);
+#endif
 
 /**
  * slash_qdma_qpair_fops - File operations for per-qpair anon_inode fds.
  *
- * read()  performs a C2H (card-to-host) DMA transfer.
- * write() performs an H2C (host-to-card) DMA transfer.
- * llseek  uses default_llseek so that pread/pwrite can set the
- *         device-side address via the file position.
- * ioctl   is a stub that returns -ENOTTY (no per-fd ioctls defined yet).
- * release drops the refs on the qpair entry and device.
+ * ioctl    performs buffer DMA transfers and buffer creation for clients that
+ *          only hold a queue-pair fd.
+ * uring_cmd (optional) is the asynchronous equivalent of the transfer ioctl,
+ *          available only on kernels with io_uring uring_cmd support.
+ * release  drops the refs on the bound qpair entries and device.
  */
 static const struct file_operations slash_qdma_qpair_fops = {
     .owner          = THIS_MODULE,
-    .read           = slash_qdma_qpair_read,
-    .write          = slash_qdma_qpair_write,
     .unlocked_ioctl = slash_qdma_qpair_ioctl,
+#if defined(SLASH_HAVE_URING_CMD)
+    .uring_cmd      = slash_qdma_qpair_uring_cmd,
+#endif
     .release        = slash_qdma_qpair_release,
-    .llseek         = default_llseek,
 };
 
 
@@ -625,10 +726,12 @@ static void slash_qdma_ioctl_info(struct miscdevice *misc, struct slash_qdma_dev
 /**
  * slash_qdma_ids - PCI device ID table for the QDMA PF.
  *
- * Matches only PF1 (device ID 0x50B5) on AMD/Xilinx V80 cards.
+ * Matches PF1 QDMA functions on AMD/Xilinx V80 cards, including the
+ * AVED/V80P device ID.
  */
 static const struct pci_device_id slash_qdma_ids[] = {
     {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_QDMA_PCI_DEVICE_ID)},
+    {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_AVED_QDMA_PCI_DEVICE_ID)},
     {0,}
 };
 MODULE_DEVICE_TABLE(pci, slash_qdma_ids);
@@ -850,6 +953,287 @@ void slash_qdma_exit(void)
     SLASH_QDMA_OP_LOG("libqdma_exit done\n");
 }
 
+/* ─────────────────────────────────────────────────────────────────────
+ * CPM5 Host Profile context programming
+ * ───────────────────────────────────────────────────────────────────── */
+
+/**
+ * slash_qdma_hp_set_field() - Store a value into bits [hi:lo] of the context.
+ * @words: 256-bit host profile context as SLASH_QDMA_HP_NUM_WORDS u32s;
+ *         word i carries context bits [32*i+31 : 32*i].
+ * @hi:    Index of the field's top bit (inclusive).
+ * @lo:    Index of the field's bottom bit (inclusive).
+ * @val:   New field value; anything wider than the field is masked off.
+ *
+ * A field crossing a 32-bit boundary (e.g. C2H steering [97:94], in words
+ * 2 and 3) is split across both words.
+ */
+static void slash_qdma_hp_set_field(u32 *words, unsigned int hi,
+                                    unsigned int lo, u32 val)
+{
+    unsigned int nbits = hi - lo + 1;
+    unsigned int idx = lo >> 5;
+    unsigned int shift = lo & 31;
+    u32 field_mask = (nbits >= 32) ? ~0u : ((1u << nbits) - 1u);
+    u64 clear_bits = (u64)field_mask << shift;
+    u64 set_bits = (u64)(val & field_mask) << shift;
+
+    words[idx] &= ~(u32)(clear_bits & 0xFFFFFFFFu);
+    words[idx] |= (u32)(set_bits & 0xFFFFFFFFu);
+    if ((shift + nbits) <= 32 || (idx + 1) >= SLASH_QDMA_HP_NUM_WORDS)
+        return;
+    words[idx + 1] &= ~(u32)(clear_bits >> 32);
+    words[idx + 1] |= (u32)(set_bits >> 32);
+}
+
+/**
+ * slash_qdma_hp_wait_ready() - Wait for the indirect-context engine to idle.
+ * @device:  QDMA device whose libqdma handle is used for register reads.
+ * @val_out: Optional; when non-NULL it receives the last command-register
+ *           value read (on both the idle and the timeout outcome).
+ *
+ * Re-reads SLASH_QDMA_HP_CMD_ADDR with a 1 us delay between reads, bounded
+ * by SLASH_QDMA_HP_POLL_US, until SLASH_QDMA_HP_CMD_BUSY clears.  Nothing is
+ * logged here; callers choose whether a timeout is fatal or only a warning.
+ *
+ * Return: 0 when idle, -ETIMEDOUT if still busy, or a register-read errno.
+ */
+static int slash_qdma_hp_wait_ready(struct slash_qdma_dev *device, u32 *val_out)
+{
+    unsigned int attempts = 0;
+    u32 cmd_reg = 0;
+    int rc;
+
+    for (;;) {
+        rc = qdma_device_read_config_register(device->qdma_handle,
+                SLASH_QDMA_HP_CMD_ADDR, &cmd_reg);
+        if (rc)
+            return rc;
+        if (!(cmd_reg & SLASH_QDMA_HP_CMD_BUSY))
+            break;
+        udelay(1);
+        if (++attempts >= SLASH_QDMA_HP_POLL_US) {
+            rc = -ETIMEDOUT;
+            break;
+        }
+    }
+
+    if (val_out)
+        *val_out = cmd_reg;
+    return rc;
+}
+
+/**
+ * slash_qdma_hp_get_field() - Extract bits [hi:lo] from the context.
+ * @words: 256-bit host profile context as SLASH_QDMA_HP_NUM_WORDS u32s;
+ *         word i carries context bits [32*i+31 : 32*i].
+ * @hi:    Index of the field's top bit (inclusive).
+ * @lo:    Index of the field's bottom bit (inclusive).
+ *
+ * Return: the field value, right-aligned; fields that cross a 32-bit word
+ *         boundary (e.g. [97:94]) are reassembled from both words.
+ */
+static u32 slash_qdma_hp_get_field(const u32 *words, unsigned int hi,
+                                   unsigned int lo)
+{
+    unsigned int nbits = hi - lo + 1;
+    unsigned int idx = lo >> 5;
+    u64 window = words[idx];
+    u32 field_mask = ~0u;
+
+    /* Pull in the following word so a straddling field is contiguous. */
+    if ((idx + 1) < SLASH_QDMA_HP_NUM_WORDS)
+        window |= (u64)words[idx + 1] << 32;
+    if (nbits < 32)
+        field_mask = (1u << nbits) - 1u;
+
+    window >>= lo & 31;
+    return (u32)window & field_mask;
+}
+
+/**
+ * slash_qdma_read_host_profile() - Fetch one CPM5 Host Profile context.
+ * @device:  QDMA device whose libqdma handle is used for register access.
+ * @host_id: Index of the Host Profile to fetch.
+ * @out:     Caller-provided array of SLASH_QDMA_HP_NUM_WORDS u32s; filled
+ *           with the 256-bit context on success.
+ *
+ * Writes an indirect-context RD command for the host-profile selector to
+ * SLASH_QDMA_HP_CMD_ADDR, waits for the BUSY bit to clear, then reads the
+ * context words from SLASH_QDMA_HP_DATA_ADDR onwards.  If a data-word read
+ * fails part-way, the words already read remain in @out.  Used to verify
+ * a preceding write.
+ *
+ * Return: 0 on success, -ETIMEDOUT if the command never completes, or the
+ *         negative errno from a failed register access.
+ */
+static int slash_qdma_read_host_profile(struct slash_qdma_dev *device,
+                                        u32 host_id, u32 *out)
+{
+    u32 rd_cmd;
+    int rc;
+    int w;
+
+    /* Command word: profile index, RD opcode, host-profile selector. */
+    rd_cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_RD << 5) |
+             (SLASH_QDMA_HP_SEL << 1);
+
+    rc = qdma_device_write_config_register(device->qdma_handle,
+            SLASH_QDMA_HP_CMD_ADDR, rd_cmd);
+    if (!rc)
+        rc = slash_qdma_hp_wait_ready(device, NULL);
+
+    /* Copy the data words out only once the command has completed. */
+    for (w = 0; !rc && w < SLASH_QDMA_HP_NUM_WORDS; w++) {
+        rc = qdma_device_read_config_register(device->qdma_handle,
+                SLASH_QDMA_HP_DATA_ADDR + (w * sizeof(u32)), &out[w]);
+    }
+
+    return rc;
+}
+
+/**
+ * slash_qdma_write_host_profile() - Program and verify one CPM5 Host Profile.
+ * @device:  QDMA device (provides the libqdma handle for register access).
+ * @host_id: Host Profile index to program; also used as the value of both
+ *           AXI4-MM steering fields (i.e. the target NoC channel).
+ *
+ * Builds the 256-bit context (SMID = SLASH_QDMA_HP_SMID_BASE + @host_id plus
+ * the H2C/C2H steering fields), writes the data words and then all-ones
+ * mask words through the libqdma config-register accessors, issues the WR
+ * command, and polls the command BUSY bit until the write completes.
+ *
+ * After a successful write the profile is read back and the three
+ * programmed fields are compared.  A readback error or mismatch is only
+ * logged: the function still returns 0, because the write itself
+ * completed and the profile is considered applied.
+ *
+ * Every context bit other than the SMID and steering fields is written as
+ * 0, which includes the AXI prot/cache attributes.
+ *
+ * Return: 0 on success, -ETIMEDOUT if the BUSY bit never clears, or the
+ *         negative errno from a failed register access (both are logged).
+ */
+static int slash_qdma_write_host_profile(struct slash_qdma_dev *device,
+                                         u32 host_id)
+{
+    u32 data[SLASH_QDMA_HP_NUM_WORDS] = {0};
+    u32 smid = SLASH_QDMA_HP_SMID_BASE + host_id;
+    u32 cmd;
+    u32 val = 0;
+    int err;
+    int i;
+
+    /* SMID [201:192]; H2C steering [181:178]; C2H steering [97:94]. */
+    slash_qdma_hp_set_field(data, 201, 192, smid);
+    slash_qdma_hp_set_field(data, 181, 178, host_id);
+    slash_qdma_hp_set_field(data, 97, 94, host_id);
+
+    /* Stage the 256-bit context in the indirect-context data registers. */
+    for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) {
+        err = qdma_device_write_config_register(device->qdma_handle,
+                SLASH_QDMA_HP_DATA_ADDR + (i * sizeof(u32)), data[i]);
+        if (err)
+            goto err_reg;
+    }
+
+    /* All-ones masks: every context bit is to be written. */
+    for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) {
+        err = qdma_device_write_config_register(device->qdma_handle,
+                SLASH_QDMA_HP_MASK_ADDR + (i * sizeof(u32)), 0xFFFFFFFFu);
+        if (err)
+            goto err_reg;
+    }
+
+    /* Command: qid=host_id, op=WR, sel=HOST_PROFILE (0x34 for id 0, 0xB4 for id 1). */
+    cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_WR << 5) | (SLASH_QDMA_HP_SEL << 1);
+    err = qdma_device_write_config_register(device->qdma_handle,
+            SLASH_QDMA_HP_CMD_ADDR, cmd);
+    if (err)
+        goto err_reg;
+
+    /* Wait for BUSY to clear; a timeout is logged here, other errors at err_reg. */
+    err = slash_qdma_hp_wait_ready(device, &val);
+    if (err == -ETIMEDOUT) {
+        dev_err(&device->pdev->dev,
+                "qdma: host profile %u programming timed out (cmd=0x%x)\n",
+                host_id, val);
+        return -ETIMEDOUT;
+    }
+    if (err)
+        goto err_reg;
+
+    /*
+     * Read the profile back and compare the three programmed fields.  A
+     * readback error or mismatch is logged but deliberately not returned:
+     * the write itself completed, so the profile is considered applied.
+     */
+    {
+        u32 rb[SLASH_QDMA_HP_NUM_WORDS] = {0};
+        int rerr = slash_qdma_read_host_profile(device, host_id, rb);
+
+        if (rerr) {
+            dev_warn(&device->pdev->dev,
+                     "slash: qdma: host profile %u applied (cmd=0x%02x) but readback failed: %d\n",
+                     host_id, cmd, rerr);
+        } else {
+            u32 smid_rb = slash_qdma_hp_get_field(rb, 201, 192);
+            u32 h2c_rb = slash_qdma_hp_get_field(rb, 181, 178);
+            u32 c2h_rb = slash_qdma_hp_get_field(rb, 97, 94);
+
+            if (smid_rb == smid && h2c_rb == host_id && c2h_rb == host_id) {
+                dev_info(&device->pdev->dev,
+                         "slash: qdma: host profile %u applied and readback verified: H2C/C2H AXI-MM steering=%u (NoC channel %u), smid=0x%03x (cmd=0x%02x)\n",
+                         host_id, host_id, host_id, smid, cmd);
+            } else {
+                dev_err(&device->pdev->dev,
+                        "slash: qdma: host profile %u readback MISMATCH: smid exp=0x%03x got=0x%03x, h2c exp=%u got=%u, c2h exp=%u got=%u\n",
+                        host_id, smid, smid_rb, host_id, h2c_rb,
+                        host_id, c2h_rb);
+            }
+        }
+    }
+    return 0;
+
+err_reg:
+    dev_err(&device->pdev->dev,
+            "qdma: host profile %u register access failed: %d\n",
+            host_id, err);
+    return err;
+}
+
+/**
+ * slash_qdma_program_host_profiles() - Set up CPM5 Host Profiles 0 and 1.
+ * @device: QDMA device.
+ *
+ * Writes Host Profile 0 (steering to NoC Channel 0) followed by Host
+ * Profile 1 (NoC Channel 1), aborting on the first error.  Call it after
+ * qdma_device_open(), which clears all contexts, and before any queue
+ * context is programmed: CPM5 requires the host profile to exist before
+ * AXI4-MM queues are set up.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+static int slash_qdma_program_host_profiles(struct slash_qdma_dev *device)
+{
+    struct device *dev = &device->pdev->dev;
+    u32 profile = 0;
+    int rc;
+
+    dev_info(dev,
+             "slash: qdma: programming CPM5 host profiles (host_id 0 -> NoC channel 0, host_id 1 -> NoC channel 1)\n");
+
+    while (profile <= 1) {
+        rc = slash_qdma_write_host_profile(device, profile++);
+        if (rc)
+            return rc;
+    }
+
+    dev_info(dev, "slash: qdma: CPM5 host profiles programmed\n");
+
+    return 0;
+}
+
 /* ─────────────────────────────────────────────────────────────────────
  * PCI probe / remove
  * ───────────────────────────────────────────────────────────────────── */
@@ -912,6 +1296,20 @@ static int slash_qdma_probe(struct pci_dev *pdev, const struct pci_device_id *id
                           device->qdma_handle);
     device->have_qdma_handle = true;
 
+    /*
+     * Program the CPM5 Host Profiles before exposing the miscdevice, so
+     * they exist before userspace can add any queue.  Host ID 0 steers
+     * AXI4-MM traffic to NoC Channel 0 and Host ID 1 to NoC Channel 1;
+     * the per-queue SW-context host_id (mirrored from mm_channel = qid & 1)
+     * selects between them.
+     */
+    err = slash_qdma_program_host_profiles(device);
+    if (err) {
+        dev_err(&pdev->dev,
+                "slash: qdma: could not program host profiles: %d\n", err);
+        goto err_free;
+    }
+
+
     /* Register the management miscdevice so userspace can issue ioctls. */
     err = misc_register(&device->misc);
     if (err) {
@@ -1236,13 +1634,15 @@ static void slash_qdma_conf_options(struct qdma_dev_conf *conf, struct pci_dev *
 static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned long arg)
 {
     struct slash_qdma_dev *qdma_dev = file->private_data;
-    struct miscdevice *misc = &qdma_dev->misc;
+    struct miscdevice *misc;
     void __user *uarg = (void __user *)arg;
     long ret = 0;
 
     if (!qdma_dev)
         return -ENODEV;
 
+    misc = &qdma_dev->misc;
+
     SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, "ioctl op=0x%x\n", op);
 
     /* Early rejection if the device is shutting down. */
@@ -1270,6 +1670,10 @@ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned lo
         ret = slash_qdma_ioctl_qpair_get_fd_w(misc, qdma_dev, uarg);
         break;
 
+    case SLASH_QDMA_IOCTL_BUF_CREATE:
+        ret = slash_qdma_ioctl_buf_create_w(misc, qdma_dev, uarg);
+        break;
+
     default:
         ret = -ENOTTY;
         break;
@@ -1295,18 +1699,18 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file)
     struct miscdevice *misc = file->private_data;
     struct slash_qdma_dev *qdma_dev =
         container_of(misc, struct slash_qdma_dev, misc);
-    int ret = 0;
 
     mutex_lock(&qdma_dev->lock);
     if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) {
-        ret = -ENODEV;
-    } else {
-        kref_get(&qdma_dev->ref);
-        file->private_data = qdma_dev;
+        mutex_unlock(&qdma_dev->lock);
+        return -ENODEV;
     }
+    kref_get(&qdma_dev->ref);
     mutex_unlock(&qdma_dev->lock);
 
-    return ret;
+    file->private_data = qdma_dev;
+
+    return 0;
 }
 
 /**
@@ -1323,8 +1727,12 @@ static int slash_qdma_fop_release(struct inode *inode, struct file *file)
 {
     struct slash_qdma_dev *qdma_dev = file->private_data;
 
-    if (qdma_dev)
-        kref_put(&qdma_dev->ref, slash_qdma_dev_release);
+    if (!qdma_dev)
+        return 0;
+
+    kref_put(&qdma_dev->ref, slash_qdma_dev_release);
+
+    file->private_data = NULL;
 
     return 0;
 }
@@ -1481,6 +1889,12 @@ static int slash_qdma_ioctl_qpair_add_w(struct miscdevice *misc,
     if (req.h2c_ring_sz >= 16 || req.c2h_ring_sz >= 16 || req.cmpt_ring_sz >= 16)
         return -EINVAL;
 
+    /* Validate the per-queue AXI-MM channel selection. */
+    if (req.mm_channel != SLASH_QDMA_MM_CHANNEL_AUTO &&
+        req.mm_channel != SLASH_QDMA_MM_CHANNEL_0 &&
+        req.mm_channel != SLASH_QDMA_MM_CHANNEL_1)
+        return -EINVAL;
+
     mutex_lock(&qdma_dev->lock);
     if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) {
         mutex_unlock(&qdma_dev->lock);
@@ -1619,8 +2033,9 @@ static int slash_qdma_ioctl_qpair_add(struct miscdevice *misc,
  *     (required for poll-mode operation per the reference driver).
  *   - qconf.cmpl_stat_en = 1: enable completion status generation
  *     (required for poll-mode operation per the reference driver).
- *   - qconf.aperture_size = 4096: page-granularity (4 KB) for descriptor
- *     addressing.  Each descriptor addresses one page-sized chunk.
+ *   - qconf.aperture_size = 0: disables libqdma keyhole mode so MM
+ *     transfers advance linearly through endpoint memory.  Non-zero
+ *     values are keyhole apertures and wrap addresses within that window.
  *   - qconf.desc_rng_sz_idx: CSR table index (0-15) selecting the
  *     descriptor ring depth.  Not a raw descriptor count — the actual
  *     count is looked up from the global CSR ring-size table.
@@ -1664,7 +2079,28 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc,
     qconf.cmpl_status_pend_chk = 1;                 /* Check pending completions (poll-mode req) */
     qconf.cmpl_stat_en = 1;                         /* Enable completion status generation */
 
-    qconf.aperture_size = 4096;                     /* Page-granularity descriptor addressing */
+    qconf.aperture_size = 0;                        /* Linear MM addressing; non-zero enables keyhole mode */
+    /*
+     * CPM5 exposes two MM channels.  The per-queue mm_channel selection
+     * (validated in slash_qdma_ioctl_qpair_add_w) chooses the channel: AUTO
+     * stripes across channels by (qid & 1); CHANNEL_0/CHANNEL_1 pin to a single
+     * channel.  libqdma mirrors mm_channel into the SW-context host_id, so this
+     * also selects the programmed Host Profile: channel 0 -> Host Profile 0
+     * (NoC Channel 0), channel 1 -> Host Profile 1 (NoC Channel 1).  See
+     * slash_qdma_program_host_profiles().
+     */
+    switch (req->mm_channel) {
+    case SLASH_QDMA_MM_CHANNEL_0:
+        qconf.mm_channel = 0;
+        break;
+    case SLASH_QDMA_MM_CHANNEL_1:
+        qconf.mm_channel = 1;
+        break;
+    case SLASH_QDMA_MM_CHANNEL_AUTO:
+    default:
+        qconf.mm_channel = req->qid & 1;
+        break;
+    }
 
     /* --- Per-direction ring configuration --- */
     switch (qtype) {
@@ -1688,8 +2124,9 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc,
     }
 
     SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
-                          "qdma_queue_add start: qid=%u type=%u mode=%u\n",
-                          req->qid, qtype, req->mode);
+                          "queue add qid=%u type=%u mode=%u mm_channel=%u (req=%u)\n",
+                          req->qid, qtype, req->mode, qconf.mm_channel,
+                          req->mm_channel);
     err = qdma_queue_add(qdma_dev->qdma_handle, &qconf, &qhndl,
                             errbuf, sizeof(errbuf));
     if (err) {
@@ -1705,6 +2142,38 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc,
                           "qdma_queue_add done: qid=%u type=%u qhndl=%lu\n",
                           req->qid, qtype, qhndl);
 
+    /*
+     * Reconfigure the queue immediately after adding it.
+     *
+     * qdma_queue_add() runs qdma_descq_config(..., reconfig=0), which on
+     * Versal hard IP does NOT mirror qconf.mm_channel into descq->channel --
+     * only the reconfig=1 branch does.  descq->channel feeds the SW-context
+     * mm_chn/host_id programmed when the queue is started; without this step
+     * it would stay 0 and collapse both queues onto NoC channel 0, defeating
+     * mm-channel selection.  Calling qdma_queue_config() here (the queue is in
+     * Q_STATE_ENABLED, before start) replays the same qconf through the
+     * reconfig=1 path, setting descq->channel.  This replaces the former
+     * 0002-libqdma-versal-channel.patch without modifying libqdma.
+     */
+    err = qdma_queue_config(qdma_dev->qdma_handle, qhndl, &qconf,
+                            errbuf, sizeof(errbuf));
+    if (err) {
+        SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
+                              "qdma_queue_config failed: qid=%u type=%u err=%d (%s)\n",
+                              req->qid, qtype, err, errbuf);
+        dev_err(&qdma_dev->pdev->dev,
+                "qdma: queue config failed (qid=%u, type=%u): %d (%s)\n",
+                req->qid, qtype, err, errbuf);
+        /*
+         * The queue was added but is not yet tracked in @entry, so the
+         * caller's rollback (keyed on its local added[] array) will not
+         * reach it.  Remove it here to avoid leaking the libqdma queue.
+         */
+        slash_qdma_queue_remove_safe(qdma_dev->qdma_handle, qhndl,
+                                     errbuf, sizeof(errbuf));
+        return err;
+    }
+
     /* Record the handle and mark this direction as active. */
     entry->qhndl[qtype] = qhndl;
     entry->dir_mask |= dir_bit;
@@ -1973,359 +2442,1052 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev,
 }
 
 /* ─────────────────────────────────────────────────────────────────────
- * DMA I/O: user buffer mapping, SGL construction, and transfer
+ * Kernel DMA buffers: page allocation, SGL, DMA mapping, mmap
+ *
+ * A buffer owns a set of individually-allocated 4 KiB base pages (not
+ * physically contiguous).  At creation time the pages are allocated, a
+ * one-descriptor-per-page SGL is built, and every page is DMA-mapped once;
+ * the steady-state transfer path then only slices the SGL, syncs the touched
+ * pages for the relevant DMA direction, and submits.  The same pages are also
+ * exposed to userspace through the buffer fd's mmap, so the CPU and the DMA
+ * engine share one allocation, coherent only at the transfer boundaries.
  * ───────────────────────────────────────────────────────────────────── */
 
 /**
- * slash_qdma_iocb_release() - Free resources in an I/O control block.
- * @iocb: The IOCB to clean up.
+ * slash_qdma_buf_dma_unmap() - Tear down the cached DMA mapping of a buffer.
+ * @buf: Buffer whose SGL entries were DMA-mapped.
  *
- * Frees the combined SGL + page-pointer allocation and clears the
- * pointers.  Does not unpin pages — that must be done separately via
- * slash_qdma_unmap_user_buf() before calling this.
+ * Unmaps every SGL entry that carries a non-zero dma_addr and clears it.
+ * Safe to call on a partially-mapped buffer (used on the create error path).
  */
-static inline void slash_qdma_iocb_release(struct slash_qdma_io_cb *iocb)
+static void slash_qdma_buf_dma_unmap(struct slash_qdma_buf *buf)
 {
-    if (iocb->pages)
-        iocb->pages = NULL;
+    struct device *dev = &buf->qdma_dev->pdev->dev;
+    unsigned int i;
 
-    kfree(iocb->sgl);
-    iocb->sgl = NULL;
-    iocb->buf = NULL;
+    if (!buf->sgl || !buf->dma_mapped)
+        return;
+
+    for (i = 0; i < buf->pages_nr; i++) {
+        struct qdma_sw_sg *sg = &buf->sgl[i];
+
+        if (sg->dma_addr) {
+            dma_unmap_page(dev, sg->dma_addr, sg->len, DMA_BIDIRECTIONAL);
+            sg->dma_addr = 0UL;
+        }
+    }
+
+    buf->dma_mapped = false;
 }
 
 /**
- * slash_qdma_unmap_user_buf() - Unpin user pages after a DMA transfer.
- * @iocb:  I/O control block with pinned pages.
- * @write: Transfer direction from the device's perspective.  If false
- *         (i.e., a C2H/read transfer), the pages were written to by the
- *         device and must be marked dirty so the VM knows the page
- *         contents have changed.
+ * slash_qdma_buf_dma_map() - DMA-map every SGL entry of a buffer.
+ * @buf: Buffer with a freshly built SGL.
  *
- * Iterates over pinned pages, marks them dirty if this was a read (C2H)
- * transfer (because the device wrote data into those user pages), and
- * releases each page reference acquired by get_user_pages_fast().
+ * Maps each page with DMA_BIDIRECTIONAL so the same cached mapping serves both
+ * H2C and C2H transfers.  On any failure all previously mapped entries are
+ * unmapped before returning.
+ *
+ * Return: 0 on success, negative errno on failure.
  */
-static void slash_qdma_unmap_user_buf(struct slash_qdma_io_cb *iocb, bool write)
+static int slash_qdma_buf_dma_map(struct slash_qdma_buf *buf)
 {
-    int i;
+    struct device *dev = &buf->qdma_dev->pdev->dev;
+    unsigned int i;
+
+    for (i = 0; i < buf->pages_nr; i++) {
+        struct qdma_sw_sg *sg = &buf->sgl[i];
+
+        sg->dma_addr = dma_map_page(dev, sg->pg, sg->offset, sg->len,
+                                    DMA_BIDIRECTIONAL);
+        if (dma_mapping_error(dev, sg->dma_addr)) {
+            sg->dma_addr = 0UL;
+            pr_err("slash: qdma: buffer DMA map failed at entry %u/%u\n",
+                   i, buf->pages_nr);
+            buf->dma_mapped = true; /* allow unmap of the entries done so far */
+            slash_qdma_buf_dma_unmap(buf);
+            return -ENOMEM;
+        }
+    }
 
-    if (!iocb->pages || !iocb->pages_nr)
-        return;
+    buf->dma_mapped = true;
+    return 0;
+}
 
-    for (i = 0; i < iocb->pages_nr; i++) {
-        if (iocb->pages[i]) {
-            /*
-             * For C2H (read) transfers (!write), the device wrote into
-             * these user pages, so mark them dirty to inform the VM.
-             */
-            if (!write)
-                set_page_dirty(iocb->pages[i]);
-            put_page(iocb->pages[i]);
-        } else {
-            break;
+/**
+ * slash_qdma_buf_free_pages() - Free a buffer's pages and SGL.
+ * @buf: Buffer to tear down.
+ *
+ * Releases each allocated page (put_page() so pages still mapped into a VMA
+ * stay alive until the last mapping is torn down) and frees the SGL/page
+ * arrays.  The DMA mapping must already have been removed.
+ */
+static void slash_qdma_buf_free_pages(struct slash_qdma_buf *buf)
+{
+    unsigned int i;
+
+    if (buf->pages) {
+        for (i = 0; i < buf->pages_nr; i++) {
+            if (buf->pages[i])
+                put_page(buf->pages[i]);
         }
     }
 
-    if (i != iocb->pages_nr)
-        pr_err("slash: qdma: sgl pages %d/%u.\n", i, iocb->pages_nr);
-
-    iocb->pages_nr = 0;
+    kvfree(buf->pages);
+    buf->pages = NULL;
+    kvfree(buf->sgl);
+    buf->sgl = NULL;
+    buf->pages_nr = 0;
 }
 
 /**
- * slash_qdma_map_user_buf_to_sgl() - Pin user pages and build a scatter-gather list.
- * @iocb:  I/O control block.  @iocb->buf and @iocb->len must be set
- *         before calling.  On success, @iocb->sgl, @iocb->pages, and
- *         @iocb->pages_nr are populated.
- * @write: Transfer direction (true = H2C write, false = C2H read).
+ * slash_qdma_buf_alloc() - Allocate pages, build the SGL, and DMA-map.
+ * @buf: Buffer with @length and @qdma_dev set; @granule defaults to PAGE_SIZE.
  *
- * Steps:
- *   1. Compute the number of pages spanned by the user buffer (accounting
- *      for the offset within the first page).
- *   2. Allocate a single contiguous block for the SGL entries and the
- *      page pointer array (avoids two allocations).
- *   3. Pin user pages via get_user_pages_fast() with write=1 (even for
- *      H2C, because libqdma may write status back).
- *   4. Build the qdma_sw_sg linked list: one entry per page, with the
- *      first entry's offset reflecting the sub-page position of the
- *      user buffer, and the last entry's length truncated to the
- *      remaining byte count.
- *   5. Flush the data cache for each page to ensure coherency between
- *      the CPU cache and the DMA engine's view of memory.
+ * Allocates @length / PAGE_SIZE individual base pages (not contiguous), builds
+ * a one-page-per-entry SGL, and DMA-maps every page.  All of this is the
+ * amortised, do-it-once setup cost paid by SLASH_QDMA_IOCTL_BUF_CREATE.
  *
- * Return: 0 on success, negative errno on failure (pages are unpinned
- *         and the SGL is freed on error).
+ * Return: 0 on success, negative errno on failure (partial state cleaned up).
  */
-static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb,
-                                          bool write)
+static int slash_qdma_buf_alloc(struct slash_qdma_buf *buf)
 {
-    unsigned long len = iocb->len;
-    char *buf = (char *)iocb->buf;
-    struct qdma_sw_sg *sg;
-    unsigned int pg_off = offset_in_page(buf);
-    unsigned int pages_nr = (len + pg_off + PAGE_SIZE - 1) >> PAGE_SHIFT;
-    int i;
+    size_t entries = buf->length / PAGE_SIZE;
+    unsigned int i;
     int rv;
 
-    if (len == 0)
-        pages_nr = 1;
-    if (pages_nr == 0)
+    if (buf->length == 0 || (buf->length % PAGE_SIZE) != 0 ||
+        entries == 0 || entries > UINT_MAX)
         return -EINVAL;
 
-    iocb->pages_nr = 0;
+    buf->granule = PAGE_SIZE;
+    buf->pages_nr = (unsigned int)entries;
 
-    /*
-     * Single allocation for both the SGL array and the page pointer
-     * array.  The page pointers are placed immediately after the SGL
-     * entries in memory.
-     */
-    sg = kmalloc(pages_nr * (sizeof(struct qdma_sw_sg) +
-                             sizeof(struct page *)), GFP_KERNEL);
-    if (!sg) {
-        pr_err("slash: qdma: sgl allocation failed for %u pages\n",
-               pages_nr);
+    buf->pages = kvcalloc(entries, sizeof(*buf->pages), GFP_KERNEL);
+    if (!buf->pages)
         return -ENOMEM;
-    }
-    memset(sg, 0, pages_nr * (sizeof(struct qdma_sw_sg) +
-                              sizeof(struct page *)));
-    iocb->sgl = sg;
-
-    /* Page pointer array lives right after the SGL entries. */
-    iocb->pages = (struct page **)(sg + pages_nr);
 
-    /*
-     * Pin the user pages into physical memory.  The write=1 flag tells
-     * the kernel these pages may be written to (needed for C2H, but we
-     * always request write permission for simplicity).
-     */
-    rv = get_user_pages_fast((unsigned long)buf, pages_nr,
-                             1 /* write */, iocb->pages);
-    if (rv < 0) {
-        pr_err("slash: qdma: unable to pin down %u user pages, %d\n",
-               pages_nr, rv);
-        goto err_out;
-    }
-    if (rv != pages_nr) {
-        pr_err("slash: qdma: unable to pin down all %u user pages, %d\n",
-               pages_nr, rv);
-        iocb->pages_nr = rv;
-        rv = -EFAULT;
-        goto err_out;
+    buf->sgl = kvcalloc(entries, sizeof(*buf->sgl), GFP_KERNEL);
+    if (!buf->sgl) {
+        kvfree(buf->pages);
+        buf->pages = NULL;
+        return -ENOMEM;
     }
 
-    /*
-     * Build the scatter-gather list.  Each entry describes one page's
-     * worth of data.  The first page may have a non-zero offset, and
-     * the last page may have fewer than PAGE_SIZE bytes.
-     */
-    sg = iocb->sgl;
-    for (i = 0; i < pages_nr; i++, sg++) {
-        unsigned int offset = offset_in_page(buf);
-        unsigned int nbytes = min_t(unsigned int,
-                                    PAGE_SIZE - offset, len);
-        struct page *pg = iocb->pages[i];
+    for (i = 0; i < entries; i++) {
+        struct page *pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        struct qdma_sw_sg *sg = &buf->sgl[i];
 
-        /* Ensure CPU cache is flushed so the DMA engine sees fresh data. */
-        flush_dcache_page(pg);
+        if (!pg) {
+            rv = -ENOMEM;
+            goto err_free;
+        }
 
-        sg->next = sg + 1;
+        buf->pages[i] = pg;
+        sg->next = (i + 1 < entries) ? &buf->sgl[i + 1] : NULL;
         sg->pg = pg;
-        sg->offset = offset;
-        sg->len = nbytes;
+        sg->offset = 0;
+        sg->len = PAGE_SIZE;
         sg->dma_addr = 0UL;
-
-        buf += nbytes;
-        len -= nbytes;
     }
 
-    /* Terminate the linked list. */
-    iocb->sgl[pages_nr - 1].next = NULL;
-    iocb->pages_nr = pages_nr;
-    return 0;
+    rv = slash_qdma_buf_dma_map(buf);
+    if (rv < 0)
+        goto err_free;
 
-err_out:
-    slash_qdma_unmap_user_buf(iocb, write);
-    slash_qdma_iocb_release(iocb);
+    return 0;
 
+err_free:
+    slash_qdma_buf_free_pages(buf);
     return rv;
 }
 
 /**
- * slash_qdma_qpair_read_write() - Perform a DMA transfer via a qpair fd.
- * @file:  The anon_inode file for this queue pair.
- * @buf:   User-space buffer (source for write/H2C, destination for read/C2H).
- * @count: Number of bytes to transfer.
- * @ppos:  File position — used as the device-side (endpoint) address.
- *         Updated on success to reflect the bytes transferred, enabling
- *         sequential positional I/O.
- * @write: true for H2C (host-to-card write), false for C2H (card-to-host read).
- *
- * Transfer flow:
- *   1. Validate context and check that the required direction (H2C or C2H)
- *      is enabled on this queue pair.
- *   2. Pin user pages and build a scatter-gather list.
- *   3. Populate a qdma_request:
- *      - ep_addr = *ppos: the device-side address (FPGA memory offset).
- *      - h2c_eot = 1: signals end-of-transfer to the FPGA, allowing it to
- *        process the complete data packet.
- *      - timeout_ms = 10000 (10 seconds): if the transfer doesn't complete
- *        in this time, qdma_request_submit returns an error.
- *      - fp_done = NULL: synchronous mode — the call blocks until completion.
- *        If fp_done were set, libqdma would call it asynchronously.
- *      - dma_mapped = 0: libqdma handles the DMA mapping internally.
- *   4. Submit to libqdma via qdma_request_submit().
- *   5. On success, advance *ppos by the number of bytes transferred.
- *   6. Unpin pages and free the SGL.
- *
- * Return: Number of bytes transferred (>= 0) on success, negative errno
- *         on failure.
- */
-static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf,
-                                           size_t count, loff_t *ppos,
-                                           bool write)
+ * slash_qdma_buf_sync_for_device() - Hand a transfer slice to the device.
+ * @buf:         Buffer being transferred.
+ * @start_entry: First page index of the slice.
+ * @n_entries:   Number of pages in the slice.
+ * @dir:         DMA direction (DMA_TO_DEVICE for H2C, DMA_FROM_DEVICE for C2H).
+ *
+ * Syncs exactly the pages a sub-transfer touches for device access, one
+ * dma_sync_single_for_device() call per SGL entry; no-ops on cache-coherent
+ * hosts.  The slice is not bounds-checked here: callers validate it first.
+ */
+static void slash_qdma_buf_sync_for_device(struct slash_qdma_buf *buf,
+                                           u64 start_entry, u64 n_entries,
+                                           enum dma_data_direction dir)
 {
-    struct slash_qdma_qpair_file_ctx *ctx = file->private_data;
-    struct slash_qdma_dev *qdma_dev;
-    struct slash_qdma_qpair_entry *entry;
-    struct slash_qdma_io_cb iocb;
-    struct qdma_request *req;
-    unsigned long qhndl;
-    ssize_t res;
-    int rv;
+    struct device *dev = &buf->qdma_dev->pdev->dev;
+    u64 i;
 
-    if (!ctx)
-        return -EINVAL;
-
-    qdma_dev = ctx->qdma_dev;
-    entry = ctx->entry;
-
-    if (!qdma_dev || !entry)
-        return -ENODEV;
-
-    /* Check device liveness and resolve the queue handle for the direction. */
-    mutex_lock(&qdma_dev->lock);
-    if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) {
-        mutex_unlock(&qdma_dev->lock);
-        return -ENODEV;
-    }
+    for (i = 0; i < n_entries; i++) {
+        struct qdma_sw_sg *sg = &buf->sgl[start_entry + i];
 
-    if (write) {
-        /* H2C: writing data from host to card */
-        if (!(entry->dir_mask & SLASH_QDMA_DIR_H2C) ||
-            !slash_qdma_qhndl_is_valid(entry->qhndl[Q_H2C])) {
-            mutex_unlock(&qdma_dev->lock);
-            return -ENODEV;
-        }
-        qhndl = entry->qhndl[Q_H2C];
-    } else {
-        /* C2H: reading data from card to host */
-        if (!(entry->dir_mask & SLASH_QDMA_DIR_C2H) ||
-            !slash_qdma_qhndl_is_valid(entry->qhndl[Q_C2H])) {
-            mutex_unlock(&qdma_dev->lock);
-            return -ENODEV;
-        }
-        qhndl = entry->qhndl[Q_C2H];
+        dma_sync_single_for_device(dev, sg->dma_addr, sg->len, dir);
     }
-    mutex_unlock(&qdma_dev->lock);
-
-    /* Pin user pages and build the scatter-gather list. */
-    memset(&iocb, 0, sizeof(iocb));
-    iocb.buf = buf;
-    iocb.len = count;
-    rv = slash_qdma_map_user_buf_to_sgl(&iocb, write);
-    if (rv < 0)
-        return rv;
-
-    /* Populate the libqdma request structure. */
-    req = &iocb.req;
-    req->sgcnt = iocb.pages_nr;         /* Number of SGL entries */
-    req->sgl = iocb.sgl;                /* Scatter-gather list */
-    req->write = write ? 1 : 0;         /* Direction flag for libqdma */
-    req->dma_mapped = 0;                /* Let libqdma handle DMA mapping */
-    req->udd_len = 0;                   /* No user-defined data */
-    req->ep_addr = (u64)*ppos;           /* Device-side (endpoint) address */
-    req->count = count;                  /* Total byte count */
-    req->timeout_ms = 10 * 1000;         /* 10-second timeout */
-    req->fp_done = NULL;                 /* Synchronous: block until complete */
-    req->h2c_eot = 1;                   /* End-of-transfer marker for FPGA */
-
-    SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
-                          "qdma_request_submit start: qid=%u qhndl=%lu write=%d count=%zu ep_addr=0x%llx\n",
-                          ctx->qid, qhndl, req->write, req->count,
-                          (unsigned long long)req->ep_addr);
-    res = qdma_request_submit(qdma_dev->qdma_handle, qhndl, req);
-    SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
-                          "qdma_request_submit done: qid=%u qhndl=%lu res=%zd\n",
-                          ctx->qid, qhndl, res);
+}
 
-    /* Advance the file position by the number of bytes transferred. */
-    if (res > 0)
-        *ppos += res;
+/**
+ * slash_qdma_buf_sync_for_cpu() - Reclaim a transfer slice for the CPU.
+ * @buf:         Buffer being transferred.
+ * @start_entry: First page index of the slice.
+ * @n_entries:   Number of pages in the slice.
+ * @dir:         DMA direction (DMA_FROM_DEVICE for a completed C2H read).
+ *
+ * Called after a transfer completes: makes device-written data visible to the
+ * CPU via one dma_sync_single_for_cpu() per SGL entry (no bounds checking).
+ */
+static void slash_qdma_buf_sync_for_cpu(struct slash_qdma_buf *buf,
+                                        u64 start_entry, u64 n_entries,
+                                        enum dma_data_direction dir)
+{
+    struct device *dev = &buf->qdma_dev->pdev->dev;
+    u64 i;
 
-    /* Unpin pages (marking dirty for C2H reads) and free the SGL. */
-    slash_qdma_unmap_user_buf(&iocb, write);
-    slash_qdma_iocb_release(&iocb);
+    for (i = 0; i < n_entries; i++) {
+        struct qdma_sw_sg *sg = &buf->sgl[start_entry + i];
 
-    return res;
+        dma_sync_single_for_cpu(dev, sg->dma_addr, sg->len, dir);
+    }
 }
 
 /**
- * slash_qdma_qpair_read() - Read (C2H) file operation for a qpair fd.
- * @file:  Anon_inode file for the queue pair.
- * @buf:   User-space destination buffer.
- * @count: Number of bytes to read.
- * @ppos:  Device-side address to read from.
- *
- * Thin wrapper that delegates to slash_qdma_qpair_read_write() with
- * write=false (C2H direction).
+ * slash_qdma_buf_release() - kref release callback for a buffer.
+ * @ref: kref embedded in the slash_qdma_buf being freed.
  *
- * Return: Bytes transferred or negative errno.
+ * Runs when the last reference drops (fd ref, every live VMA ref, and any
+ * in-flight transfer ref).  Unmaps the DMA mapping, then frees the pages and
+ * SGL, drops the device reference (if one is set), and frees the struct.
  */
-static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf,
-                                     size_t count, loff_t *ppos)
+static void slash_qdma_buf_release(struct kref *ref)
 {
-    return slash_qdma_qpair_read_write(file, buf, count, ppos, false);
+    struct slash_qdma_buf *buf =
+        container_of(ref, struct slash_qdma_buf, ref);
+    struct slash_qdma_dev *qdma_dev = buf->qdma_dev;
+
+    slash_qdma_buf_dma_unmap(buf); /* unmap first, while the pages still exist */
+    slash_qdma_buf_free_pages(buf);
+    if (qdma_dev)
+        kref_put(&qdma_dev->ref, slash_qdma_dev_release); /* ref from create */
+    kfree(buf);
 }
 
-/**
- * slash_qdma_qpair_write() - Write (H2C) file operation for a qpair fd.
- * @file:  Anon_inode file for the queue pair.
- * @buf:   User-space source buffer.
- * @count: Number of bytes to write.
- * @ppos:  Device-side address to write to.
- *
- * Thin wrapper that delegates to slash_qdma_qpair_read_write() with
- * write=true (H2C direction).
- *
- * Return: Bytes transferred or negative errno.
- */
-static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf,
-                                      size_t count, loff_t *ppos)
+static inline void slash_qdma_buf_get(struct slash_qdma_buf *buf)
 {
-    return slash_qdma_qpair_read_write(file, (char __user *)buf,
-                                       count, ppos, true);
+    kref_get(&buf->ref); /* balanced by slash_qdma_buf_put() */
 }
 
-/**
+static void slash_qdma_buf_put(struct slash_qdma_buf *buf)
+{
+    kref_put(&buf->ref, slash_qdma_buf_release); /* last put frees the buffer */
+}
+
+/* ─────────────────────────────────────────────────────────────────────
+ * Buffer fd: mmap support and lifetime
+ * ───────────────────────────────────────────────────────────────────── */
+
+/**
+ * slash_qdma_buf_vm_open() - VMA open callback (fork / VMA split).
+ * @vma: New VMA; vm_private_data is the buffer set by slash_qdma_buf_mmap().
+ *
+ * Takes one extra buffer reference for @vma, so each live VMA owns a ref and
+ * the pages (and DMA mapping) outlive the buffer fd if the mapping is kept.
+ */
+static void slash_qdma_buf_vm_open(struct vm_area_struct *vma)
+{
+    struct slash_qdma_buf *buf = vma->vm_private_data;
+
+    if (buf) /* tolerate a VMA without a buffer attached */
+        slash_qdma_buf_get(buf);
+}
+
+/**
+ * slash_qdma_buf_vm_close() - VMA close callback (munmap / exit).
+ * @vma: The VMA being torn down; its buffer reference is dropped here.
+ */
+static void slash_qdma_buf_vm_close(struct vm_area_struct *vma)
+{
+    struct slash_qdma_buf *buf = vma->vm_private_data;
+
+    if (buf)
+        slash_qdma_buf_put(buf); /* may be the final ref and free the buffer */
+}
+
+static const struct vm_operations_struct slash_qdma_buf_vm_ops = {
+    .open  = slash_qdma_buf_vm_open,  /* takes a buffer ref for the new VMA */
+    .close = slash_qdma_buf_vm_close, /* drops that VMA's buffer ref */
+};
+
+/**
+ * slash_qdma_buf_mmap() - mmap a kernel buffer's pages into userspace.
+ * @file: The buffer fd (private_data is the struct slash_qdma_buf).
+ * @vma:  The mapping request; must have vm_pgoff == 0 and span buf->length.
+ *
+ * Inserts all buf->pages_nr pages with vm_map_pages_zero() and, on success,
+ * installs the vm_ops and takes the first VMA reference (further VMAs get
+ * theirs from the .open callback), keeping the pages valid while mapped.
+ * NOTE(review): VM_SHARED is not required, so MAP_PRIVATE is accepted here.
+ *
+ * Return: 0 on success, -ENODEV / -EINVAL on bad input, or the mapping error.
+ */
+static int slash_qdma_buf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+    struct slash_qdma_buf *buf = file->private_data;
+    unsigned long span = vma->vm_end - vma->vm_start;
+    int rv;
+
+    if (!buf)
+        return -ENODEV;
+
+    /* Only a full, offset-0 mapping of the buffer is supported. */
+    if (vma->vm_pgoff != 0)
+        return -EINVAL;
+    if (span != (unsigned long)buf->length)
+        return -EINVAL;
+
+    /*
+     * Normal page mapping (no VM_PFNMAP): forbid expanding it beyond the
+     * buffer (VM_DONTEXPAND) and leave it out of core dumps (VM_DONTDUMP).
+     */
+    slash_vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
+
+    rv = vm_map_pages_zero(vma, buf->pages, buf->pages_nr);
+    if (rv)
+        return rv;
+
+    vma->vm_ops = &slash_qdma_buf_vm_ops;
+    vma->vm_private_data = buf;
+    slash_qdma_buf_get(buf); /* dropped by vm_close when this VMA goes away */
+
+    return 0;
+}
+
+/**
+ * slash_qdma_buf_fop_release() - Release callback for a buffer fd.
+ * @inode: Unused (anon inode).
+ * @file:  The buffer fd being closed; private_data is cleared afterwards.
+ *
+ * Drops the fd's buffer reference.  The buffer itself is only freed once
+ * every other holder (live VMAs, prepared transfers) has dropped its ref too.
+ *
+ * Return: Always 0.
+ */
+static int slash_qdma_buf_fop_release(struct inode *inode, struct file *file)
+{
+    struct slash_qdma_buf *buf = file->private_data;
+
+    (void)inode;
+
+    if (buf) {
+        slash_qdma_buf_put(buf); /* the reference created with the fd */
+        file->private_data = NULL;
+    }
+
+    return 0;
+}
+
+/**
+ * slash_qdma_buf_fops - File operations for buffer fds (mmap + release only).
+ * Its address doubles as the type tag checked by slash_qdma_buf_from_file().
+ * mmap     maps the buffer's pages for CPU access.
+ * release  drops the fd's reference on the buffer.
+ */
+static const struct file_operations slash_qdma_buf_fops = {
+    .owner   = THIS_MODULE,
+    .mmap    = slash_qdma_buf_mmap,
+    .release = slash_qdma_buf_fop_release,
+};
+
+/**
+ * slash_qdma_buf_from_file() - Resolve a buffer fd to its buffer object.
+ * @file: A file obtained from fget() on a candidate buffer fd (may be NULL).
+ *
+ * Return: The buffer (no reference taken) if @file uses our fops, else NULL.
+ */
+static struct slash_qdma_buf *slash_qdma_buf_from_file(struct file *file)
+{
+    if (!file || file->f_op != &slash_qdma_buf_fops) /* f_op is the type tag */
+        return NULL;
+    return file->private_data;
+}
+
+/* ─────────────────────────────────────────────────────────────────────
+ * IOCTL: buffer create
+ * ───────────────────────────────────────────────────────────────────── */
+
+/**
+ * slash_qdma_ioctl_buf_create_w() - Allocate a kernel buffer and return its fd.
+ * @misc:     Miscdevice handle (for logging).
+ * @qdma_dev: QDMA device the buffer is bound to; one device ref is taken.
+ * @uarg:     User pointer to a size-prefixed struct slash_qdma_buf_create.
+ *
+ * Reads the leading __u32 size, copies at most sizeof(req) bytes, and accepts
+ * only O_CLOEXEC in flags and a non-zero, page-multiple length.  The buffer
+ * is allocated via slash_qdma_buf_alloc(), wrapped in an anon_inode fd, and
+ * size/granule/transfer_hint are written back before the fd is installed, so
+ * a failed copy-out never leaves a live fd behind.
+ *
+ * Return: The new buffer fd (>= 0) on success, negative errno on failure.
+ */
+static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc,
+                                         struct slash_qdma_dev *qdma_dev,
+                                         void __user *uarg)
+{
+    struct slash_qdma_buf_create req;
+    struct slash_qdma_buf *buf;
+    struct file *file;
+    __u32 user_size = 0;
+    size_t copy_size;
+    int fd;
+    int rv;
+
+    if (copy_from_user(&user_size, uarg, sizeof(user_size)))
+        return -EFAULT;
+
+    if (user_size < SLASH_QDMA_BUF_CREATE_MIN_SIZE) {
+        dev_warn(misc->this_device,
+                 "qdma: BUF_CREATE size too small (%u)\n", user_size);
+        return -EINVAL;
+    }
+
+    memset(&req, 0, sizeof(req)); /* fields beyond user_size read as zero */
+    if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req))))
+        return -EFAULT;
+
+    if (req.flags & ~O_CLOEXEC)
+        return -EINVAL;
+
+    if (req.length == 0 || (req.length % PAGE_SIZE) != 0)
+        return -EINVAL;
+
+    buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+    if (!buf)
+        return -ENOMEM;
+
+    kref_init(&buf->ref);
+    /* The buffer holds a device reference for its whole lifetime. */
+    kref_get(&qdma_dev->ref);
+    buf->qdma_dev = qdma_dev;
+    buf->length = req.length;
+
+    rv = slash_qdma_buf_alloc(buf);
+    if (rv < 0) {
+        kref_put(&qdma_dev->ref, slash_qdma_dev_release);
+        kfree(buf); /* freed directly: the release callback is not used here */
+        return rv;
+    }
+
+    file = anon_inode_getfile("slash_qdma_buf", &slash_qdma_buf_fops, buf,
+                              O_RDWR | (req.flags & O_CLOEXEC));
+    if (IS_ERR(file)) {
+        rv = PTR_ERR(file);
+        slash_qdma_buf_put(buf); /* drops the only ref: frees buf + dev ref */
+        return rv;
+    }
+
+    fd = get_unused_fd_flags(req.flags & O_CLOEXEC);
+    if (fd < 0) {
+        fput(file); /* triggers buf release */
+        return fd;
+    }
+
+    SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
+                          "buf create: fd=%d len=%llu granule=%u pages=%u\n",
+                          fd, (unsigned long long)req.length,
+                          buf->granule, buf->pages_nr);
+
+    /* Fill the output fields before installing the fd. */
+    req.size = sizeof(req);
+    req.granule = buf->granule;
+    req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80;
+    copy_size = min_t(size_t, user_size, sizeof(req));
+    if (copy_to_user(uarg, &req, copy_size)) {
+        put_unused_fd(fd);
+        fput(file); /* file owns the buffer ref, so this releases the buffer */
+        return -EFAULT;
+    }
+    if (user_size > sizeof(req)) { /* zero the tail this kernel does not know */
+        if (clear_user((void __user *)((unsigned long)uarg + sizeof(req)),
+                       user_size - sizeof(req))) {
+            put_unused_fd(fd);
+            fput(file);
+            return -EFAULT;
+        }
+    }
+
+    fd_install(fd, file);
+
+    return fd;
+}
+
+/**
+ * struct slash_qdma_xfer_req - Runtime state for one sub-transfer submission.
+ * @qreq:           libqdma request (built by slash_qdma_xfer_prep()).
+ * @done:           Completion signalled by @qreq.fp_done for async submissions.
+ * @buf:            Kernel buffer the transfer references (one ref held).
+ * @qhndl:          Resolved libqdma queue handle for the direction/qpair.
+ * @start_entry:    First page index of the buffer slice being transferred.
+ * @n_entries:      Number of pages in the slice (for the DMA sync).
+ * @dma_dir:        DMA direction for the streaming sync calls.
+ * @is_c2h:         True for a C2H (device-to-host) sub-transfer, so the slice
+ *                  is synced back for the CPU after completion.
+ * @bytes_done:     Bytes transferred, filled on completion.
+ * @err:            Negative errno if the sub-transfer failed, else 0.
+ * @async_inflight: True once queued asynchronously and awaiting fp_done.
+ *
+ * Allocated as an array (one per sub-transfer) for the duration of a transfer
+ * batch.  @qreq must outlive the in-flight async request, so the array stays
+ * alive until every async completion has fired.
+ */
+struct slash_qdma_xfer_req {
+    struct qdma_request qreq; /* embedded: fp_done callbacks use container_of() */
+    struct completion done;   /* initialised by the submitter, not by prep */
+    struct slash_qdma_buf *buf;
+    unsigned long qhndl;
+    u64 start_entry;
+    u64 n_entries;
+    enum dma_data_direction dma_dir;
+    bool is_c2h;
+    unsigned int bytes_done;
+    int err;
+    bool async_inflight;
+};
+
+/**
+ * slash_qdma_xfer_done() - libqdma fp_done callback for async sub-transfers.
+ * @qreq:       The completed request (embedded in a slash_qdma_xfer_req).
+ * @bytes_done: Bytes transferred.
+ * @err:        Negative errno on failure, else 0.
+ *
+ * Stores @bytes_done and @err in the enclosing slash_qdma_xfer_req, then wakes
+ * the submitter waiting on @done.  Runs in libqdma worker-thread context.
+ *
+ * Return: Always 0 (libqdma may free/re-task the request).
+ */
+static int slash_qdma_xfer_done(struct qdma_request *qreq,
+                                unsigned int bytes_done, int err)
+{
+    struct slash_qdma_xfer_req *xr =
+        container_of(qreq, struct slash_qdma_xfer_req, qreq);
+
+    xr->bytes_done = bytes_done;
+    xr->err = err;
+    complete(&xr->done); /* results are stored before the waiter is woken */
+    return 0;
+}
+
+/**
+ * slash_qdma_xfer_prep() - Validate one sub-transfer and build its request.
+ * @qdma_dev: QDMA device.
+ * @entry:    Queue pair entry selected by the sub-transfer's qpair_index.
+ * @desc:     User-supplied sub-transfer descriptor.
+ * @xr:       [out] Receives a built (but not yet submitted) request, and a
+ *            reference on the kernel buffer it targets.
+ *
+ * Shared submit core used by both the synchronous transfer ioctl and the
+ * optional io_uring uring_cmd path.  Resolves the buffer fd named by the
+ * descriptor and refs the buffer, validates the slice against the buffer's
+ * page granule and length, resolves the queue handle for the requested
+ * direction, syncs the slice for the device, and fills the cached,
+ * pre-DMA-mapped SGL slice into @xr->qreq (dma_mapped = 1, fp_done = NULL).
+ * No pages are allocated or DMA-mapped here; that was amortised at creation.
+ *
+ * On success the caller owns the buffer ref in @xr->buf and must release it
+ * with slash_qdma_buf_put() once the request is no longer in flight.
+ *
+ * Return: 0, or -EINVAL/-EBADF/-ENODEV with no buffer reference left held.
+ */
+static int slash_qdma_xfer_prep(struct slash_qdma_dev *qdma_dev,
+                                struct slash_qdma_qpair_entry *entry,
+                                const struct slash_qdma_subxfer *desc,
+                                struct slash_qdma_xfer_req *xr)
+{
+    struct slash_qdma_buf *buf;
+    struct file *file;
+    unsigned long qhndl;
+    bool write;
+    u32 dir_bit;
+    enum queue_type_t qtype;
+    enum dma_data_direction dma_dir;
+    u64 start_entry, n_entries;
+
+    switch (desc->direction) {
+    case SLASH_QDMA_XFER_H2C:
+        write = true;
+        dir_bit = SLASH_QDMA_DIR_H2C;
+        qtype = Q_H2C;
+        dma_dir = DMA_TO_DEVICE;
+        break;
+    case SLASH_QDMA_XFER_C2H:
+        write = false;
+        dir_bit = SLASH_QDMA_DIR_C2H;
+        qtype = Q_C2H;
+        dma_dir = DMA_FROM_DEVICE;
+        break;
+    default:
+        return -EINVAL;
+    }
+
+    /* libqdma's request count is a 32-bit byte count. */
+    if (desc->length == 0 || desc->length > UINT_MAX)
+        return -EINVAL;
+
+    /* Resolve the buffer fd and take a ref that outlives the fd. */
+    file = fget(desc->buf_fd);
+    if (!file)
+        return -EBADF;
+    buf = slash_qdma_buf_from_file(file);
+    if (!buf) {
+        fput(file);
+        return -EINVAL;
+    }
+    /* DMA mappings are device-specific: the buffer must belong to this device. */
+    if (buf->qdma_dev != qdma_dev) {
+        fput(file);
+        return -EINVAL;
+    }
+    slash_qdma_buf_get(buf); /* taken while the file still pins the buffer */
+    fput(file);
+
+    /* Validate the requested slice against the buffer's page granule. */
+    if (buf->granule == 0 ||
+        (desc->buf_offset % buf->granule) != 0 ||
+        (desc->length % buf->granule) != 0) {
+        slash_qdma_buf_put(buf);
+        return -EINVAL;
+    }
+    if (desc->buf_offset > buf->length ||
+        desc->length > buf->length - desc->buf_offset) { /* overflow-safe form */
+        slash_qdma_buf_put(buf);
+        return -EINVAL;
+    }
+
+    start_entry = desc->buf_offset / buf->granule;
+    n_entries = desc->length / buf->granule;
+    if (start_entry + n_entries > buf->pages_nr) {
+        slash_qdma_buf_put(buf);
+        return -EINVAL;
+    }
+
+    /* Check device liveness and resolve the queue handle for the direction. */
+    mutex_lock(&qdma_dev->lock);
+    if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) {
+        mutex_unlock(&qdma_dev->lock);
+        slash_qdma_buf_put(buf);
+        return -ENODEV;
+    }
+    if (!(entry->dir_mask & dir_bit) ||
+        !slash_qdma_qhndl_is_valid(entry->qhndl[qtype])) {
+        mutex_unlock(&qdma_dev->lock);
+        slash_qdma_buf_put(buf);
+        return -ENODEV;
+    }
+    qhndl = entry->qhndl[qtype];
+    mutex_unlock(&qdma_dev->lock);
+
+    /*
+     * Hand the touched pages to the device.  The mapping is persistent
+     * (dma_mapped = 1); only this slice is synced, so coherency cost scales
+     * with the transfer, not the whole buffer.
+     */
+    slash_qdma_buf_sync_for_device(buf, start_entry, n_entries, dma_dir);
+
+    /*
+     * Build the request from the cached SGL slice.  dma_mapped = 1 tells
+     * libqdma the SGL is already DMA-mapped (dma_addr filled at creation),
+     * so it skips the per-request map/unmap entirely.
+     */
+    memset(&xr->qreq, 0, sizeof(xr->qreq));
+    xr->qreq.sgcnt = (unsigned int)n_entries;
+    xr->qreq.sgl = &buf->sgl[start_entry];
+    xr->qreq.write = write ? 1 : 0;
+    xr->qreq.dma_mapped = 1;
+    xr->qreq.udd_len = 0;
+    xr->qreq.ep_addr = (u64)desc->dev_addr;
+    xr->qreq.count = (unsigned int)desc->length; /* <= UINT_MAX, checked above */
+    xr->qreq.timeout_ms = 10 * 1000; /* 10 seconds */
+    xr->qreq.fp_done = NULL; /* async submitters set their own callback */
+    xr->qreq.h2c_eot = 1;
+
+    xr->buf = buf;
+    xr->qhndl = qhndl;
+    xr->start_entry = start_entry;
+    xr->n_entries = n_entries;
+    xr->dma_dir = dma_dir;
+    xr->is_c2h = !write;
+    xr->bytes_done = 0;
+    xr->err = 0;
+    xr->async_inflight = false;
+    return 0;
+}
+
+/**
+ * slash_qdma_xfer_finish() - Post-completion DMA sync + buffer ref drop.
+ * @xr: A prepared (and now completed or failed) sub-transfer request.
+ *
+ * Every C2H slice was synced for the device in prep, so it is always handed
+ * back to the CPU here, even on failure, before the prep-time ref is dropped.
+ */
+static void slash_qdma_xfer_finish(struct slash_qdma_xfer_req *xr)
+{
+    if (xr->is_c2h) /* a failed transfer may still have landed partial data */
+        slash_qdma_buf_sync_for_cpu(xr->buf, xr->start_entry, xr->n_entries,
+                                    xr->dma_dir);
+    slash_qdma_buf_put(xr->buf);
+}
+
+/**
+ * slash_qdma_qpair_transfer() - Buffer DMA transfer batch on a queue-pair fd.
+ * @file: Anon_inode file for the queue-pair collection.
+ * @uarg: User pointer to a struct slash_qdma_transfer (1..N sub-transfers).
+ *
+ * Prepares every sub-transfer first (any failure aborts before submission),
+ * then submits all but the last asynchronously (fp_done set) and the last one
+ * synchronously, and finally waits for the async ones.  A single sub-transfer
+ * therefore takes the plain blocking path.  Bytes moved by successful
+ * sub-transfers are not reported when any sub-transfer fails.
+ *
+ * Return: total bytes transferred (>= 0), or the first sub-transfer error.
+ * NOTE(review): the total may exceed INT_MAX; confirm userspace handles it.
+ */
+static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg)
+{
+    struct slash_qdma_qpair_file_ctx *ctx = file->private_data;
+    struct slash_qdma_dev *qdma_dev;
+    struct slash_qdma_transfer req;
+    struct slash_qdma_xfer_req *xrs;
+    __u32 user_size = 0;
+    u32 count, i, last;
+    u64 total = 0;
+    int first_err = 0;
+    ssize_t res;
+
+    if (!ctx)
+        return -EINVAL;
+
+    qdma_dev = ctx->qdma_dev;
+
+    if (!qdma_dev || ctx->n_qpairs == 0)
+        return -ENODEV;
+
+    if (copy_from_user(&user_size, uarg, sizeof(user_size)))
+        return -EFAULT;
+
+    if (user_size < SLASH_QDMA_TRANSFER_MIN_SIZE)
+        return -EINVAL;
+
+    memset(&req, 0, sizeof(req)); /* descriptors beyond user_size read as zero */
+    if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req))))
+        return -EFAULT;
+
+    count = req.count;
+    if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS)
+        return -EINVAL;
+
+    xrs = kcalloc(count, sizeof(*xrs), GFP_KERNEL);
+    if (!xrs)
+        return -ENOMEM;
+
+    /* Validate and prepare every sub-transfer (each takes a buffer ref). */
+    for (i = 0; i < count; i++) {
+        const struct slash_qdma_subxfer *d = &req.xfers[i];
+        int rv;
+
+        if (d->qpair_index >= ctx->n_qpairs)
+            rv = -EINVAL;
+        else
+            rv = slash_qdma_xfer_prep(qdma_dev,
+                                      ctx->entries[d->qpair_index], d,
+                                      &xrs[i]);
+        if (rv) {
+            while (i-- > 0) /* unwind the refs taken by earlier preps */
+                slash_qdma_buf_put(xrs[i].buf);
+            kfree(xrs);
+            return rv;
+        }
+
+        SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev,
+                              "transfer[%u]: qid=%u buf_fd=%d off=%llu dev=0x%llx len=%llu dir=%s\n",
+                              i, ctx->qids[d->qpair_index], d->buf_fd,
+                              (unsigned long long)d->buf_offset,
+                              (unsigned long long)d->dev_addr,
+                              (unsigned long long)d->length,
+                              d->direction == SLASH_QDMA_XFER_H2C ? "H2C" : "C2H");
+    }
+
+    last = count - 1;
+
+    /*
+     * Submit all but the last asynchronously so the sub-transfers run on their
+     * (distinct) queue pairs in parallel; libqdma calls fp_done on completion.
+     */
+    for (i = 0; i < last; i++) {
+        init_completion(&xrs[i].done); /* must be ready before fp_done can run */
+        xrs[i].qreq.fp_done = slash_qdma_xfer_done;
+        res = qdma_request_submit(qdma_dev->qdma_handle, xrs[i].qhndl,
+                                  &xrs[i].qreq);
+        if (res < 0)
+            xrs[i].err = (int)res; /* not queued: fp_done will not fire */
+        else
+            xrs[i].async_inflight = true;
+    }
+
+    /* Submit the last sub-transfer synchronously (blocks until complete). */
+    res = qdma_request_submit(qdma_dev->qdma_handle, xrs[last].qhndl,
+                              &xrs[last].qreq);
+    if (res < 0)
+        xrs[last].err = (int)res;
+    else
+        xrs[last].bytes_done = (unsigned int)res;
+
+    /* Wait for the async sub-transfers, then aggregate (first error wins). */
+    for (i = 0; i < last; i++) {
+        if (xrs[i].async_inflight)
+            wait_for_completion(&xrs[i].done); /* xrs must stay alive until here */
+    }
+
+    for (i = 0; i < count; i++) {
+        if (xrs[i].err && !first_err)
+            first_err = xrs[i].err;
+        total += xrs[i].bytes_done;
+        slash_qdma_xfer_finish(&xrs[i]); /* drops the ref taken in prep */
+    }
+
+    kfree(xrs);
+
+    if (first_err)
+        return (long)first_err;
+
+    return (long)total;
+}
+
+#if defined(SLASH_HAVE_URING_CMD)
+/**
+ * struct slash_qdma_uring_cmd_ctx - Async state for one uring_cmd transfer.
+ * @cmd:         The io_uring command being served.
+ * @xrs:         Per-sub-transfer requests (buffer refs held until completion).
+ * @count:       Number of sub-transfers.
+ * @outstanding: Sub-transfers not yet completed; the one that drops it to 0
+ *               schedules the completion task-work.
+ * @total_bytes: Aggregate bytes transferred.
+ * @first_err:   First negative errno seen, or 0.
+ *
+ * Heap-allocated for the lifetime of an asynchronous transfer; a pointer to it
+ * is stashed in cmd->pdu so the completion task-work can recover it.
+ */
+struct slash_qdma_uring_cmd_ctx {
+    struct io_uring_cmd *cmd;
+    struct slash_qdma_xfer_req xrs[SLASH_QDMA_FD_MAX_QPAIRS]; /* first @count used */
+    u32 count;
+    atomic_t outstanding;      /* decremented once per fp_done callback */
+    atomic_long_t total_bytes; /* summed across completion callbacks */
+    atomic_t first_err;        /* set via cmpxchg from 0, so the first wins */
+};
+
+/**
+ * slash_qdma_uring_cmd_complete() - Task-work that finishes a uring_cmd.
+ * @cmd:         The io_uring command.
+ * @issue_flags: io_uring issue flags for io_uring_cmd_done().
+ *
+ * Runs in task context once all sub-transfers have completed: finishes each
+ * sub-transfer (C2H sync + buffer ref drop), completes the CQE with the total
+ * bytes (or first error), and frees the command context.
+ */
+static void slash_qdma_uring_cmd_complete(struct io_uring_cmd *cmd,
+                                          unsigned int issue_flags)
+{
+    struct slash_qdma_uring_cmd_ctx *uc;
+    int err;
+    long ret;
+    u32 i;
+
+    memcpy(&uc, cmd->pdu, sizeof(uc)); /* pdu stores the ctx pointer itself */
+    err = atomic_read(&uc->first_err);
+    ret = err ? err : atomic_long_read(&uc->total_bytes);
+
+    for (i = 0; i < uc->count; i++)
+        slash_qdma_xfer_finish(&uc->xrs[i]);
+
+    io_uring_cmd_done(cmd, ret, 0, issue_flags);
+    kfree(uc); /* result was read into ret above; uc is not used after this */
+}
+
+/**
+ * slash_qdma_uring_xfer_done() - Completion hook for one async sub-transfer.
+ * @qreq:       Finished request; embedded in a slash_qdma_xfer_req.
+ * @bytes_done: Bytes moved by this sub-transfer.
+ * @err:        0 on success, negative errno otherwise.
+ *
+ * Records the outcome on the request, folds it into the command context from
+ * uld_data (first error wins) and, when this is the last outstanding
+ * sub-transfer, queues the CQE task-work.  Runs in libqdma worker context.
+ *
+ * Return: Always 0.
+ */
+static int slash_qdma_uring_xfer_done(struct qdma_request *qreq,
+                                      unsigned int bytes_done, int err)
+{
+    struct slash_qdma_uring_cmd_ctx *state =
+        (struct slash_qdma_uring_cmd_ctx *)qreq->uld_data;
+    struct slash_qdma_xfer_req *sub =
+        container_of(qreq, struct slash_qdma_xfer_req, qreq);
+
+    sub->bytes_done = bytes_done;
+    sub->err = err;
+    if (bytes_done != 0)
+        atomic_long_add(bytes_done, &state->total_bytes);
+    if (err != 0)
+        atomic_cmpxchg(&state->first_err, 0, err);
+
+    if (!atomic_dec_and_test(&state->outstanding))
+        return 0;
+    io_uring_cmd_complete_in_task(state->cmd, slash_qdma_uring_cmd_complete);
+    return 0;
+}
+
+/**
+ * slash_qdma_qpair_uring_cmd() - Asynchronous transfer batch via io_uring.
+ * @cmd:         The io_uring command; its inline SQE data is a single __u64
+ *               userspace pointer to a struct slash_qdma_transfer.
+ * @issue_flags: io_uring issue flags (IO_URING_F_NONBLOCK yields -EAGAIN).
+ *
+ * The optional async sibling of SLASH_QDMA_QPAIR_IOCTL_TRANSFER: it prepares
+ * every sub-transfer, submits them all asynchronously (so they run on their
+ * distinct queue pairs concurrently), and completes the CQE from task-work
+ * once they all finish.  Many such commands can be in flight at once, which is
+ * the intended multi-buffer optimization.
+ *
+ * Return: -EIOCBQUEUED once submission is under way (the CQE then carries the
+ *         byte total or first error, inline submit failures included);
+ *         -EAGAIN for a non-blocking issue; else a negative errno.
+ */
+static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd,
+                                      unsigned int issue_flags)
+{
+    struct file *file = cmd->file;
+    struct slash_qdma_qpair_file_ctx *ctx = file->private_data;
+    struct slash_qdma_dev *qdma_dev;
+    struct slash_qdma_uring_cmd_ctx *uc;
+    struct slash_qdma_transfer req;
+    u64 uptr = 0;
+    u32 count, i;
+    ssize_t res;
+
+    if (cmd->cmd_op != SLASH_QDMA_URING_CMD_TRANSFER)
+        return -EOPNOTSUPP;
+
+    if (!ctx)
+        return -EINVAL;
+
+    qdma_dev = ctx->qdma_dev;
+    if (!qdma_dev || ctx->n_qpairs == 0)
+        return -ENODEV;
+
+    /*
+     * Copying the descriptor from userspace may fault and sleep, so defer a
+     * non-blocking issue to a blocking io_uring context.
+     */
+    if (issue_flags & IO_URING_F_NONBLOCK)
+        return -EAGAIN;
+
+    /* The SQE inline command carries the user pointer to the descriptor. */
+    memcpy(&uptr, slash_qdma_uring_cmd_payload(cmd), sizeof(uptr));
+
+    memset(&req, 0, sizeof(req));
+    if (copy_from_user(&req, u64_to_user_ptr(uptr), sizeof(req)))
+        return -EFAULT;
+
+    count = req.count; /* NOTE(review): req.size is not checked on this path */
+    if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS)
+        return -EINVAL;
+
+    uc = kzalloc(sizeof(*uc), GFP_KERNEL);
+    if (!uc)
+        return -ENOMEM;
+
+    uc->cmd = cmd;
+    uc->count = count;
+    atomic_set(&uc->outstanding, count);
+    atomic_long_set(&uc->total_bytes, 0);
+    atomic_set(&uc->first_err, 0);
+
+    /* Validate and prepare every sub-transfer before queueing any of them. */
+    for (i = 0; i < count; i++) {
+        const struct slash_qdma_subxfer *d = &req.xfers[i];
+        int rv;
+
+        if (d->qpair_index >= ctx->n_qpairs)
+            rv = -EINVAL;
+        else
+            rv = slash_qdma_xfer_prep(qdma_dev,
+                                      ctx->entries[d->qpair_index], d,
+                                      &uc->xrs[i]);
+        if (rv) {
+            while (i-- > 0) /* NOTE(review): buf_put, not xfer_finish? */
+                slash_qdma_buf_put(uc->xrs[i].buf);
+            kfree(uc);
+            return rv;
+        }
+        uc->xrs[i].qreq.uld_data = (unsigned long)uc;
+        uc->xrs[i].qreq.fp_done = slash_qdma_uring_xfer_done;
+    }
+
+    /* Stash the context for the completion task-work. */
+    memcpy(cmd->pdu, &uc, sizeof(uc));
+
+    /*
+     * Submit all sub-transfers asynchronously.  Every outcome, including the
+     * inline submit-failure path below, decrements the outstanding counter,
+     * so the completion task-work (which posts the CQE) is queued just once.
+     */
+    for (i = 0; i < count; i++) {
+        res = qdma_request_submit(qdma_dev->qdma_handle, uc->xrs[i].qhndl,
+                                  &uc->xrs[i].qreq);
+        if (res < 0) {
+            /* Not queued: fp_done will not fire, account for it here. */
+            uc->xrs[i].err = (int)res;
+            atomic_cmpxchg(&uc->first_err, 0, (int)res);
+            if (atomic_dec_and_test(&uc->outstanding))
+                io_uring_cmd_complete_in_task(uc->cmd,
+                                              slash_qdma_uring_cmd_complete);
+        }
+    }
+
+    return -EIOCBQUEUED;
+}
+#endif /* SLASH_HAVE_URING_CMD */
+
+/**
  * slash_qdma_qpair_ioctl() - Ioctl handler for per-qpair anon_inode fds.
  * @file: Anon_inode file.
  * @cmd:  Ioctl command number.
  * @arg:  User-space argument.
  *
- * Currently a stub — no per-fd ioctls are defined.  Returns -ENOTTY
- * for all commands.
+ * Supports SLASH_QDMA_IOCTL_BUF_CREATE (allocate a kernel buffer for clients
+ * that hold only a queue-pair fd) and SLASH_QDMA_QPAIR_IOCTL_TRANSFER (buffer
+ * DMA transfer).
  *
- * Return: -ENOTTY (no valid ioctl).
+ * Return: TRANSFER: bytes moved (>= 0); BUF_CREATE: a new fd; either may fail
+ *         with a negative errno.  -ENODEV without a device, else -ENOTTY.
  */
 static long slash_qdma_qpair_ioctl(struct file *file,
                                    unsigned int cmd, unsigned long arg)
 {
-    (void)file;
-    (void)cmd;
-    (void)arg;
+    struct slash_qdma_qpair_file_ctx *ctx = file->private_data;
 
-    return -ENOTTY;
+    if (!ctx || !ctx->qdma_dev)
+        return -ENODEV;
+
+    switch (cmd) {
+    case SLASH_QDMA_IOCTL_BUF_CREATE:
+        return slash_qdma_ioctl_buf_create_w(&ctx->qdma_dev->misc,
+                                             ctx->qdma_dev,
+                                             (void __user *)arg);
+    case SLASH_QDMA_QPAIR_IOCTL_TRANSFER:
+        return slash_qdma_qpair_transfer(file, (void __user *)arg);
+    default:
+        return -ENOTTY;
+    }
 }
 
 /**
@@ -2346,12 +3508,15 @@ static long slash_qdma_qpair_ioctl(struct file *file,
 static int slash_qdma_qpair_release(struct inode *inode, struct file *file)
 {
     struct slash_qdma_qpair_file_ctx *ctx = file->private_data;
+    u32 i;
 
     (void)inode;
 
     if (ctx) {
-        if (ctx->entry)
-            slash_qdma_qpair_put(ctx->entry);
+        for (i = 0; i < ctx->n_qpairs; i++) {
+            if (ctx->entries[i])
+                slash_qdma_qpair_put(ctx->entries[i]);
+        }
         if (ctx->qdma_dev)
             kref_put(&ctx->qdma_dev->ref, slash_qdma_dev_release);
         kfree(ctx);
@@ -2366,24 +3531,26 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file)
  * ───────────────────────────────────────────────────────────────────── */
 
 /**
- * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for a queue pair.
+ * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for queue I/O.
  * @misc:     Miscdevice handle (unused).
  * @qdma_dev: QDMA device.
  * @uarg:     User-space pointer to a slash_qdma_qpair_fd_request struct.
  *
- * Creates an anonymous inode file descriptor that userspace can use
- * for read() (C2H) and write() (H2C) DMA transfers on the specified
- * queue pair.  The fd holds references to both the qpair entry and the
- * device, preventing either from being freed while the fd is open.
+ * Creates an anonymous inode file descriptor that userspace can use for
+ * buffer transfer ioctls.  The fd is a collection of one or two queue pairs
+ * (see slash_qdma_qpair_fd_request): @qpair_count == 0 binds the single qpair
+ * named by @qid (back-compat), otherwise @qpair_count IDs from @qpair_ids are
+ * bound, their array index becoming the transfer qpair_index.
  *
- * The only supported flag is O_CLOEXEC (close-on-exec).
+ * The fd holds references to each bound qpair entry and the device, preventing
+ * either from being freed while the fd is open.  Each bound qpair keeps the
+ * per-qpair configuration (mm_channel, ring sizes, directions) it was given at
+ * add time, so the channels can differ.
  *
- * The file is created with FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE
- * enabled, allowing pread/pwrite and lseek to set the device-side
- * address for DMA transfers.
+ * The only supported flag is O_CLOEXEC (close-on-exec).
  *
- * Error handling: on any failure after resources are acquired, all
- * refs and allocations are cleaned up before returning.
+ * Error handling: on any failure after resources are acquired, all refs and
+ * allocations are cleaned up before returning.
  *
  * Return: The new fd (>= 0) on success, negative errno on failure.
  */
@@ -2393,8 +3560,10 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc,
 {
     struct slash_qdma_qpair_fd_request req;
     __u32 user_size = 0;
+    __u32 ids[SLASH_QDMA_FD_MAX_QPAIRS];
+    u32 n_qpairs;
+    u32 i;
     size_t copy_size;
-    struct slash_qdma_qpair_entry *entry;
     struct slash_qdma_qpair_file_ctx *ctx;
     struct file *file;
     int fd;
@@ -2418,55 +3587,75 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc,
     if (req.flags & ~O_CLOEXEC)
         return -EINVAL;
 
-    /* Look up the qpair entry and take refs while holding the lock. */
+    /*
+     * Resolve the requested qpair-id set.  qpair_count == 0 is the legacy
+     * single-qpair form using @qid; otherwise bind @qpair_count ids.
+     */
+    if (req.qpair_count == 0) {
+        n_qpairs = 1;
+        ids[0] = req.qid;
+    } else {
+        if (req.qpair_count > SLASH_QDMA_FD_MAX_QPAIRS)
+            return -EINVAL;
+        n_qpairs = req.qpair_count;
+        for (i = 0; i < n_qpairs; i++)
+            ids[i] = req.qpair_ids[i];
+    }
+
+    /* Allocate the per-fd context. */
+    ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+    if (!ctx)
+        return -ENOMEM;
+
+    /* Look up each qpair entry and take refs while holding the lock. */
     mutex_lock(&qdma_dev->lock);
     if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) {
         mutex_unlock(&qdma_dev->lock);
+        kfree(ctx);
         return -ENODEV;
     }
 
-    entry = slash_qdma_qpair_lookup(qdma_dev, req.qid);
-    if (!entry || !entry->dir_mask) {
-        mutex_unlock(&qdma_dev->lock);
-        return -ENOENT;
+    for (i = 0; i < n_qpairs; i++) {
+        struct slash_qdma_qpair_entry *entry =
+            slash_qdma_qpair_lookup(qdma_dev, ids[i]);
+
+        if (!entry || !entry->dir_mask) {
+            /* Drop refs taken so far for the earlier entries. */
+            while (i-- > 0)
+                slash_qdma_qpair_put(ctx->entries[i]);
+            mutex_unlock(&qdma_dev->lock);
+            kfree(ctx);
+            return -ENOENT;
+        }
+
+        /*
+         * Take a ref on the entry.  These refs are held by the file context
+         * and released when the fd is closed, ensuring the entries cannot be
+         * freed prematurely.
+         */
+        slash_qdma_qpair_get(entry);
+        ctx->entries[i] = entry;
+        ctx->qids[i] = ids[i];
     }
+    ctx->n_qpairs = n_qpairs;
 
-    /*
-     * Take a ref on the entry and the device.  These refs are held by
-     * the file context and released when the fd is closed, ensuring
-     * neither the entry nor the device can be freed prematurely.
-     */
-    slash_qdma_qpair_get(entry);
     kref_get(&qdma_dev->ref);
     mutex_unlock(&qdma_dev->lock);
 
-    /* Allocate the per-fd context. */
-    ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
-    if (!ctx) {
-        slash_qdma_qpair_put(entry);
-        kref_put(&qdma_dev->ref, slash_qdma_dev_release);
-        return -ENOMEM;
-    }
-
     ctx->qdma_dev = qdma_dev;
-    ctx->entry = entry;
-    ctx->qid = req.qid;
 
     /* Create the anonymous inode file with read/write access. */
     file = anon_inode_getfile("slash_qdma_qpair", &slash_qdma_qpair_fops,
                               ctx, O_RDWR | (req.flags & O_CLOEXEC));
     if (IS_ERR(file)) {
         err = PTR_ERR(file);
-        slash_qdma_qpair_put(entry);
+        for (i = 0; i < ctx->n_qpairs; i++)
+            slash_qdma_qpair_put(ctx->entries[i]);
         kref_put(&qdma_dev->ref, slash_qdma_dev_release);
         kfree(ctx);
         return err;
     }
 
-    /* Enable seek and positional read/write for device-address control. */
-    file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
-
-
     /* Allocate a file descriptor number. */
     fd = get_unused_fd_flags(req.flags & O_CLOEXEC);
     if (fd < 0) {
diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c
index 07784dc0..904b3e81 100644
--- a/driver/tests/test_slash_qdma.c
+++ b/driver/tests/test_slash_qdma.c
@@ -2,9 +2,10 @@
 /*
  * QDMA control device (/dev/slash_qdma_ctl<N>) ABI tests.
  *
- * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO and the per-qpair
- * anon-inode fd (read/write/lseek/pread/pwrite, multi-fd, wrong-direction,
- * mmap-unsupported, HBM/DDR region round trips).  See
+ * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO, the kernel-owned buffer fd
+ * (BUF_CREATE + mmap), and the per-qpair anon-inode transfer fd
+ * (TRANSFER ioctl, multi-fd, wrong-direction, read/write/lseek/mmap
+ * unsupported, HBM/DDR region round trips).  See
  * docs/reference/kernel-abi/index.rst for the spec.
  */
 
@@ -35,6 +36,61 @@ static void fill_pattern(uint8_t *buf, size_t len)
 		buf[i] = (uint8_t)(i & 0xff);
 }
 
+/*
+ * BUF_CREATE on @ioctl_fd (control or queue-pair fd); @granule/@transfer_hint
+ * are optional outputs.  Returns the new buffer fd (>= 0), or -errno.
+ */
+static int qdma_buf_create(int ioctl_fd, uint64_t length, uint32_t *granule,
+			   uint32_t *transfer_hint)
+{
+	struct slash_qdma_buf_create args;
+	int buf_fd;
+
+	memset(&args, 0, sizeof(args));
+	args.length = length;
+	args.flags = O_CLOEXEC;
+	args.size = sizeof(args);
+
+	buf_fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &args);
+	if (buf_fd < 0)
+		return -errno;
+
+	if (granule != NULL)
+		*granule = args.granule;
+	if (transfer_hint != NULL)
+		*transfer_hint = args.transfer_hint;
+	return buf_fd;
+}
+
+/* Map @length bytes of @buf_fd read/write and shared; MAP_FAILED on error. */
+static void *qdma_buf_map(int buf_fd, uint64_t length)
+{
+	return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0);
+}
+
+/*
+ * Run one sub-transfer (qpair_index 0) of @length bytes on qpair fd @io_fd;
+ * returns the raw ioctl result: bytes transferred, or -1 with errno set.
+ */
+static long qdma_buf_transfer(int io_fd, int buf_fd, uint64_t buf_offset,
+			      uint64_t dev_addr, uint64_t length,
+			      uint32_t direction)
+{
+	struct slash_qdma_transfer xfer;
+
+	memset(&xfer, 0, sizeof(xfer));
+	xfer.xfers[0].length = length;
+	xfer.xfers[0].dev_addr = dev_addr;
+	xfer.xfers[0].buf_offset = buf_offset;
+	xfer.xfers[0].buf_fd = buf_fd;
+	xfer.xfers[0].direction = direction;
+	xfer.xfers[0].qpair_index = 0;
+	xfer.count = 1;
+	xfer.size = sizeof(xfer);
+
+	return ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer);
+}
+
 /* ---------- fixture ---------- */
 
 FIXTURE(qdma)
@@ -126,30 +182,118 @@ TEST_F(qdma, qpair_lifecycle)
 
 TEST_F(qdma, write_read_verify)
 {
-	uint8_t *write_buf, *read_buf;
 	uint64_t dma_addr = get_dma_addr();
-	ssize_t ret;
+	int write_fd, read_fd;
+	uint8_t *write_buf, *read_buf;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
-	write_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, write_buf);
-	read_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, read_buf);
+	write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(write_fd, 0);
+	read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(read_fd, 0);
+
+	write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, write_buf);
+	read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, read_buf);
 
 	fill_pattern(write_buf, TRANSFER_SIZE);
 	memset(read_buf, 0, TRANSFER_SIZE);
 
-	ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)dma_addr);
+	ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_H2C);
 	ASSERT_EQ(TRANSFER_SIZE, ret);
 
-	ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)dma_addr);
+	ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_C2H);
 	ASSERT_EQ(TRANSFER_SIZE, ret);
 
 	EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE));
 
-	free(write_buf);
-	free(read_buf);
+	munmap(write_buf, TRANSFER_SIZE);
+	munmap(read_buf, TRANSFER_SIZE);
+	close(write_fd);
+	close(read_fd);
+}
+
+/* ---------- buffer fd behaviour ---------- */
+
+TEST_F(qdma, buf_create_zero_length_returns_einval)
+{
+	EXPECT_EQ(-EINVAL, qdma_buf_create(self->ctl_fd, 0, NULL, NULL));
+}
+
+TEST_F(qdma, buf_create_unaligned_length_returns_einval)
+{
+	/* Length must be a multiple of the page size. */
+	EXPECT_EQ(-EINVAL,
+		  qdma_buf_create(self->ctl_fd, TRANSFER_SIZE + 1, NULL, NULL));
+}
+
+TEST_F(qdma, buf_create_reports_granule_and_hint)
+{
+	uint32_t granule = 0;
+	uint32_t hint = 0;
+	int buf_fd;
+
+	buf_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, &granule, &hint);
+	ASSERT_GE(buf_fd, 0);
+	EXPECT_EQ(4096u, granule);
+	EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, hint);
+	close(buf_fd);
+}
+
+TEST_F(qdma, buf_create_via_qpair_fd)
+{
+	int buf_fd;
+	uint8_t *map;
+	long ret;
+	uint64_t dma_addr = get_dma_addr();
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	/* A queue-pair fd (e.g. received via SCM_RIGHTS) can create buffers too. */
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
+
+	map = qdma_buf_map(buf_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, map);
+	fill_pattern(map, TRANSFER_SIZE);
+
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr, TRANSFER_SIZE,
+				SLASH_QDMA_XFER_H2C);
+	ASSERT_EQ(TRANSFER_SIZE, ret);
+
+	munmap(map, TRANSFER_SIZE);
+	close(buf_fd);
+}
+
+TEST_F(qdma, buf_fd_mapping_outlives_fd_close)
+{
+	int buf_fd;
+	uint8_t *map;
+	size_t i;
+	size_t mismatches = 0;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
+	map = qdma_buf_map(buf_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, map);
+
+	/* Closing the fd must not invalidate an existing mapping. */
+	close(buf_fd);
+
+	fill_pattern(map, TRANSFER_SIZE);
+	/* Every byte must read back as written, not just the (zero) first one. */
+	for (i = 0; i < TRANSFER_SIZE; i++)
+		mismatches += (map[i] != (uint8_t)(i & 0xff));
+	EXPECT_EQ(0u, mismatches);
+
+	munmap(map, TRANSFER_SIZE);
 }
 
 /* ---------- error paths ---------- */
@@ -271,62 +415,56 @@ TEST_F(qdma, qpair_get_fd_unknown_qid)
 
 TEST_F(qdma, io_read_on_h2c_only_returns_enodev)
 {
-	uint8_t *buf;
-	ssize_t ret;
+	int buf_fd;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x1); /* H2C only */
 
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
 
-	ret = pread(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_C2H);
 	EXPECT_EQ(-1, ret);
 	EXPECT_EQ(ENODEV, errno);
 
-	free(buf);
+	close(buf_fd);
 }
 
 TEST_F(qdma, io_write_on_c2h_only_returns_enodev)
 {
-	uint8_t *buf;
-	ssize_t ret;
+	int buf_fd;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x2); /* C2H only */
 
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
 
-	ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_H2C);
 	EXPECT_EQ(-1, ret);
 	EXPECT_EQ(ENODEV, errno);
 
-	free(buf);
+	close(buf_fd);
 }
 
-/*
- * TODO: spec at docs/reference/kernel-abi/index.rst:417 documents zero-length
- * transfers as returning -EINVAL, but the kernel's map_user_buf_to_sgl path
- * (slash_qdma.c:2033-2034) explicitly patches around the len==0 case
- * (`if (len == 0) pages_nr = 1;`), making the -EINVAL branch unreachable.
- * The observed behaviour is ret == 0.  Desired behaviour is under
- * investigation — keep this test as-is so the discrepancy is visible.
- */
 TEST_F(qdma, io_zero_length_returns_einval)
 {
-	SKIP(return, "Test is disabled since the desired behavior is under investigation");
-	uint8_t *buf;
-	ssize_t ret;
+	int buf_fd;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
 
-	ret = pwrite(self->io_fd, buf, 0, (off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE,
+				0, SLASH_QDMA_XFER_H2C);
 	EXPECT_EQ(-1, ret);
 	EXPECT_EQ(EINVAL, errno);
 
-	free(buf);
+	close(buf_fd);
 }
 
 TEST_F(qdma, io_mmap_unsupported)
@@ -335,17 +473,17 @@ TEST_F(qdma, io_mmap_unsupported)
 
 	bring_up_qpair(_metadata, self, 0x3);
 
+	/* The transfer (queue-pair) fd is not mappable — only buffer fds are. */
 	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, self->io_fd, 0);
 	EXPECT_EQ(MAP_FAILED, p);
 	if (p != MAP_FAILED)
 		munmap(p, 4096);
 }
 
-TEST_F(qdma, io_ioctl_returns_enotty)
+TEST_F(qdma, io_junk_ioctl_returns_enotty)
 {
-	/* The per-qpair anon_inode fd defines no ioctls; the handler
-	 * returns -ENOTTY for any cmd. Exercising this path keeps the stub
-	 * formally covered. */
+	/* The per-qpair fd defines only BUF_CREATE / TRANSFER; any other cmd
+	 * returns -ENOTTY. */
 	unsigned int junk = _IO('v', 0xFE);
 
 	bring_up_qpair(_metadata, self, 0x3);
@@ -354,138 +492,99 @@ TEST_F(qdma, io_ioctl_returns_enotty)
 	EXPECT_EQ(ENOTTY, errno);
 }
 
-TEST_F(qdma, io_lseek_set_cur_end)
+TEST_F(qdma, io_lseek_unsupported)
 {
 	off_t pos;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
 	pos = lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET);
-	EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos);
-
-	pos = lseek(self->io_fd, 0, SEEK_CUR);
-	EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos);
-
-	pos = lseek(self->io_fd, 4096, SEEK_CUR);
-	EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + 4096), pos);
-
-	/*
-	 * SEEK_END semantics are driver-defined for this anon-inode; the
-	 * contract is "doesn't error", not any specific value.
-	 */
-	pos = lseek(self->io_fd, 0, SEEK_END);
-	EXPECT_NE((off_t)-1, pos);
+	EXPECT_EQ((off_t)-1, pos);
+	EXPECT_EQ(ESPIPE, errno);
 }
 
-TEST_F(qdma, io_write_advances_file_position)
+TEST_F(qdma, io_read_write_unsupported)
 {
-	uint8_t *buf;
-	off_t pos;
-	ssize_t ret;
+	uint8_t buf[TRANSFER_SIZE];
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
-	fill_pattern(buf, TRANSFER_SIZE);
-
-	ASSERT_EQ((off_t)SLASH_TEST_HBM_BASE,
-			  lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET));
-
 	ret = write(self->io_fd, buf, TRANSFER_SIZE);
-	ASSERT_EQ(TRANSFER_SIZE, ret);
-
-	pos = lseek(self->io_fd, 0, SEEK_CUR);
-	EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + TRANSFER_SIZE), pos);
-
-	free(buf);
-}
-
-TEST_F(qdma, io_pwrite_does_not_advance_file_position)
-{
-	uint8_t *buf;
-	off_t pos;
-	ssize_t ret;
-
-	bring_up_qpair(_metadata, self, 0x3);
-
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
-	fill_pattern(buf, TRANSFER_SIZE);
-
-	ASSERT_EQ((off_t)0, lseek(self->io_fd, 0, SEEK_SET));
-
-	ret = pwrite(self->io_fd, buf, TRANSFER_SIZE,
-				 (off_t)SLASH_TEST_HBM_BASE);
-	ASSERT_EQ(TRANSFER_SIZE, ret);
-
-	/* p* variants must not advance the file position. */
-	pos = lseek(self->io_fd, 0, SEEK_CUR);
-	EXPECT_EQ((off_t)0, pos);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
 
-	free(buf);
+	ret = read(self->io_fd, buf, TRANSFER_SIZE);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
 }
 
 TEST_F(qdma, io_multiple_fds_same_qpair)
 {
+	int write_fd, read_fd, io_fd_b;
 	uint8_t *write_buf, *read_buf;
-	int io_fd_b;
-	ssize_t ret;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
 	io_fd_b = slash_qpair_get_fd(self->ctl_fd, self->qid, O_CLOEXEC);
 	ASSERT_GE(io_fd_b, 0);
 
-	write_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, write_buf);
-	read_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, read_buf);
+	write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(write_fd, 0);
+	read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(read_fd, 0);
+
+	write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, write_buf);
+	read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, read_buf);
 
 	fill_pattern(write_buf, TRANSFER_SIZE);
 	memset(read_buf, 0, TRANSFER_SIZE);
 
-	ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE,
-				 (off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(self->io_fd, write_fd, 0, SLASH_TEST_HBM_BASE,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_H2C);
 	ASSERT_EQ(TRANSFER_SIZE, ret);
 
-	ret = pread(io_fd_b, read_buf, TRANSFER_SIZE,
-				(off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(io_fd_b, read_fd, 0, SLASH_TEST_HBM_BASE,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_C2H);
 	ASSERT_EQ(TRANSFER_SIZE, ret);
 
 	EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE));
 
+	munmap(write_buf, TRANSFER_SIZE);
+	munmap(read_buf, TRANSFER_SIZE);
+	close(write_fd);
+	close(read_fd);
 	close(io_fd_b);
-	free(write_buf);
-	free(read_buf);
 }
 
 TEST_F(qdma, io_fd_outlives_qpair_del)
 {
-	uint8_t *buf;
-	ssize_t ret;
+	int buf_fd;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
+
 	/* DEL the qpair while io_fd is still open. */
 	ASSERT_EQ(0, slash_qpair_op(self->ctl_fd, self->qid,
 								SLASH_QDMA_QUEUE_OP_DEL));
 	self->qpair_added = 0;
 	self->qpair_started = 0;
 
-	buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, buf);
-
 	/*
-	 * fd is still valid but the qpair's HW queues are gone.  The spec
-	 * (index.rst:613-616) does not name a specific errno, so we only
-	 * assert the call fails — not which errno it returns.
+	 * fd is still valid but the qpair's HW queues are gone.  The spec does
+	 * not name a specific errno, so we only assert the call fails.
 	 */
-	ret = pwrite(self->io_fd, buf, TRANSFER_SIZE,
-				 (off_t)SLASH_TEST_HBM_BASE);
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_H2C);
 	EXPECT_EQ(-1, ret);
 
-	free(buf);
+	close(buf_fd);
 	/* close(io_fd) happens in fixture teardown — must not crash. */
 }
 
@@ -494,31 +593,41 @@ TEST_F(qdma, io_fd_outlives_qpair_del)
 static void region_round_trip(struct __test_metadata *_metadata,
 							  FIXTURE_DATA(qdma) * self, uint64_t base)
 {
+	int write_fd, read_fd;
 	uint8_t *write_buf, *read_buf;
-	ssize_t ret;
+	long ret;
 
 	bring_up_qpair(_metadata, self, 0x3);
 
-	write_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, write_buf);
-	read_buf = aligned_alloc(4096, TRANSFER_SIZE);
-	ASSERT_NE(NULL, read_buf);
+	write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(write_fd, 0);
+	read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(read_fd, 0);
+
+	write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, write_buf);
+	read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE);
+	ASSERT_NE(MAP_FAILED, read_buf);
 
 	fill_pattern(write_buf, TRANSFER_SIZE);
 	memset(read_buf, 0, TRANSFER_SIZE);
 
-	ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)base);
+	ret = qdma_buf_transfer(self->io_fd, write_fd, 0, base,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_H2C);
 	ASSERT_EQ(TRANSFER_SIZE, ret)
-	TH_LOG("pwrite to 0x%llx failed: %s",
+	TH_LOG("H2C transfer to 0x%llx failed: %s",
 		   (unsigned long long)base, strerror(errno));
 
-	ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)base);
+	ret = qdma_buf_transfer(self->io_fd, read_fd, 0, base,
+				TRANSFER_SIZE, SLASH_QDMA_XFER_C2H);
 	ASSERT_EQ(TRANSFER_SIZE, ret);
 
 	EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE));
 
-	free(write_buf);
-	free(read_buf);
+	munmap(write_buf, TRANSFER_SIZE);
+	munmap(read_buf, TRANSFER_SIZE);
+	close(write_fd);
+	close(read_fd);
 }
 
 TEST_F(qdma, transfer_hbm)
@@ -707,4 +816,146 @@ TEST_F(qdma, qpair_get_fd_oversized_struct_zeros_tail)
 	free(buf);
 }
 
+TEST_F(qdma, reject_partial_4k_transfer)
+{
+	int buf_fd;
+	uint64_t dma_addr = get_dma_addr();
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
+
+	/* A sub-page length is not a multiple of the buffer granule. */
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr,
+				TRANSFER_SIZE / 2, SLASH_QDMA_XFER_H2C);
+	ASSERT_EQ(-1, ret);
+	ASSERT_EQ(EINVAL, errno);
+
+	close(buf_fd);
+}
+
+TEST_F(qdma, multipage_4k_write_read_verify)
+{
+	const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages, one request */
+	int write_fd, read_fd;
+	uint8_t *write_buf, *read_buf;
+	uint64_t dma_addr = get_dma_addr();
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	write_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL);
+	ASSERT_GE(write_fd, 0);
+	read_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL);
+	ASSERT_GE(read_fd, 0);
+
+	write_buf = qdma_buf_map(write_fd, xfer_size);
+	ASSERT_NE(MAP_FAILED, write_buf);
+	read_buf = qdma_buf_map(read_fd, xfer_size);
+	ASSERT_NE(MAP_FAILED, read_buf);
+
+	fill_pattern(write_buf, xfer_size);
+	memset(read_buf, 0, xfer_size);
+
+	ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr, xfer_size,
+				SLASH_QDMA_XFER_H2C);
+	ASSERT_EQ((ssize_t)xfer_size, ret);
+
+	ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr, xfer_size,
+				SLASH_QDMA_XFER_C2H);
+	ASSERT_EQ((ssize_t)xfer_size, ret);
+
+	EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size));
+
+	munmap(write_buf, xfer_size);
+	munmap(read_buf, xfer_size);
+	close(write_fd);
+	close(read_fd);
+}
+
+/* ---------- transfer error paths ---------- */
+
+TEST_F(qdma, transfer_size_below_input_min_returns_einval)
+{
+	struct slash_qdma_transfer req;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	memset(&req, 0, sizeof(req));
+	req.size = sizeof(__u32); /* below the trailing input field */
+	EXPECT_EQ(-1, ioctl(self->io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req));
+	EXPECT_EQ(EINVAL, errno);
+}
+
+TEST_F(qdma, transfer_invalid_buf_fd_returns_einval)
+{
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	/* The control fd is a valid fd but not a buffer fd. */
+	ret = qdma_buf_transfer(self->io_fd, self->ctl_fd, 0,
+				get_dma_addr(), TRANSFER_SIZE,
+				SLASH_QDMA_XFER_H2C);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+}
+
+TEST_F(qdma, transfer_bad_fd_returns_ebadf)
+{
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	ret = qdma_buf_transfer(self->io_fd, -1, 0,
+				get_dma_addr(), TRANSFER_SIZE,
+				SLASH_QDMA_XFER_H2C);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EBADF, errno);
+}
+
+TEST_F(qdma, transfer_wrong_direction_returns_enodev)
+{
+	int buf_fd;
+	uint32_t transfer_hint = 0;
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x1); /* H2C only */
+
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, &transfer_hint);
+	ASSERT_GE(buf_fd, 0);
+	EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint);
+
+	/* C2H is not enabled on this qpair. */
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, 0,
+				get_dma_addr(), TRANSFER_SIZE,
+				SLASH_QDMA_XFER_C2H);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ENODEV, errno);
+
+	close(buf_fd);
+}
+
+TEST_F(qdma, transfer_out_of_range_returns_einval)
+{
+	int buf_fd;
+	long ret;
+
+	bring_up_qpair(_metadata, self, 0x3);
+
+	buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL);
+	ASSERT_GE(buf_fd, 0);
+
+	/* Slice extends past the buffer length. */
+	ret = qdma_buf_transfer(self->io_fd, buf_fd, TRANSFER_SIZE,
+				get_dma_addr(), TRANSFER_SIZE,
+				SLASH_QDMA_XFER_H2C);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	close(buf_fd);
+}
+
 TEST_HARNESS_MAIN
diff --git a/packaging/debian/control b/packaging/debian/control
index ce02f2dd..09534d96 100644
--- a/packaging/debian/control
+++ b/packaging/debian/control
@@ -48,7 +48,7 @@ Description: SLASH/VRT System for simulation and emulation
 
 Package: slash-dkms
 Architecture: all
-Depends: dkms, gcc, make, ${misc:Depends}
+Depends: dkms, gcc, make, patch, ${misc:Depends}
 Provides: slash-kernel-module
 Description: SLASH kernel module (DKMS)
 
@@ -89,7 +89,7 @@ Description: VRT Runtime (development files)
 
 Package: v80-smi
 Architecture: any
-Depends: libvrt (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends}
+Depends: libvrt (= ${binary:Version}), libslash (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends}
 Description: V80 System Management Interface
 
 Package: slashkit
diff --git a/packaging/debian/slash-dkms.install b/packaging/debian/slash-dkms.install
index d0496a33..c377f69f 100644
--- a/packaging/debian/slash-dkms.install
+++ b/packaging/debian/slash-dkms.install
@@ -22,6 +22,7 @@ driver/*.c            usr/src/slash-@VERSION@/driver/
 driver/*.h            usr/src/slash-@VERSION@/driver/
 driver/Makefile       usr/src/slash-@VERSION@/driver/
 driver/kcompat        usr/src/slash-@VERSION@/driver/
+driver/patches        usr/src/slash-@VERSION@/driver/
 driver/libslash/include/slash/uapi  usr/src/slash-@VERSION@/driver/libslash/include/slash/
 
 submodules/qdma_drv/QDMA/linux-kernel/driver/libqdma/   usr/src/slash-@VERSION@/driver/
diff --git a/packaging/rpm/slash.spec b/packaging/rpm/slash.spec
index a18ccd59..3568a859 100644
--- a/packaging/rpm/slash.spec
+++ b/packaging/rpm/slash.spec
@@ -90,7 +90,7 @@ SLASH/VRT System for simulation and emulation (development files)
 
 %package -n     slash-dkms
 Summary:        SLASH kernel module (DKMS)
-Requires:       dkms, gcc, make
+Requires:       dkms, gcc, make, patch
 BuildArch:      noarch
 
 %description -n slash-dkms
@@ -157,6 +157,7 @@ VRT Runtime (development files)
 %package -n     v80-smi
 Summary:        V80 System Management Interface
 Requires:       libvrt = %{version}-%{release}
+Requires:       libslash = %{version}-%{release}
 
 %description -n v80-smi
 V80 System Management Interface
@@ -211,6 +212,8 @@ install -m 0644 driver/Makefile %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_versi
 
 cp -a driver/kcompat %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/
 
+cp -a driver/patches %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/
+
 cp -a driver/libslash/include/slash/uapi \
     %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/libslash/include/slash/
 
diff --git a/scripts/package-ami.sh b/scripts/package-ami.sh
index f12cc6f6..66ff2833 100755
--- a/scripts/package-ami.sh
+++ b/scripts/package-ami.sh
@@ -33,14 +33,23 @@ ARTIFACTS_DIR="${ARTIFACTS_DIR:-$(pwd)/ami}"
 AMI_BUILD_DIR="$(pwd)/ami-build"
 AVED_DIR="$(pwd)/submodules/AVED"
 AMI_DIR="${AVED_DIR}/sw/AMI"
-PKG_PY="${AMI_DIR}/scripts/package_data/pkg.py"
-GEN_PKG_PY="${AMI_DIR}/scripts/gen_package.py"
+AMI_SRC_DIR="${AMI_BUILD_DIR}/src/AMI"
+AMI_OUTPUT_DIR="${AMI_BUILD_DIR}/pkg"
+PKG_PY="${AMI_SRC_DIR}/scripts/package_data/pkg.py"
+GEN_PKG_PY="${AMI_SRC_DIR}/scripts/gen_package.py"
 
 rm -rf "${AMI_BUILD_DIR}"
 mkdir -p "${ARTIFACTS_DIR}"
+mkdir -p "$(dirname "${AMI_SRC_DIR}")"
+cp -a "${AMI_DIR}" "${AMI_SRC_DIR}"
 
-# Restore submodule files and clean up build directory on exit
-trap 'git -C "${AVED_DIR}" checkout -- sw/AMI/scripts/package_data/pkg.py sw/AMI/scripts/gen_package.py; rm -rf "${AMI_BUILD_DIR}"' EXIT
+# Clean up build directory on exit. Packaging patches a disposable AMI copy so
+# this also works from source trees copied without usable submodule gitdirs.
+trap 'rm -rf "${AMI_BUILD_DIR}"' EXIT
+
+# Avoid stale generated headers from copied build trees. gen_package.py will
+# otherwise prefer api/build/ami_version.h over the checked-in version header.
+rm -f "${AMI_SRC_DIR}/api/build/ami_version.h"
 
 # Patch in Rocky Linux support (RHEL-compatible, RPM-based)
 sed -i "/^DIST_ID_RHEL /a DIST_ID_ROCKY   = 'rocky'" "${PKG_PY}"
@@ -48,13 +57,17 @@ sed -i "/^    DIST_ID_RHEL,$/a\\    DIST_ID_ROCKY," "${PKG_PY}"
 sed -i "s/DIST_RPM = \[DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL\]/DIST_RPM = [DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${PKG_PY}"
 sed -i "s/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL\]/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${GEN_PKG_PY}"
 
-cd "${AMI_DIR}"
+cd "${AMI_SRC_DIR}"
 # --no_driver skips a pre-flight driver compilation check (build+clean) only;
 # it does NOT affect which files are included in the package.
 # We skip it here so the packaging can run in environments (eg. containers)
 # that may not have linux-headers available to compile the driver.
-python3 scripts/gen_package.py --no_driver -o "${AMI_BUILD_DIR}"
+#
+# --no_gen_version skips AVED's git-based version regeneration. This wrapper is
+# often run from copied worktrees where the submodule .git file points back to a
+# non-existent source checkout, causing an empty hash and an invalid RPM Release.
+python3 scripts/gen_package.py --no_driver --no_gen_version -o "${AMI_OUTPUT_DIR}"
 
 # Copy only the package files to the artifacts directory
-cp "${AMI_BUILD_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \
-cp "${AMI_BUILD_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true
+cp "${AMI_OUTPUT_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \
+cp "${AMI_OUTPUT_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true
diff --git a/scripts/test-fresh-install.sh b/scripts/test-fresh-install.sh
index 247d6b2b..cd20e363 100755
--- a/scripts/test-fresh-install.sh
+++ b/scripts/test-fresh-install.sh
@@ -197,7 +197,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then
 
     if [[ ${#INSTALLED[@]} -gt 0 ]]; then
         echo "Removing: ${INSTALLED[*]}"
-        dnf remove -y "${INSTALLED[@]}"
+        dnf remove -y --setopt='*.skip_if_unavailable=True' "${INSTALLED[@]}"
     else
         echo "No SLASH packages currently installed."
     fi
@@ -226,7 +226,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then
     # Exclude source, debuginfo, and debugsource RPMs
     mapfile -t RPMS < <(find "${ARTIFACTS_DIR}" -maxdepth 1 -name '*.rpm' \
         ! -name '*.src.rpm' ! -name '*-debuginfo-*' ! -name '*-debugsource-*')
-    dnf install -y "${RPMS[@]}"
+    dnf install -y  --setopt='*.skip_if_unavailable=True' "${RPMS[@]}"
 fi
 
 # =========================================================================
diff --git a/smi/CMakeLists.txt b/smi/CMakeLists.txt
index 58cf9771..46ae8c16 100644
--- a/smi/CMakeLists.txt
+++ b/smi/CMakeLists.txt
@@ -40,6 +40,7 @@ project(
 )
 
 option(SMI_INCLUDE_VRT "Include vrtd as subdirectory instead of building from system" OFF)
+option(SMI_ENABLE_QDMA_DRIVER_BACKEND "Build validate --use-qdma-driver backend" ON)
 
 include(GNUInstallDirs)
 
@@ -55,6 +56,10 @@ if(NOT TARGET vrt::vrt)
     "Build and install vrt first (cmake --install), then configure smi again.")
 endif()
 
+if(NOT TARGET slash::slash)
+  find_package(slash REQUIRED CONFIG)
+endif()
+
 find_package(CLI11 CONFIG REQUIRED)
 
 configure_file(
diff --git a/smi/README.md b/smi/README.md
index d528ed92..a7fef4fe 100644
--- a/smi/README.md
+++ b/smi/README.md
@@ -178,43 +178,119 @@ programmed with the static SLASH design.
 
 ### validate
 
-Reset a board, then test HBM and DDR memory for data integrity and
-bandwidth.
+Optionally reset a board, then test HBM and DDR memory for data integrity and
+bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer
+path for data movement.
 
 ```
-v80-smi validate -d <BDF> [-j <threads>]
+v80-smi validate -d <BDF> [-j <threads>] [-R] [--mm-channel <spec>] [--buffer-size <size>] [--offset <size>] [--starting-offset <size>] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation <auto|paired>] [--channel-region-stride <size>] [--ring-size-index <0-15>] [--bandwidth-iterations <N>] [--bandwidth-duration <seconds>]
 ```
 
 | Flag              | Description                                          |
 |-------------------|------------------------------------------------------|
 | `-d,--device`     | Board address (required), e.g. `03:00` or `0000:03:00` |
-| `-j,--threads`    | Parallel buffers/threads, 1-64 (default 8)           |
-
-Each buffer is 64 MB.  The integrity test writes a pattern, syncs to
-device, clears host memory, syncs back, and verifies.  The bandwidth
-test runs parallel H2C writes and C2H reads.
+| `-j,--threads`    | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 x <threads>` logical positions in each enabled memory space. |
+| `-R,--no-reset`   | Skip the device reset step before running memory tests |
+| `--mm-channel`    | AXI-MM/NoC channel per buffer queue: `auto` (default; driver stripes by `qid&1`), `0`, `1`, or a comma-separated list with exactly one entry per buffer position (`2 x --threads` entries, e.g. `-j 1` -> `0,1`). The list is not repeated to fill missing positions; a list of the wrong length is an error. Independent of `--channel-allocation`; also honored by `--use-qdma-driver`. |
+| `--buffer-size`   | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) |
+| `--offset`        | Distance between logical buffer positions (default `512M`) |
+| `--starting-offset` | Offset from each memory-space base for logical position 0 (default `0`) |
+| `--raw-transfer-test` | Use libslash raw QDMA transfers instead of VRTD buffers; implies `--no-reset` |
+| `--use-qdma-driver` | Run the raw transfer test over the off-the-shelf Xilinx QDMA driver instead of SLASH; implies `--no-reset`; mutually exclusive with `--raw-transfer-test` |
+| `--ddr-only`      | Run only DDR memory tests (skip HBM); mutually exclusive with `--hbm-only` |
+| `--hbm-only`      | Run only HBM memory tests (skip DDR); mutually exclusive with `--ddr-only` |
+| `--channel-allocation` | Raw-transfer-only placement: `auto` (default; mm-channel `qid&1`, linear addressing) or `paired` (couple mm-channel to a distinct memory region/NSU: even positions -> region 0/channel 0, odd -> region 1/channel 1). `paired` mirrors dma-perf `offset_ch0`/`offset_ch1` so both NoC NMUs drive independent memory endpoints. |
+| `--channel-region-stride` | In `--channel-allocation paired`, byte distance between the two per-channel regions (NSU stride). Default `16G` (half of each memory's 32G address space); accepts `k`/`K`/`m`/`M`/`g`/`G`. |
+| `--ring-size-index` | Raw-transfer-only descriptor-ring size index, `0`-`15`. Overrides the backend default when creating SLASH raw qpairs or starting stock-driver queues. |
+| `--bandwidth-iterations` | Raw-transfer-only sustained bandwidth mode: repeat each whole-buffer transfer this many times in each bandwidth phase (default `1`). |
+| `--bandwidth-duration` | Raw-transfer-only duration mode: repeat whole-buffer transfers until this many seconds have elapsed; `0` disables duration mode and uses `--bandwidth-iterations`. |
+
+Each buffer defaults to 512 MB (one HBM/DDR allocator region).  The integrity
+test writes a pattern, syncs to device, clears host memory, syncs back, and
+verifies.  Each bandwidth phase reports single-direction C2H reads,
+single-direction H2C writes, and simultaneous bidirectional throughput (read,
+write, and total).  After the per-memory phases, a final parallel phase
+drives HBM and DDR together using `2 x <threads>` threads for
+single-direction tests and `4 x <threads>` threads for bidirectional tests;
+it is skipped when `--ddr-only` or `--hbm-only` is given.  With
+`--raw-transfer-test`, the command bypasses VRTD for transfers and opens the
+board's SLASH QDMA device directly, so the SLASH QDMA driver node must be
+present.
+
+Buffers are placed at `memory_base + starting-offset + position * offset`.
+The position sequence is `0..N-1` for single-direction phases and `0..2N-1`
+for bidirectional phases (reads on even positions, writes on odd positions),
+where `N` is `--threads`.  `--buffer-size`, `--offset`, and
+`--starting-offset` must be 4 KiB-aligned, `--offset` must be at least
+`--buffer-size`, and the highest buffer must fit within the 64 x 512 MB
+DDR/HBM address space.  If any placement option is specified in default VRTD
+mode, `validate` uses raw VRTD buffers so the exact addresses are honored;
+this requires raw memory access permission.
+
+The largest phase maps up to `4 x <threads> x <buffer-size>` of host buffers
+when HBM and DDR are both enabled, or `2 x <threads> x <buffer-size>` with
+`--ddr-only` or `--hbm-only`; `validate` fails early if that footprint exceeds
+currently available host memory.
+
+Raw transfer modes can repeat the bandwidth phases without changing buffer
+placement. `--bandwidth-iterations` repeats each whole-buffer
+transfer a fixed number of times, while `--bandwidth-duration` runs each
+bandwidth phase for a wall-clock duration and counts completed whole-buffer
+transfers. Integrity checks remain one-shot.
+`--ring-size-index` can override the QDMA descriptor-ring size index for these
+raw modes; useful A/B values for 4 KiB descriptor throughput are `0`, `11`,
+`13`, and `15`.
+
+With `--use-qdma-driver`, the command runs the same raw test over the
+off-the-shelf Xilinx QDMA driver (`submodules/qdma_drv`) instead of SLASH.
+smi provisions the queues itself: it raises the function's `qmax` via sysfs
+if needed, creates and starts bidirectional AXI-MM queue pairs over generic
+netlink (the same `xnl_pf` interface `dma-ctl` uses), then transfers over the
+per-queue char devices `/dev/qdma<idx>-MM-<qid>`.  This requires the stock
+`qdma-pf` driver to be bound to the board's PF (it cannot be bound at the same
+time as the SLASH driver), and typically needs root to raise `qmax` and open
+the queue devices.  The device memory addresses tested (HBM/DDR) are the same
+AXI addresses used by the SLASH path.
+
+Requirements depend on the selected mode: the default path needs VRTD and root
+for reset unless `--no-reset` is used; `--raw-transfer-test` needs the SLASH
+QDMA driver node; `--use-qdma-driver` needs a build with
+`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON` and the stock QDMA driver bound to the
+board.
 
 ```console
 $ v80-smi validate -d 03:00
 Resetting device 0000:03:00...
 Testing HBM data integrity (8 regions)...
-    HBM0: OK
-    HBM1: OK
-    ...
-Testing HBM bandwidth (8 threads)...
+    8/8 OK
+Testing HBM read bandwidth (8 threads)...
+    Read: 9547.22 MB/s
+Testing HBM write bandwidth (8 threads)...
     Write: 9832.10 MB/s
-    Read:  9547.22 MB/s
+Testing HBM bidirectional bandwidth (16 threads)...
+    Read:  9210.15 MB/s
+    Write: 9475.81 MB/s
+    Total: 18685.96 MB/s
 Testing DDR data integrity (8 buffers)...
-    DDR0: OK
-    DDR1: OK
-    ...
-Testing DDR bandwidth (8 threads)...
+    8/8 OK
+Testing DDR read bandwidth (8 threads)...
+    Read: 4980.33 MB/s
+Testing DDR write bandwidth (8 threads)...
     Write: 5120.45 MB/s
-    Read:  4980.33 MB/s
+Testing DDR bidirectional bandwidth (16 threads)...
+    Read:  4860.12 MB/s
+    Write: 5012.34 MB/s
+    Total: 9872.46 MB/s
+Testing HBM+DDR read bandwidth (16 threads)...
+    Read: 11890.55 MB/s
+Testing HBM+DDR write bandwidth (16 threads)...
+    Write: 12450.78 MB/s
+Testing HBM+DDR bidirectional bandwidth (32 threads)...
+    Read:  11340.12 MB/s
+    Write: 12020.34 MB/s
+    Total: 23360.46 MB/s
 ```
 
-Requires root access and a running VRTD daemon.
-
 ### debug bar-poke
 
 Perform low-level BAR reads or writes for troubleshooting.
@@ -364,6 +440,8 @@ since v80-smi always operates at board granularity.
 |------------|--------------------------------------------------|
 | libvrt     | VRT runtime library (device, kernel, vrtbin APIs) |
 | vrtd       | Runtime daemon (sensors, reset, validate, query)  |
+| libslash   | Raw SLASH QDMA backend for `validate --raw-transfer-test` |
+| qdma_nl.h  | Optional stock QDMA-driver backend (`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON`) |
 
 ## Project layout
 
@@ -376,6 +454,8 @@ smi/
     program.cpp/hpp   Device programming
     reset.cpp/hpp     Hardware reset via VRTD
     validate.cpp/hpp  Memory integrity and bandwidth testing
+    raw_transfer.hpp  Shared raw QDMA host mapping and transfer helpers
+    qdma_driver_backend.cpp/hpp  Optional stock QDMA-driver validate backend
     debug/bar_poke.cpp/hpp  BAR read/write debug command
     debug/mem_poke.cpp/hpp  Raw device memory read/write command
     debug/clockwiz.cpp/hpp  Clock read/set debug command
diff --git a/smi/src/CMakeLists.txt b/smi/src/CMakeLists.txt
index 30e509aa..ad0c721a 100644
--- a/smi/src/CMakeLists.txt
+++ b/smi/src/CMakeLists.txt
@@ -32,6 +32,21 @@ add_executable(
     smi.cpp
 )
 
+if(SMI_ENABLE_QDMA_DRIVER_BACKEND)
+    target_sources(v80-smi PRIVATE qdma_driver_backend.cpp)
+
+    # Off-the-shelf Xilinx QDMA driver netlink UAPI header (qdma_nl.h), used by
+    # the --use-qdma-driver validate backend.
+    set(QDMA_DRV_APPS_INCLUDE
+        "${CMAKE_CURRENT_SOURCE_DIR}/../../submodules/qdma_drv/QDMA/linux-kernel/apps/include")
+    if(NOT EXISTS "${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h")
+      message(FATAL_ERROR
+        "Missing ${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h. "
+        "Initialize submodules (git submodule update --init submodules/qdma_drv), "
+        "or configure with -DSMI_ENABLE_QDMA_DRIVER_BACKEND=OFF.")
+    endif()
+endif()
+
 target_compile_features(v80-smi PRIVATE cxx_std_20)
 
 target_include_directories(
@@ -43,11 +58,17 @@ target_include_directories(
     ${CMAKE_CURRENT_BINARY_DIR}/../generated # For version.hpp
 )
 
+if(SMI_ENABLE_QDMA_DRIVER_BACKEND)
+    target_include_directories(v80-smi PRIVATE ${QDMA_DRV_APPS_INCLUDE})
+    target_compile_definitions(v80-smi PRIVATE SMI_ENABLE_QDMA_DRIVER_BACKEND=1)
+endif()
+
 target_link_libraries(
     v80-smi
 
     PRIVATE
 
     vrt::vrt
+    slash::slash
     CLI11::CLI11
 )
diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp
new file mode 100644
index 00000000..da47d6aa
--- /dev/null
+++ b/smi/src/qdma_driver_backend.cpp
@@ -0,0 +1,550 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/// @file qdma_driver_backend.cpp
+/// @brief Implementation of the off-the-shelf QDMA-driver raw-transfer backend.
+
+#include "qdma_driver_backend.hpp"
+
+#include <array>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+
+// qdma_nl.h defines unused file-scope static lookup arrays (xnl_attr_str /
+// xnl_op_str); silence the resulting -Wunused warnings without touching the
+// vendored upstream header.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-const-variable"
+extern "C" {
+#include <qdma_nl.h>
+}
+#pragma GCC diagnostic pop
+
+#include "bdf.hpp"
+
+namespace smi::qdma_driver {
+
+namespace {
+
+/// Generous receive buffer: the device list dump grows with the number of
+/// queues/functions, so keep this comfortably larger than XNL_RESP_BUFLEN_MAX.
+constexpr size_t RESP_BUF_LEN = 256 * 1024;
+
+[[noreturn]] void throwSystemError(const std::string& message) { // appends strerror(errno)
+    throw std::runtime_error(message + ": " + std::strerror(errno));
+}
+
+} // namespace
+
+/// Minimal generic-netlink client for the QDMA driver's "xnl_pf" family.
+///
+/// This is a focused port of the netlink plumbing in the upstream `dma-ctl`
+/// utility (QDMA/linux-kernel/apps/dma-utils/dmactl.c): resolve the family id,
+/// send a command carrying a handful of u32 attributes, and parse the reply's
+/// attributes / generic message text.
+class XnlClient {
+public:
+    XnlClient() {
+        fd_ = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+        if (fd_ < 0) {
+            throwSystemError("Failed to open QDMA netlink socket");
+        }
+        try {
+            struct sockaddr_nl addr{};
+            addr.nl_family = AF_NETLINK;
+            if (bind(fd_, reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr)) < 0) {
+                throwSystemError("Failed to bind QDMA netlink socket");
+            }
+            // Don't block forever if the driver isn't present / doesn't answer.
+            struct timeval tv{};
+            tv.tv_sec = 5;
+            (void)setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
+
+            family_ = resolveFamily(XNL_NAME_PF);
+        } catch (...) {
+            // A throwing constructor never runs the destructor, so release the socket here.
+            close(fd_);
+            fd_ = -1;
+            throw;
+        }
+    }
+
+    ~XnlClient() {
+        if (fd_ >= 0) {
+            close(fd_);
+        }
+    }
+
+    XnlClient(const XnlClient&) = delete;
+    XnlClient& operator=(const XnlClient&) = delete;
+
+    /// Parsed netlink response: scalar attributes plus any generic message text.
+    struct Response {
+        std::array<uint32_t, XNL_ATTR_MAX> attrs{};
+        std::array<bool, XNL_ATTR_MAX> present{};
+        std::string genmsg;
+    };
+
+    /// Send command @p op for device index @p devIndex with the given u32
+    /// attributes (DEV_IDX and a response-buffer-length hint are added
+    /// automatically) and return the parsed response.
+    Response sendCmd(uint8_t op, uint32_t devIndex,
+                     const std::vector<std::pair<uint16_t, uint32_t>>& attrs) {
+        std::vector<char> buf(RESP_BUF_LEN, 0);
+        auto* n = reinterpret_cast<struct nlmsghdr*>(buf.data());
+
+        n->nlmsg_type = family_;
+        n->nlmsg_flags = NLM_F_REQUEST;
+        n->nlmsg_pid = getpid();
+        n->nlmsg_seq = seq_++;
+        n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+
+        auto* g = reinterpret_cast<struct genlmsghdr*>(NLMSG_DATA(n));
+        g->cmd = op;
+        g->version = XNL_VERSION;
+
+        addIntAttr(n, XNL_ATTR_DEV_IDX, devIndex);
+        for (const auto& [type, val] : attrs) {
+            addIntAttr(n, type, val);
+        }
+        // Tell the kernel how large a response we can accept.
+        addIntAttr(n, XNL_ATTR_RSP_BUF_LEN, static_cast<uint32_t>(buf.size()));
+
+        sendMsg(n);
+        return recvMsg(buf); // the request buffer is reused for the reply
+    }
+
+private:
+    static uint16_t alignedAttrLen(uint16_t payload) { // NOTE: despite its name, adds no padding
+        return static_cast<uint16_t>(NLA_HDRLEN + payload);
+    }
+
+    static void addIntAttr(struct nlmsghdr* n, uint16_t type, uint32_t value) {
+        auto* attr = reinterpret_cast<struct nlattr*>(reinterpret_cast<char*>(n) + n->nlmsg_len);
+        attr->nla_type = type; // written at the current end of the message
+        attr->nla_len = alignedAttrLen(sizeof(uint32_t));
+        std::memcpy(reinterpret_cast<char*>(attr) + NLA_HDRLEN, &value, sizeof(value));
+        n->nlmsg_len += NLMSG_ALIGN(attr->nla_len); // padding is accounted for here
+    }
+
+    static void addStrAttr(struct nlmsghdr* n, uint16_t type, const char* s) {
+        auto* attr = reinterpret_cast<struct nlattr*>(reinterpret_cast<char*>(n) + n->nlmsg_len);
+        const size_t len = std::strlen(s) + 1; // include the terminating NUL
+        attr->nla_type = type;
+        attr->nla_len = alignedAttrLen(static_cast<uint16_t>(len));
+        std::memcpy(reinterpret_cast<char*>(attr) + NLA_HDRLEN, s, len);
+        n->nlmsg_len += NLMSG_ALIGN(attr->nla_len);
+    }
+
+    void sendMsg(struct nlmsghdr* n) {
+        struct sockaddr_nl addr{};
+        addr.nl_family = AF_NETLINK;
+        ssize_t rv = sendto(fd_, n, n->nlmsg_len, 0,
+                            reinterpret_cast<struct sockaddr*>(&addr), sizeof(addr));
+        if (rv < 0 || static_cast<uint32_t>(rv) != n->nlmsg_len) { // error or short send
+            throwSystemError("QDMA netlink send failed");
+        }
+    }
+
+    Response recvMsg(std::vector<char>& buf) {
+        std::memset(buf.data(), 0, buf.size());
+        ssize_t rv = recv(fd_, buf.data(), buf.size(), 0); // single datagram only
+        if (rv < 0) {
+            throwSystemError("QDMA netlink receive failed");
+        }
+
+        auto* n = reinterpret_cast<struct nlmsghdr*>(buf.data());
+        if (n->nlmsg_type == NLMSG_ERROR) {
+            int err = 0;
+            if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+                auto* nlerr = reinterpret_cast<struct nlmsgerr*>(NLMSG_DATA(n));
+                err = nlerr->error;
+            }
+            throw std::runtime_error("QDMA netlink returned an error response (" +
+                                     std::to_string(err) + ")");
+        }
+
+        Response resp;
+        auto* p = reinterpret_cast<unsigned char*>(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN);
+        int maxlen = static_cast<int>(n->nlmsg_len) - static_cast<int>(NLMSG_LENGTH(GENL_HDRLEN));
+        while (maxlen > 0) {
+            auto* na = reinterpret_cast<struct nlattr*>(p);
+            if (na->nla_len < NLA_HDRLEN) { // malformed attribute: stop parsing
+                break;
+            }
+            const int len = NLA_ALIGN(na->nla_len);
+            const char* payload = reinterpret_cast<const char*>(na) + NLA_HDRLEN;
+
+            if (na->nla_type == XNL_ATTR_GENMSG) {
+                resp.genmsg.assign(payload); // C string; buf was zeroed before recv()
+            } else if (na->nla_type < XNL_ATTR_MAX) {
+                uint32_t v = 0;
+                std::memcpy(&v, payload, sizeof(v)); // NOTE(review): assumes a u32 payload
+                resp.attrs[na->nla_type] = v;
+                resp.present[na->nla_type] = true;
+            }
+
+            p += len;
+            maxlen -= len;
+        }
+        return resp;
+    }
+
+    uint16_t resolveFamily(const char* name) {
+        std::vector<char> buf(RESP_BUF_LEN, 0);
+        auto* n = reinterpret_cast<struct nlmsghdr*>(buf.data());
+
+        n->nlmsg_type = GENL_ID_CTRL;
+        n->nlmsg_flags = NLM_F_REQUEST;
+        n->nlmsg_pid = getpid();
+        n->nlmsg_seq = seq_++;
+        n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+
+        auto* g = reinterpret_cast<struct genlmsghdr*>(NLMSG_DATA(n));
+        g->cmd = CTRL_CMD_GETFAMILY;
+        g->version = XNL_VERSION;
+
+        addStrAttr(n, CTRL_ATTR_FAMILY_NAME, name);
+        sendMsg(n);
+
+        std::memset(buf.data(), 0, buf.size());
+        ssize_t rv = recv(fd_, buf.data(), buf.size(), 0);
+        if (rv < 0) {
+            throwSystemError(std::string("Failed to resolve QDMA netlink family '") + name +
+                             "' (is the upstream qdma driver loaded?)");
+        }
+        if (n->nlmsg_type == NLMSG_ERROR) {
+            int err = 0;
+            if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+                auto* nlerr = reinterpret_cast<struct nlmsgerr*>(NLMSG_DATA(n));
+                err = nlerr->error;
+            }
+            throw std::runtime_error(std::string("QDMA netlink family '") + name +
+                                     "' not found (netlink error " + std::to_string(err) +
+                                     "; is the upstream qdma driver loaded?)");
+        }
+
+        auto* p = reinterpret_cast<unsigned char*>(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN);
+        int maxlen = static_cast<int>(n->nlmsg_len) - static_cast<int>(NLMSG_LENGTH(GENL_HDRLEN));
+        while (maxlen > 0) {
+            auto* na = reinterpret_cast<struct nlattr*>(p);
+            if (na->nla_len < NLA_HDRLEN) {
+                break;
+            }
+            if (na->nla_type == CTRL_ATTR_FAMILY_ID) { // read as a u16 payload
+                uint16_t id = 0;
+                std::memcpy(&id, reinterpret_cast<char*>(na) + NLA_HDRLEN, sizeof(id));
+                return id;
+            }
+            const int len = NLA_ALIGN(na->nla_len);
+            p += len;
+            maxlen -= len;
+        }
+        throw std::runtime_error(std::string("QDMA netlink family '") + name +
+                                 "' id not present in response");
+    }
+
+    int fd_ = -1;
+    uint16_t family_ = 0;
+    uint32_t seq_ = 0;
+};
+
+namespace {
+
+/// Queue flags for a bidirectional AXI-MM queue pair.
+constexpr uint32_t QFLAG_MM_BI = XNL_F_QMODE_MM | XNL_F_QDIR_BOTH;
+
+/// Queue flags for `q start`.  In addition to mode/direction, this must enable
+/// the descriptor-ring writeback/completion-status reporting and fetch credit,
+/// exactly as `dma-ctl q start` does by default (see
+/// QDMA/linux-kernel/apps/dma-ctl/cmd_parse.c). Without the writeback bits the
+/// poll-mode driver never observes MM completion and every transfer times out.
+constexpr uint32_t QFLAG_MM_BI_START =
+    QFLAG_MM_BI |
+    XNL_F_CMPL_STATUS_EN | XNL_F_CMPL_STATUS_ACC_EN |
+    XNL_F_CMPL_STATUS_PEND_CHK | XNL_F_CMPL_STATUS_DESC_EN |
+    XNL_F_FETCH_CREDIT;
+
+/// Default descriptor-ring size index for `q start` when the caller gives no
+/// override, matching `dma-ctl`'s default ("ring size set to 2048").
+constexpr uint32_t QRNGSZ_IDX_DEFAULT = 9;
+
+} // namespace
+
+QdmaDriverDevice::QdmaDriverDevice(const std::string& boardBdf,
+                                   std::optional<uint32_t> ringSizeIndex)
+    : nl_(std::make_unique<XnlClient>()),
+      ringSizeIndex_(ringSizeIndex.value_or(QRNGSZ_IDX_DEFAULT)) {
+    const ParsedBdf board = parseBdf(boardBdf);
+
+    // Enumerate the driver's devices and find the QDMA function on this board.
+    // Each PF line looks like: "qdma61001\t0000:61:00.1\tmax QP: 512, 0~511".
+    XnlClient::Response resp = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {});
+    if (resp.genmsg.empty()) {
+        throw std::runtime_error(
+            "Upstream QDMA driver reported no devices (dev list empty). "
+            "Ensure the stock qdma driver is bound to the board.");
+    }
+
+    bool found = false;
+    std::istringstream lines(resp.genmsg);
+    std::string line;
+    while (std::getline(lines, line)) {
+        std::istringstream tokens(line);
+        std::string name;
+        std::string bdfStr;
+        if (!(tokens >> name >> bdfStr)) {
+            continue;
+        }
+        if (name.rfind("qdma", 0) != 0 || name.rfind("qdmavf", 0) == 0) {
+            continue; // not a PF entry
+        }
+
+        ParsedBdf entry;
+        try {
+            entry = parseBdf(bdfStr);
+        } catch (const std::exception&) {
+            continue;
+        }
+        if (entry.base() != board.base()) {
+            continue;
+        }
+        // Index = hex suffix after the qdma prefix; NOTE(review): std::stoul can throw, uncaught.
+        index_ = static_cast<unsigned>(std::stoul(name.substr(4), nullptr, 16));
+        functionBdf_ = bdfStr;
+
+        const auto pos = line.find("max QP:");
+        if (pos != std::string::npos) { // the 7 below is the length of the label just found
+            qmax_ = static_cast<unsigned>(std::strtoul(line.c_str() + pos + 7, nullptr, 10));
+        }
+        found = true; // keep scanning: a later match overwrites this one unless we break
+        if (entry.function.value_or(0) == 1) {
+            break; // Prefer the QDMA PF used by SLASH/V80.
+        }
+    }
+
+    if (!found) {
+        throw std::runtime_error(
+            "No upstream QDMA function found for board " + board.base() +
+            " (is the stock qdma driver bound to this board's PF?)");
+    }
+
+    // Ask the driver how many MM (memory-mapped) DMA engine channels this
+    // function exposes so we can spread queues across them.  CPM5 (V80)
+    // reports 2; older/soft IPs report 1.  Best-effort: if the query fails or
+    // the attribute is absent, fall back to a single channel (channel 0).
+    try {
+        XnlClient::Response info = nl_->sendCmd(XNL_CMD_DEV_INFO, index_, {});
+        if (info.present[XNL_ATTR_DEV_MM_CHANNEL_MAX] &&
+            info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX] > 0) {
+            mmChannelMax_ = info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX];
+        }
+    } catch (const std::exception&) {
+        mmChannelMax_ = 1;
+    }
+}
+
+QdmaDriverDevice::~QdmaDriverDevice() = default;  // out of line; presumably so nl_'s XnlClient type is complete here
+
+// Re-read qmax_ for functionBdf_ from the driver's device list.  Throws
+// std::runtime_error if that entry is missing or has no "max QP:" field.
+void QdmaDriverDevice::refreshQmax() {
+    static const char kMaxQpTag[] = "max QP:";
+    XnlClient::Response listing = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {});
+    std::istringstream entries(listing.genmsg);
+    for (std::string entry; std::getline(entries, entry);) {
+        std::istringstream fields(entry);
+        std::string devName;
+        std::string devBdf;
+        const bool parsed = static_cast<bool>(fields >> devName >> devBdf);
+        if (!parsed || devBdf != functionBdf_) {
+            continue;
+        }
+        const std::string::size_type tagAt = entry.find(kMaxQpTag);
+        if (tagAt == std::string::npos) {
+            throw std::runtime_error("QDMA device list entry for " + functionBdf_ +
+                                     " does not report max QP");
+        }
+        const char* digits = entry.c_str() + tagAt + (sizeof(kMaxQpTag) - 1);
+        qmax_ = static_cast<unsigned>(std::strtoul(digits, nullptr, 10));
+        return;
+    }
+    throw std::runtime_error("QDMA function " + functionBdf_ +
+                             " disappeared from driver device list after qmax update");
+}
+
+// Raise the function's provisioned queue count to at least @p needed by
+// writing its sysfs qmax file; throws std::runtime_error if that fails.
+void QdmaDriverDevice::ensureQmax(unsigned needed) {
+    if (needed <= qmax_) {
+        return;  // already enough queues, leave sysfs alone
+    }
+    const std::string wanted = std::to_string(needed);
+    const std::string sysfsPath = "/sys/bus/pci/devices/" + functionBdf_ + "/qdma/qmax";
+    std::ofstream out(sysfsPath);
+    if (!out.is_open()) {
+        throw std::runtime_error(
+            "Need at least " + wanted + " queues but qmax is " + std::to_string(qmax_) +
+            " and cannot open " + sysfsPath +
+            " to raise it (run as root, or set qmax manually with dma-ctl)");
+    }
+    out << needed << std::endl;  // endl flushes, so a rejected write shows up in fail()
+    out.close();
+    if (out.fail()) {
+        throw std::runtime_error("Failed to write qmax=" + wanted + " to " + sysfsPath +
+                                 " (queues may be active; stop them or reload the driver)");
+    }
+    refreshQmax();  // go by what the driver reports, not by what was written
+    if (qmax_ < needed) {
+        throw std::runtime_error("QDMA qmax update requested " + wanted +
+                                 " queues, but driver reports only " + std::to_string(qmax_));
+    }
+}
+
+void QdmaDriverDevice::queueAdd(uint32_t qid) {
+    XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_ADD, index_,
+        {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}});
+    if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) {
+        throw std::runtime_error("QDMA q add failed for qid " + std::to_string(qid) + ": " +
+                                 (resp.genmsg.empty() ? "netlink error" : resp.genmsg));
+    }
+}
+
+void QdmaDriverDevice::queueStart(uint32_t qid, uint32_t channel) {
+    // The MM engine channel has to travel with `q start`: the driver reads
+    // XNL_ATTR_MM_CHANNEL only in its start handler (via qdma_queue_config)
+    // and falls back to channel 0 when the attribute is missing.  An
+    // out-of-range request wraps around into the device's channel count,
+    // which is well defined because mmChannelMax_ is never below 1.
+    channel %= mmChannelMax_;
+    XnlClient::Response reply = nl_->sendCmd(XNL_CMD_Q_START, index_,
+        {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI_START},
+         {XNL_ATTR_QRNGSZ_IDX, ringSizeIndex_}, {XNL_ATTR_MM_CHANNEL, channel}});
+    if (reply.present[XNL_ATTR_ERROR] && reply.attrs[XNL_ATTR_ERROR] != 0) {
+        const std::string why = reply.genmsg.empty() ? "netlink error" : reply.genmsg;
+        throw std::runtime_error("QDMA q start failed for qid " + std::to_string(qid) + ": " + why);
+    }
+}
+
+void QdmaDriverDevice::queueStop(uint32_t qid) noexcept {
+    try {
+        (void)nl_->sendCmd(XNL_CMD_Q_STOP, index_,
+            {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}});
+    } catch (...) {
+        // Best-effort teardown.
+    }
+}
+
+void QdmaDriverDevice::queueDel(uint32_t qid) noexcept {
+    try {
+        (void)nl_->sendCmd(XNL_CMD_Q_DEL, index_,
+            {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}});
+    } catch (...) {
+        // Best-effort teardown.
+    }
+}
+
+std::string QdmaDriverDevice::charDevPath(uint32_t qid) const {
+    char path[64] = {};  // ample: "/dev/qdma" + hex index + "-MM-" + decimal qid
+    std::snprintf(path, sizeof(path), "/dev/qdma%05x-MM-%u", index_, qid);
+    return path;
+}
+
+// Map a host buffer, add + start queue qid, and open its char device; undoes partial setup on failure.
+QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid,
+                                   uint64_t physAddr, uint64_t size,
+                                   int mmChannel)
+    : device_(&device), qid_(qid), physAddr_(physAddr) {
+    try {
+        mapping_ = raw::createHostMapping(size, physAddr);
+        // Any negative mmChannel means auto: pass the qid and let queueStart() wrap it into range.
+        const uint32_t channel = (mmChannel < 0)
+            ? qid_
+            : static_cast<uint32_t>(mmChannel);
+        // Set each flag right after its step succeeds so cleanup() only undoes completed steps.
+        device_->queueAdd(qid_);
+        queueAdded_ = true;
+        device_->queueStart(qid_, channel);
+        queueStarted_ = true;
+        // Open the per-queue char device that syncToDevice()/syncFromDevice() transfer through.
+        const std::string path = device_->charDevPath(qid_);
+        fd_ = open(path.c_str(), O_RDWR | O_CLOEXEC);
+        if (fd_ < 0) {
+            throwSystemError("Failed to open QDMA char device " + path);
+        }
+    } catch (...) {
+        cleanup();  // release whatever was acquired before rethrowing
+        throw;
+    }
+}
+
+QdmaDriverBuffer::~QdmaDriverBuffer() {
+    cleanup();  // closes the fd, stops/deletes the queue, and unmaps the host buffer
+}
+
+// Take over every handle held by `other` and leave it empty (nothing left for its cleanup()).
+void QdmaDriverBuffer::moveFrom(QdmaDriverBuffer& other) noexcept {
+    device_ = other.device_;
+    other.device_ = nullptr;
+    qid_ = other.qid_;
+    other.qid_ = 0;
+    queueAdded_ = other.queueAdded_;
+    other.queueAdded_ = false;
+    queueStarted_ = other.queueStarted_;
+    other.queueStarted_ = false;
+    fd_ = other.fd_;
+    other.fd_ = -1;
+    physAddr_ = other.physAddr_;
+    other.physAddr_ = 0;
+    mapping_ = other.mapping_;
+    other.mapping_ = raw::HostMapping{};
+}
+
+void QdmaDriverBuffer::cleanup() noexcept {  // reverse order of construction; safe to call more than once
+    if (fd_ >= 0) {
+        (void)close(fd_);  // result ignored: this path cannot report or recover from it
+        fd_ = -1;
+    }
+    if (device_ != nullptr && queueStarted_) {  // device_ is null on a moved-from buffer
+        device_->queueStop(qid_);
+        queueStarted_ = false;
+    }
+    if (device_ != nullptr && queueAdded_) {
+        device_->queueDel(qid_);
+        queueAdded_ = false;
+    }
+    raw::destroyHostMapping(mapping_);  // no-op when nothing is mapped
+}
+
+} // namespace smi::qdma_driver
diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp
new file mode 100644
index 00000000..e0d94fa8
--- /dev/null
+++ b/smi/src/qdma_driver_backend.hpp
@@ -0,0 +1,160 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SMI_QDMA_DRIVER_BACKEND_HPP
+#define SMI_QDMA_DRIVER_BACKEND_HPP
+
+/// @file qdma_driver_backend.hpp
+/// @brief Raw-transfer backend for the off-the-shelf Xilinx QDMA driver.
+///
+/// This backend mirrors the surface of validate.cpp's SLASH RawTransferBuffer
+/// (data()/getSize()/syncToDevice()/syncFromDevice()) so the templated
+/// integrity and bandwidth tests work unchanged, but it drives the upstream
+/// QDMA driver (submodules/qdma_drv) instead of SLASH/libslash:
+///
+///   - Queue lifecycle (add/start/stop/del) is performed over generic netlink
+///     (family "xnl_pf"), exactly as the `dma-ctl` utility does.
+///   - The function's `qmax` is provisioned via sysfs if it is too small.
+///   - Data movement uses the per-queue char device /dev/qdma<idx>-MM-<qid>
+///     with the device address carried as the file offset.
+///
+/// Unlike SLASH there is no control device or custom ioctl ABI; the stock
+/// driver must be bound to the function for any of this to work.
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "raw_transfer.hpp"
+
+namespace smi::qdma_driver {
+
+/// Opaque generic-netlink client used to talk to the QDMA driver.
+class XnlClient;
+
+/// Represents a single PCIe function managed by the upstream QDMA driver.
+///
+/// Resolves the driver's device index from the board BDF, ensures enough
+/// queues are provisioned (qmax), and provides queue lifecycle operations.
+class QdmaDriverDevice {
+public:
+    /// @param boardBdf      Board-level BDF "DDDD:BB:DD"; the function is resolved from the driver's device list.
+    /// @param ringSizeIndex Presumably the QRNGSZ_IDX used by queueStart(), with a backend default when unset (TODO confirm).
+    explicit QdmaDriverDevice(const std::string& boardBdf,
+                              std::optional<uint32_t> ringSizeIndex = std::nullopt);
+    ~QdmaDriverDevice();
+
+    QdmaDriverDevice(const QdmaDriverDevice&) = delete;
+    QdmaDriverDevice& operator=(const QdmaDriverDevice&) = delete;
+
+    /// Ensure the function has at least @p needed queues provisioned, writing the sysfs
+    /// `qmax` entry (which re-initializes the queue set) if required.  Throws on failure.
+    void ensureQmax(unsigned needed);
+
+    /// Add (but do not start) a bidirectional AXI-MM queue pair at relative
+    /// index @p qid.  Throws std::runtime_error if the driver reports an error.
+    ///
+    /// No MM channel is selected here: the channel only takes effect on
+    /// `q start` (the driver ignores it on `q add`), so queueStart() carries it.
+    void queueAdd(uint32_t qid);
+    /// Start queue @p qid on MM engine `channel % mmChannelMax()`: @p channel is
+    /// 0-based and out-of-range values wrap around instead of being rejected.
+    void queueStart(uint32_t qid, uint32_t channel);
+
+    /// Stop / delete queue @p qid.  Best-effort: failures are swallowed, so these never throw (safe in dtors).
+    void queueStop(uint32_t qid) noexcept;
+    void queueDel(uint32_t qid) noexcept;
+
+    /// Char-device path for queue @p qid, e.g. "/dev/qdma61001-MM-0".
+    std::string charDevPath(uint32_t qid) const;
+
+    /// Resolved 0000:BB:DD.F PCI address of the QDMA function.
+    const std::string& functionBdf() const { return functionBdf_; }
+
+    /// Number of MM (memory-mapped) DMA engine channels the function exposes.
+    /// CPM5 (V80) reports 2; older/soft IPs report 1.  Always >= 1.
+    unsigned mmChannelMax() const { return mmChannelMax_; }
+
+private:
+    void refreshQmax();  // re-reads qmax_ from the driver's device list
+
+    std::unique_ptr<XnlClient> nl_;  ///< Netlink client every driver command is sent through.
+    unsigned index_ = 0;          ///< Driver device index (qdma<index>).
+    std::string functionBdf_;     ///< Full BDF including function.
+    unsigned qmax_ = 0;           ///< Currently provisioned queue count.
+    unsigned mmChannelMax_ = 1;   ///< Number of MM engine channels (>= 1).
+    uint32_t ringSizeIndex_ = 0;  ///< QRNGSZ_IDX used when starting queues.
+};
+
+/// One host buffer bound to a freshly-created upstream QDMA queue pair.
+///
+/// Satisfies the buffer concept used by validate.cpp's testDataIntegrity() /
+/// testBandwidth() templates.
+class QdmaDriverBuffer {
+public:
+    /// @param mmChannel Concrete MM channel to pin to, or any negative value
+    ///                  to spread queues across channels by qid % channel-count.
+    QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size,
+                     int mmChannel);
+
+    QdmaDriverBuffer(const QdmaDriverBuffer&) = delete;
+    QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete;
+    /// Move-only: a move hands over the queue, fd and mapping and leaves @p other empty.
+    QdmaDriverBuffer(QdmaDriverBuffer&& other) noexcept { moveFrom(other); }
+    QdmaDriverBuffer& operator=(QdmaDriverBuffer&& other) noexcept {
+        if (this != &other) {
+            cleanup();  // release what this object currently owns before taking over other's
+            moveFrom(other);
+        }
+        return *this;
+    }
+
+    ~QdmaDriverBuffer();
+    /// Host staging buffer base address and its size in bytes.
+    void* data() { return mapping_.data; }
+    uint64_t getSize() const { return mapping_.size; }
+    /// H2C: validate the range, then write buffer bytes [offset, offset + size) to physAddr + offset.
+    void syncToDevice(uint64_t offset, uint64_t size) {
+        raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step);
+        raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/true);
+    }
+    /// C2H: validate the range, then read device bytes at physAddr + offset into the buffer.
+    void syncFromDevice(uint64_t offset, uint64_t size) {
+        raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step);
+        raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/false);
+    }
+
+private:
+    void moveFrom(QdmaDriverBuffer& other) noexcept;  // does not release what *this already holds
+    void cleanup() noexcept;
+
+    QdmaDriverDevice* device_ = nullptr;  ///< Not owned; must outlive this buffer (used again in cleanup()).
+    uint32_t qid_ = 0;                    ///< Queue index passed to the device's queue operations.
+    bool queueAdded_ = false;             ///< queueAdd() succeeded, so cleanup() must queueDel().
+    bool queueStarted_ = false;           ///< queueStart() succeeded, so cleanup() must queueStop().
+    int fd_ = -1;                         ///< Per-queue char device fd; -1 when not open.
+    uint64_t physAddr_ = 0;               ///< Device-side base address this buffer backs.
+    raw::HostMapping mapping_{};          ///< Host staging buffer.
+};
+
+} // namespace smi::qdma_driver
+
+#endif // SMI_QDMA_DRIVER_BACKEND_HPP
diff --git a/smi/src/raw_transfer.hpp b/smi/src/raw_transfer.hpp
new file mode 100644
index 00000000..41988d31
--- /dev/null
+++ b/smi/src/raw_transfer.hpp
@@ -0,0 +1,242 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SMI_RAW_TRANSFER_HPP
+#define SMI_RAW_TRANSFER_HPP
+
+/// @file raw_transfer.hpp
+/// @brief Backend-agnostic helpers for the raw QDMA memory-mapped transfer
+///        tests used by `smi validate`.
+///
+/// The SLASH backend (libslash queue-pair fds) and the off-the-shelf Xilinx
+/// QDMA-driver backend (/dev/qdma<idx>-MM-<qid> char devices) share the exact
+/// same host-side buffer setup and pread/pwrite transfer loop -- only the way
+/// the file descriptor and device address get provisioned differs.  Those
+/// shared pieces live here so both backends behave (and time) identically.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <cerrno>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+/// Per-transfer timing instrumentation.
+///
+/// When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with
+/// -DSLASH_QDMA_TIMING=1), the raw-transfer path logs the wall-clock cost of
+/// each pwrite/pread syscall plus the aggregate per-transfer time and
+/// effective bandwidth.  This is the userspace counterpart to the kernel's
+/// SLASH_QDMA_TIMING breakdown.
+#ifndef SLASH_QDMA_TIMING
+#define SLASH_QDMA_TIMING 0
+#endif
+
+namespace smi::raw {
+
+/// Host transfer sizes mirror libvrtd's QDMA staging policy.
+static constexpr uint64_t BASE_TRANSFER_STEP_SIZE = 4ULL * 1024ULL;
+
+[[noreturn]] inline void throwSystemError(const std::string& message) {  // call right after the failing call
+    throw std::runtime_error(message + ": " + std::strerror(errno));  // appends the current errno text
+}
+
+/// A host staging buffer plus the DMA granule it is backed by.
+///
+/// `step` is always BASE_TRANSFER_STEP_SIZE (4 KiB base pages).  It is used
+/// only for range/alignment validation: the whole range is transferred in a
+/// single syscall and the kernel builds one DMA descriptor per page.
+struct HostMapping {
+    void* data = nullptr;  ///< Base of the mmap'd buffer; nullptr when nothing is mapped.
+    uint64_t size = 0;     ///< Buffer length in bytes.
+    uint64_t step = 0;     ///< Alignment granule in bytes; 0 until createHostMapping() sets it.
+};
+
+/// Create a host staging buffer of 4 KiB base pages for raw transfers.  @p
+/// physAddr is the device address this buffer backs and is only used to make
+/// error messages actionable.
+inline HostMapping createHostMapping(uint64_t size, uint64_t physAddr) {
+    // Built once so each failure below names the buffer's size and device address.
+    char context[96];
+    std::snprintf(context, sizeof(context), " (%llu bytes backing device address 0x%llx)",
+                  static_cast<unsigned long long>(size),
+                  static_cast<unsigned long long>(physAddr));
+
+    HostMapping mapping;
+    mapping.size = size;
+
+    // Map regular base pages.  MAP_POPULATE is deliberately omitted: it would
+    // pre-fault the whole buffer during mmap(), i.e. before the MADV_NOHUGEPAGE
+    // below can take effect. On hosts with transparent hugepages set to
+    // "always", those early faults hand back 2 MiB THP compound pages, and
+    // MADV_NOHUGEPAGE does not split pages that are already faulted in. The
+    // driver's strict 4 KiB base-page path (slash_qdma_map_user_base_pages_to_sgl)
+    // then rejects every transfer with -EINVAL ("4 KiB transfer is not backed by
+    // a base page").
+    mapping.data = mmap(nullptr, size, PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (mapping.data == MAP_FAILED) {
+        throwSystemError(std::string("Failed to mmap raw transfer host buffer") + context);
+    }
+
+    // Disable THP for this region *before* any page is faulted in, so that
+    // every fault below allocates a genuine 4 KiB base page.
+    if (madvise(mapping.data, size, MADV_NOHUGEPAGE) != 0) {
+        const int savedErrno = errno;
+        (void)munmap(mapping.data, size);
+        std::string what = "Failed to disable transparent hugepages for raw transfer host buffer";
+        what += context;
+        errno = savedErrno;  // munmap and the string work above may have changed it
+        throwSystemError(what);
+    }
+
+    // Pre-fault the buffer as base pages, replacing the MAP_POPULATE dropped
+    // above. Touching one byte per page now that VM_NOHUGEPAGE is set forces
+    // the kernel to back each page with a 4 KiB base page up front (and keeps
+    // the page-fault cost out of the timed transfer loop).
+    volatile uint8_t* touch = static_cast<volatile uint8_t*>(mapping.data);
+    for (uint64_t off = 0; off < size; off += BASE_TRANSFER_STEP_SIZE) {
+        touch[off] = 0;
+    }
+
+    mapping.step = BASE_TRANSFER_STEP_SIZE;
+    return mapping;
+}
+
+/// Release a host mapping created by createHostMapping().
+inline void destroyHostMapping(HostMapping& mapping) noexcept {
+    if (mapping.data != nullptr && mapping.data != MAP_FAILED) {
+        (void)munmap(mapping.data, mapping.size);
+    }
+    mapping = HostMapping{};  // also clear size/step so a destroyed mapping reads as empty
+}
+
+/// Validate that a [offset, offset+size) request is aligned and in range for a
+/// buffer of @p bufSize bytes backing device address @p physAddr, given the
+/// mapping's @p step.
+inline void validateSyncRange(uint64_t offset, uint64_t size, uint64_t bufSize,
+                              uint64_t physAddr, uint64_t step) {
+    if (size == 0 || step == 0) {
+        throw std::invalid_argument("Invalid raw transfer size");
+    }
+    const bool aligned = (offset % step) == 0 && (size % step) == 0 &&
+                         (bufSize % step) == 0 && (physAddr % step) == 0;
+    if (!aligned) {
+        throw std::invalid_argument("Raw transfer range is not aligned to the host mapping step");
+    }
+    if (!(offset <= bufSize && size <= bufSize - offset)) {
+        throw std::out_of_range("Raw transfer range exceeds buffer size");
+    }
+    // The range is sent in a single pread/pwrite, so its length must fit in ssize_t.
+    if (size > static_cast<uint64_t>(std::numeric_limits<ssize_t>::max())) {
+        throw std::invalid_argument("Raw transfer size exceeds syscall limit");
+    }
+}
+
+/// Perform a raw memory-mapped QDMA transfer over @p fd using pread/pwrite,
+/// with the device (endpoint) address encoded as the file offset.
+///
+/// @param fd        Per-queue char device / queue-pair fd.
+/// @param data      Host staging buffer base.
+/// @param physAddr  Device-side base address for this buffer.
+/// @param offset    Byte offset within the buffer (and added to physAddr).
+/// @param size      Number of bytes to transfer.
+/// @param step      Mapping step size (see HostMapping::step).
+/// @param toDevice  true for H2C (pwrite), false for C2H (pread).
+inline void rawTransfer(int fd, void* data, uint64_t physAddr, uint64_t offset,
+                        uint64_t size, [[maybe_unused]] uint64_t step,
+                        bool toDevice) {
+    // Issue the whole range in a single syscall regardless of page granule.
+    // The kernel pins every page in the range and builds one descriptor per
+    // page, submitting a single multi-descriptor libqdma request, so the
+    // syscall/submit overhead does not depend on the page size.  The loop only
+    // repeats to retry after EINTR or to finish a short transfer.
+    auto* const hostBase = static_cast<uint8_t*>(data) + offset;
+    const uint64_t devBase = physAddr + offset;
+#if SLASH_QDMA_TIMING
+    const auto xferStart = std::chrono::steady_clock::now();
+#endif
+    uint64_t transferred = 0;
+    while (transferred < size) {
+        uint8_t* const host = hostBase + transferred;
+        const size_t remaining = static_cast<size_t>(size - transferred);
+        const off_t devOffset = static_cast<off_t>(devBase + transferred);
+#if SLASH_QDMA_TIMING
+        const auto callStart = std::chrono::steady_clock::now();
+#endif
+        const ssize_t ret = toDevice
+            ? pwrite(fd, host, remaining, devOffset)
+            : pread(fd, host, remaining, devOffset);
+
+        if (ret < 0 && errno == EINTR) {
+            continue;
+        }
+        if (ret < 0) {
+            throwSystemError(toDevice ? "Raw QDMA write failed" : "Raw QDMA read failed");
+        }
+        if (ret == 0) {
+            // A zero return leaves errno untouched, so appending strerror(errno)
+            // here would blame whatever unrelated call last failed.
+            throw std::runtime_error(toDevice ? "Raw QDMA write failed: no data transferred"
+                                              : "Raw QDMA read failed: no data transferred");
+        }
+#if SLASH_QDMA_TIMING
+        const auto callNs = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                std::chrono::steady_clock::now() - callStart)
+                                .count();
+        std::fprintf(stderr,
+                     "validate: timing %s dev=0x%llx bytes=%zu ret=%zd syscall=%lld ns\n",
+                     toDevice ? "H2C" : "C2H",
+                     static_cast<unsigned long long>(devOffset), remaining, ret,
+                     static_cast<long long>(callNs));
+#endif
+        transferred += static_cast<uint64_t>(ret);
+    }
+
+#if SLASH_QDMA_TIMING
+    const auto totalNs = std::chrono::duration_cast<std::chrono::nanoseconds>(
+                             std::chrono::steady_clock::now() - xferStart)
+                             .count();
+    const double mb = static_cast<double>(size) / (1024.0 * 1024.0);
+    const double sec = static_cast<double>(totalNs) / 1e9;
+    std::fprintf(stderr,
+                 "validate: timing %s xfer dev=0x%llx size=%llu step=%llu total=%lld ns (%.1f MB/s)\n",
+                 toDevice ? "H2C" : "C2H",
+                 static_cast<unsigned long long>(devBase),
+                 static_cast<unsigned long long>(size),
+                 static_cast<unsigned long long>(step), static_cast<long long>(totalNs),
+                 sec > 0.0 ? mb / sec : 0.0);
+#endif
+}
+
+} // namespace smi::raw
+
+#endif // SMI_RAW_TRANSFER_HPP
diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp
index 063fdb8e..e9da5e3d 100644
--- a/smi/src/smi.cpp
+++ b/smi/src/smi.cpp
@@ -27,6 +27,9 @@
 /// reset, validate, debug).
 
 #include <iostream>
+#include <limits>
+#include <map>
+#include <string>
 #include <string_view>
 
 #include <CLI/CLI.hpp>
@@ -108,11 +111,90 @@ static int smiMain(int argc, char **argv) {
     // -- validate (memory integrity + bandwidth) --
     auto* validateCommand = app.add_subcommand("validate", "Validate board memory (integrity + bandwidth)");
     Validate::Options validateOptions;
+    auto addValidateSizeOption = [&](const char* name, uint64_t* target, const char* description) {
+        return validateCommand->add_option_function<std::string>(
+            name,
+            [target, name, &validateOptions](const std::string& value) {
+                try {
+                    *target = Validate::parseByteSizeOption(value);
+                    validateOptions.placementExplicit = true;
+                } catch (const std::exception& e) {
+                    throw CLI::ValidationError(name, e.what());
+                }
+            },
+            description);
+    };
     validateCommand->add_option("-d,--device", validateOptions.bdf, "Board address (e.g. 03:00 or 0000:03:00)")->required();
     validateCommand->add_option("-j,--threads", validateOptions.threads,
         "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u));
     validateCommand->add_flag("-R,--no-reset", validateOptions.noReset,
         "Skip the device reset step before running memory tests");
+    validateCommand->add_option_function<std::string>("--mm-channel",
+        [&validateOptions](const std::string& value) {
+            try {
+                validateOptions.mmChannels = Validate::parseMmChannelSpec(value);
+            } catch (const std::exception& e) {
+                throw CLI::ValidationError("--mm-channel", e.what());
+            }
+        },
+        "AXI-MM/NoC channel per buffer: auto|0|1 applied to all buffers, or a "
+        "comma-separated list with exactly one entry per buffer position "
+        "(2 x --threads entries, e.g. -j 1 -> '0,1'); no repeating. "
+        "auto stripes across channels by qid&1. Default auto.")
+        ->default_str("auto");
+    addValidateSizeOption("--buffer-size", &validateOptions.bufferSize,
+        "Size of each validate buffer; accepts bytes or k/K/m/M suffixes (max 512M)")
+        ->default_str("512M");
+    addValidateSizeOption("--offset", &validateOptions.offset,
+        "Distance between logical validate buffer positions; accepts bytes or k/K/m/M suffixes")
+        ->default_str("512M");
+    addValidateSizeOption("--starting-offset", &validateOptions.startingOffset,
+        "Offset from each memory-space base for logical position 0; accepts bytes or k/K/m/M suffixes")
+        ->default_str("0");
+    auto* rawTransferFlag = validateCommand->add_flag("--raw-transfer-test", validateOptions.rawTransferTest,
+        "Use libslash raw QDMA transfers instead of VRTD buffers (implies --no-reset)");
+    auto* useQdmaDriverFlag = validateCommand->add_flag("--use-qdma-driver", validateOptions.useQdmaDriver,
+        "Run the raw transfer test over the off-the-shelf Xilinx QDMA driver "
+        "(/dev/qdma<idx>-MM-<qid>) instead of SLASH; requires the stock qdma driver "
+        "bound to the board. Implies --no-reset; mutually exclusive with --raw-transfer-test");
+    rawTransferFlag->excludes(useQdmaDriverFlag);
+    useQdmaDriverFlag->excludes(rawTransferFlag);
+    auto* ddrOnlyFlag = validateCommand->add_flag("--ddr-only", validateOptions.ddrOnly,
+        "Run only DDR memory tests (skip HBM)");
+    auto* hbmOnlyFlag = validateCommand->add_flag("--hbm-only", validateOptions.hbmOnly,
+        "Run only HBM memory tests (skip DDR)");
+    ddrOnlyFlag->excludes(hbmOnlyFlag);
+    hbmOnlyFlag->excludes(ddrOnlyFlag);
+    const std::map<std::string, Validate::Options::ChannelAllocation> channelAllocationMap{
+        {"auto", Validate::Options::ChannelAllocation::Auto},
+        {"paired", Validate::Options::ChannelAllocation::Paired},
+    };
+    validateCommand->add_option("--channel-allocation", validateOptions.channelAllocation,
+        "Raw-transfer NoC channel/memory placement (raw modes only): "
+        "auto (interleaved: mm-channel=qid&1, linear addressing; default) or "
+        "paired (couple mm-channel to a distinct memory region/NSU per "
+        "--channel-region-stride, mirroring dma-perf offset_ch0/offset_ch1)")
+        ->transform(CLI::CheckedTransformer(channelAllocationMap, CLI::ignore_case))
+        ->default_str("auto");
+    addValidateSizeOption("--channel-region-stride", &validateOptions.channelRegionStride,
+        "In --channel-allocation paired mode, byte distance between the two per-channel "
+        "memory regions (NSU/pseudo-channel stride); accepts k/K/m/M/g/G suffixes")
+        ->default_str("16G");
+    validateCommand->add_option_function<uint32_t>("--ring-size-index",
+        [&validateOptions](uint32_t value) {
+            validateOptions.ringSizeIndex = value;
+        },
+        "Raw-transfer queue descriptor-ring size index (0-15). Overrides the backend default.")
+        ->check(CLI::Range(0u, 15u))
+        ->default_str("backend default");
+    validateCommand->add_option("--bandwidth-iterations", validateOptions.bandwidthIterations,
+        "Raw-transfer bandwidth mode only: repeat each whole-buffer transfer this many times")
+        ->default_val(1)->check(CLI::Range(static_cast<uint64_t>(1),
+                                           std::numeric_limits<uint64_t>::max()));
+    validateCommand->add_option("--bandwidth-duration", validateOptions.bandwidthDuration,
+        "Raw-transfer bandwidth mode only: repeat whole-buffer transfers for this many seconds "
+        "(0 disables duration mode)")
+        ->default_val(0.0)->check(CLI::NonNegativeNumber);
 
     // -- debug (low-level debug utilities) --
     auto* debugCommand = app.add_subcommand("debug", "Low-level debug utilities");
diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp
index 605e9abc..60f28159 100644
--- a/smi/src/validate.cpp
+++ b/smi/src/validate.cpp
@@ -33,22 +33,733 @@
 /// TODO: Decide whether vrt::Device should gain a vrtbin-less constructor so
 ///       that commands like validate can go through the standard vrt:: layer.
 
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "validate.hpp"
 
+#include <algorithm>
+#include <cerrno>
+#include <cctype>
+#include <algorithm>
 #include <chrono>
+#include <exception>
 #include <cstdint>
+#include <cstdio>
 #include <cstring>
+#include <filesystem>
+#include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <string_view>
 #include <thread>
+#include <utility>
 #include <vector>
 
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
 #include <vrtd/session.hpp>
 
 #include "bdf.hpp"
+#include "raw_transfer.hpp"
+
+#ifdef SMI_ENABLE_QDMA_DRIVER_BACKEND
+#include "qdma_driver_backend.hpp"
+#endif
+
+extern "C" {
+#include <slash/qdma.h>
+}
+
+namespace {
+
+using smi::raw::throwSystemError;
+
+/// Region constants mirror vrt/vrtd/src/allocator.h, which is private.
+static constexpr uint64_t HBM_BASE = 0x4000000000ULL;
+static constexpr uint64_t DDR_BASE = 0x60000000000ULL;
+static constexpr uint64_t MEM_REGION_SIZE = 512ULL * 1024 * 1024;  // 512 MiB
+static constexpr uint64_t MEMORY_SPACE_SIZE = 64ULL * MEM_REGION_SIZE;  // 64 regions = 32 GiB
+static constexpr uint64_t MAX_BUFFER_SIZE = MEM_REGION_SIZE;  // upper bound for --buffer-size
+static constexpr uint64_t TRANSFER_ALIGNMENT = 4096ULL;  // 4 KiB, see requiredAlignment()
+
+static constexpr uint32_t QDMA_Q_MODE_MM = 0;  // written to slash_qdma_qpair_add::mode
+static constexpr uint32_t QDMA_DIR_H2C = 0x1;  // direction bits, OR-ed into dir_mask
+static constexpr uint32_t QDMA_DIR_C2H = 0x2;
+static constexpr uint32_t QDMA_RING_SZ_IDX = 0;  // default ring-size index of RawTransferBuffer
+
+/// Alignment, in bytes, that every placement size and offset must satisfy.
+/// Always the 4 KiB QDMA transfer alignment, whatever the options say.
+static uint64_t requiredAlignment(const Validate::Options& options) {
+    static_cast<void>(options);  // part of the signature, but not consulted
+    return TRANSFER_ALIGNMENT;
+}
+
+/// Look up the AXI-MM channel choice for one buffer position.  A list holding a
+/// single entry is shared by all positions; any other list is indexed by
+/// position (validatePlacement checks it has one entry per position).
+static Validate::Options::MmChannel mmChannelForPosition(const Validate::Options& options,
+                                                         uint64_t position) {
+    const auto& channels = options.mmChannels;
+    return channels.size() != 1 ? channels[position] : channels.front();
+}
+
+/// Translate a position's channel choice to vrtd's enum (anything but Ch0/Ch1 -> Auto).
+static vrtd::MmChannel vrtdMmChannel(const Validate::Options& options, uint64_t position) {
+    const auto selected = mmChannelForPosition(options, position);
+    if (selected == Validate::Options::MmChannel::Ch0) {
+        return vrtd::MmChannel::Ch0;
+    }
+    return selected == Validate::Options::MmChannel::Ch1 ? vrtd::MmChannel::Ch1
+                                                         : vrtd::MmChannel::Auto;
+}
+
+/// Translate a position's channel choice to the SLASH UAPI enum (anything but Ch0/Ch1 -> AUTO).
+static slash_qdma_mm_channel slashMmChannel(const Validate::Options& options, uint64_t position) {
+    const auto selected = mmChannelForPosition(options, position);
+    if (selected == Validate::Options::MmChannel::Ch0) {
+        return SLASH_QDMA_MM_CHANNEL_0;
+    }
+    return selected == Validate::Options::MmChannel::Ch1 ? SLASH_QDMA_MM_CHANNEL_1
+                                                         : SLASH_QDMA_MM_CHANNEL_AUTO;
+}
+
+/// Translate a position's channel choice into the channel number handed to the
+/// off-the-shelf QDMA driver: 0 or 1 for an explicit channel, -1 for auto
+/// (the queue then spreads by qid % channels).
+static int qdmaDriverMmChannel(const Validate::Options& options, uint64_t position) {
+    const auto selected = mmChannelForPosition(options, position);
+    if (selected == Validate::Options::MmChannel::Ch0) {
+        return 0;
+    }
+    return selected == Validate::Options::MmChannel::Ch1 ? 1 : -1;
+}
+
+/// Return a copy of @p text without leading or trailing whitespace (per std::isspace).
+static std::string trim(std::string_view text) {
+    const auto isSpace = [](char ch) {
+        return std::isspace(static_cast<unsigned char>(ch)) != 0;
+    };
+
+    size_t begin = 0;
+    size_t end = text.size();
+    for (; begin < end && isSpace(text[begin]); ++begin) {
+    }
+    for (; end > begin && isSpace(text[end - 1]); --end) {
+    }
+
+    return std::string(text.substr(begin, end - begin));
+}
+
+/// Parse a byte count such as "4096", "0x1000", "64k", "2M" or "1GB".  The number
+/// is read by std::stoull with base 0; an optional k/m/g (x1024 per step, either
+/// case) and an optional trailing 'b'/'B' may follow it.
+/// @throws std::invalid_argument for empty, signed, malformed or overflowing input.
+static uint64_t parseByteSizeText(std::string_view text) {
+    std::string value = trim(text);
+    if (value.empty()) {
+        throw std::invalid_argument("value must not be empty");
+    }
+
+    const auto isMultiplier = [](char c) {
+        return c == 'k' || c == 'K' || c == 'm' || c == 'M' || c == 'g' || c == 'G';
+    };
+    // 'b'/'B' is also a hex digit: in a 0x literal only drop it after k/m/g, so
+    // that "0x1b" stays 27 rather than silently becoming 1.
+    const bool isHex = value.size() > 2 && value[0] == '0' && (value[1] == 'x' || value[1] == 'X');
+    const bool endsWithB = value.back() == 'b' || value.back() == 'B';
+    if (endsWithB && (!isHex || isMultiplier(value[value.size() - 2]))) {
+        value.pop_back();
+    }
+    uint64_t multiplier = 1;
+    if (!value.empty() && isMultiplier(value.back())) {
+        const char suffix = static_cast<char>(std::tolower(static_cast<unsigned char>(value.back())));
+        multiplier = suffix == 'k' ? (1ULL << 10) : suffix == 'm' ? (1ULL << 20) : (1ULL << 30);
+        value.pop_back();
+    }
+
+    value = trim(value);
+    if (value.empty() || value.front() == '-' || value.front() == '+') {
+        throw std::invalid_argument("value must be an unsigned byte count");
+    }
+    size_t parsed = 0;
+    uint64_t bytes = 0;
+    try {
+        bytes = std::stoull(value, &parsed, 0);
+    } catch (const std::exception&) {
+        throw std::invalid_argument("value must be an unsigned byte count");
+    }
+    if (parsed != value.size()) {
+        throw std::invalid_argument("unrecognized byte-size suffix");
+    }
+    if (bytes > std::numeric_limits<uint64_t>::max() / multiplier) {
+        throw std::invalid_argument("byte-size value is too large");
+    }
+    return bytes * multiplier;
+}
+
+/// True when @p value is a whole multiple of @p alignment.
+/// @p alignment must be non-zero (the remainder is taken with %).
+static bool isAligned(uint64_t value, uint64_t alignment) { return value % alignment == 0; }
+
+/// Print a stderr diagnostic and return false when @p value is not a multiple of @p alignment.
+static bool checkAligned(const char* name, uint64_t value, uint64_t alignment) {
+    if (isAligned(value, alignment)) {
+        return true;
+    }
+    std::cerr << "validate: " << name << " must be " << alignment << "-byte aligned" << std::endl;
+    return false;
+}
+
+/// Check that `positions` buffers placed at startingOffset + i * offset all fit inside
+/// MEMORY_SPACE_SIZE; on failure print a diagnostic naming @p memoryName and return false.
+static bool checkMemoryPlacementRange(const char* memoryName,
+                                      const Validate::Options& options,
+                                      uint64_t positions) {
+    if (positions == 0) {
+        return true;
+    }
+    const uint64_t finalIndex = positions - 1;
+    const uint64_t headroom = std::numeric_limits<uint64_t>::max() - options.startingOffset;
+    if (finalIndex != 0 && options.offset > headroom / finalIndex) {
+        std::cerr << "validate: " << memoryName
+                  << " placement overflows 64-bit address arithmetic" << std::endl;
+        return false;
+    }
+
+    const uint64_t finalStart = options.startingOffset + finalIndex * options.offset;
+    const bool fits = finalStart <= MEMORY_SPACE_SIZE &&
+                      options.bufferSize <= MEMORY_SPACE_SIZE - finalStart;
+    if (!fits) {
+        std::cerr << "validate: " << memoryName << " placement exceeds available "
+                  << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space" << std::endl;
+        return false;
+    }
+    return true;
+}
+
+/// Byte distance between the two per-channel regions in paired mode; a
+/// configured stride of 0 selects half of MEMORY_SPACE_SIZE.
+static uint64_t pairedRegionStride(const Validate::Options& options) {
+    const uint64_t configured = options.channelRegionStride;
+    return configured == 0 ? MEMORY_SPACE_SIZE / 2 : configured;
+}
+
+/// Paired-allocation placement check.  Even and odd positions live in two regions
+/// pairedRegionStride() bytes apart and are packed by in-region index (position >> 1).
+/// Fails (with a stderr diagnostic) if a region overflows its stride or the memory space.
+static bool checkMemoryPlacementRangePaired(const char* memoryName,
+                                            const Validate::Options& options,
+                                            uint64_t positions) {
+    if (positions == 0) {
+        return true;
+    }
+
+    const uint64_t regionStride = pairedRegionStride(options);
+    const uint64_t align = requiredAlignment(options);
+    if (regionStride == 0 || (regionStride % align) != 0) {
+        std::cerr << "validate: --channel-region-stride must be a non-zero multiple of "
+                  << align << " bytes" << std::endl;
+        return false;
+    }
+    if (MEMORY_SPACE_SIZE < regionStride) {
+        std::cerr << "validate: --channel-region-stride exceeds the "
+                  << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB per-memory address space"
+                  << std::endl;
+        return false;
+    }
+
+    // Largest in-region index (position >> 1) used by either of the two regions.
+    const uint64_t topIndex = (positions - 1) >> 1;
+    const uint64_t headroom = std::numeric_limits<uint64_t>::max() - options.startingOffset;
+    if (topIndex != 0 && options.offset > headroom / topIndex) {
+        std::cerr << "validate: " << memoryName
+                  << " paired placement overflows 64-bit address arithmetic" << std::endl;
+        return false;
+    }
+    const uint64_t topStart = options.startingOffset + topIndex * options.offset;
+    // The last buffer of a region has to end before the next region begins.
+    const bool fitsRegion = topStart <= regionStride &&
+                            options.bufferSize <= regionStride - topStart;
+    if (!fitsRegion) {
+        std::cerr << "validate: " << memoryName
+                  << " paired placement overflows the per-channel region (stride " << regionStride
+                  << " bytes); reduce --threads/--buffer-size/--offset or raise"
+                     " --channel-region-stride" << std::endl;
+        return false;
+    }
+    // Region 1 starts one stride higher and must also end inside the memory space.
+    if (topStart + options.bufferSize > MEMORY_SPACE_SIZE - regionStride) {
+        std::cerr << "validate: " << memoryName
+                  << " paired placement exceeds available "
+                  << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space"
+                  << std::endl;
+        return false;
+    }
+    return true;
+}
+
+/// Validate every placement-related option before any buffer is created.  Checks
+/// run in a fixed order and the first failure prints a "validate: ..." message on
+/// stderr.  @return true when all checks pass.
+static bool validatePlacement(const Validate::Options& options) {
+    const uint64_t positions = 2ULL * options.threads;
+    const size_t channelCount = options.mmChannels.size();
+    if (channelCount != 1 && channelCount != positions) {
+        std::cerr << "validate: --mm-channel list must have exactly 1 or " << positions
+                  << " entries (one per buffer position = 2 x --threads); got "
+                  << channelCount << std::endl;
+        return false;
+    }
+    if (options.bufferSize == 0 || options.bufferSize > MAX_BUFFER_SIZE) {
+        std::cerr << "validate: --buffer-size must be in the range 1..512M" << std::endl;
+        return false;
+    }
+    if (options.offset == 0) {
+        std::cerr << "validate: --offset must be greater than zero" << std::endl;
+        return false;
+    }
+    const uint64_t alignment = requiredAlignment(options);
+    const bool aligned = checkAligned("--buffer-size", options.bufferSize, alignment) &&
+                         checkAligned("--offset", options.offset, alignment) &&
+                         checkAligned("--starting-offset", options.startingOffset, alignment);
+    if (!aligned) {
+        return false;
+    }
+    if (options.offset < options.bufferSize) {
+        std::cerr << "validate: --offset must be at least --buffer-size so buffers do not overlap"
+                  << std::endl;
+        return false;
+    }
+
+    // The options below are meaningful only for the raw transfer tests.
+    const bool rawMode = options.rawTransferTest || options.useQdmaDriver;
+    const bool paired = options.channelAllocation == Validate::Options::ChannelAllocation::Paired;
+    if (paired && !rawMode) {
+        std::cerr << "validate: --channel-allocation paired only applies to the raw transfer"
+                     " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl;
+        return false;
+    }
+    if (!rawMode && (options.bandwidthIterations > 1 || options.bandwidthDuration > 0.0)) {
+        std::cerr << "validate: --bandwidth-iterations/--bandwidth-duration only apply to the raw transfer"
+                     " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl;
+        return false;
+    }
+    if (!rawMode && options.ringSizeIndex.has_value()) {
+        std::cerr << "validate: --ring-size-index only applies to the raw transfer"
+                     " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl;
+        return false;
+    }
+    if (options.bandwidthDuration < 0.0) {
+        std::cerr << "validate: --bandwidth-duration must be non-negative" << std::endl;
+        return false;
+    }
+    if (options.ringSizeIndex.has_value() && *options.ringSizeIndex > 15) {
+        std::cerr << "validate: --ring-size-index must be in the range 0..15" << std::endl;
+        return false;
+    }
+
+    // HBM and DDR share the same layout rules; skip whichever one is excluded.
+    const auto checkRange = paired ? checkMemoryPlacementRangePaired : checkMemoryPlacementRange;
+    if (!options.ddrOnly && !checkRange("HBM", options, positions)) {
+        return false;
+    }
+    return options.hbmOnly || checkRange("DDR", options, positions);
+}
+
+/// Linear placement: device address of buffer @p position within @p memoryBase.
+static uint64_t addressFor(uint64_t memoryBase, const Validate::Options& options, uint64_t position) {
+    const uint64_t relative = options.startingOffset + position * options.offset;
+    return memoryBase + relative;
+}
+
+/// Device address of a raw-transfer buffer under the active channel-allocation
+/// strategy.  Outside Paired mode this is plain addressFor().  In Paired mode
+/// the low bit of the position picks the region (even -> region 0, odd ->
+/// region 1, pairedRegionStride() bytes above region 0) and position >> 1 is
+/// the buffer's index inside that region.  Per the original note, this mirrors
+/// dma-perf's offset_ch0/offset_ch1 so each mm-channel targets its own memory
+/// region (NSU) rather than both converging on one.
+static uint64_t rawAddressFor(uint64_t memoryBase,
+                              const Validate::Options& options,
+                              uint64_t position) {
+    if (options.channelAllocation != Validate::Options::ChannelAllocation::Paired) {
+        return addressFor(memoryBase, options, position);
+    }
+    const uint64_t regionIndex = position & 1ULL;
+    const uint64_t slot = position >> 1;
+    return memoryBase + regionIndex * pairedRegionStride(options) +
+           options.startingOffset + slot * options.offset;
+}
+
+/// Announce on stdout which raw-transfer channel-allocation strategy is active.
+static void printChannelAllocation(const Validate::Options& options) {
+    if (options.channelAllocation != Validate::Options::ChannelAllocation::Paired) {
+        std::cout << "Channel allocation: auto (mm-channel = qid&1, linear addressing)" << std::endl;
+        return;
+    }
+    std::cout << "Channel allocation: paired (even positions -> mm-channel 0 / region 0, "
+                 "odd -> mm-channel 1 / region 1; region stride 0x"
+              << std::hex << pairedRegionStride(options) << std::dec << " bytes)" << std::endl;
+}
+
+/// Print the requested raw-transfer ring-size index; silent when none was given.
+static void printRingSizeIndex(const Validate::Options& options) {
+    if (const auto& index = options.ringSizeIndex; index.has_value()) {
+        std::cout << "QDMA ring size index: " << *index << std::endl;
+    }
+}
+
+/// Print the configured AXI-MM channel list as "MM channel: a,b,... (scope)".
+static void printMmChannel(const Validate::Options& options) {
+    const auto& channels = options.mmChannels;
+    std::cout << "MM channel: ";
+    for (size_t index = 0; index < channels.size(); ++index) {
+        const auto channel = channels[index];
+        const char* text = channel == Validate::Options::MmChannel::Ch0   ? "0"
+                           : channel == Validate::Options::MmChannel::Ch1 ? "1"
+                                                                          : "auto";
+        if (index > 0) {
+            std::cout << ",";
+        }
+        std::cout << text;
+    }
+    const bool shared = channels.size() == 1;
+    std::cout << (shared ? " (all buffers)" : " (per buffer position)") << std::endl;
+}
+
+/// Compare the peak host-buffer footprint of the requested run (4 buffers per
+/// thread when both memories are tested, otherwise 2) with the free physical
+/// memory reported by sysconf().  @return false only when it clearly does not fit.
+static bool checkHostMemoryBudget(const Validate::Options& options) {
+    const bool bothMemories = !options.ddrOnly && !options.hbmOnly;
+    const uint64_t buffersPerThread = bothMemories ? 4ULL : 2ULL;
+    const uint64_t requiredBytes = buffersPerThread * options.threads * options.bufferSize;
+    const uint64_t requiredMiB = requiredBytes / (1024ULL * 1024ULL);
+
+    const long pageBytes = sysconf(_SC_PAGESIZE);
+    const long freePages = sysconf(_SC_AVPHYS_PAGES);
+    if (pageBytes <= 0 || freePages <= 0) {
+        std::cerr << "Warning: unable to estimate available host memory for validate; "
+                  << "peak mapped buffer footprint is "
+                  << requiredMiB << " MiB." << std::endl;
+        return true;
+    }
+    const uint64_t availableBytes =
+        static_cast<uint64_t>(pageBytes) * static_cast<uint64_t>(freePages);
+    if (requiredBytes <= availableBytes) {
+        return true;
+    }
+    std::cerr << "validate: requested test can map up to "
+              << requiredMiB << " MiB of host buffers, "
+              << "but only about " << (availableBytes / (1024ULL * 1024ULL))
+              << " MiB is currently available. Reduce --threads or use --ddr-only/--hbm-only."
+              << std::endl;
+    return false;
+}
+
+/// Warn on stderr when the effective uid is not root; @p mode names the feature.
+static void warnIfNotRoot(const char* mode) {
+    if (const bool privileged = geteuid() == 0; !privileged) {
+        std::cerr << "Warning: " << mode
+                  << " usually needs root or udev-granted access to QDMA device nodes/sysfs." << std::endl;
+    }
+}
+
+/// Read the DEVNAME= entry from @p miscPath/uevent and return it prefixed with "/dev/".
+/// @throws std::runtime_error if the file cannot be opened or has no DEVNAME entry.
+std::string readDevNameFromUevent(const std::filesystem::path& miscPath) {
+    static constexpr std::string_view key{"DEVNAME="};
+    const std::filesystem::path ueventPath = miscPath / "uevent";
+
+    std::ifstream uevent(ueventPath);
+    if (!uevent.is_open()) {
+        throw std::runtime_error("Failed to open " + ueventPath.string());
+    }
+
+    for (std::string line; std::getline(uevent, line);) {
+        if (line.starts_with(key)) {
+            std::string devName = line.substr(key.size());
+            const size_t end = devName.find_last_not_of("\r\n");
+            devName.erase(end == std::string::npos ? 0 : end + 1);
+            return "/dev/" + devName;
+        }
+    }
+
+    throw std::runtime_error("No DEVNAME entry found in " + ueventPath.string());
+}
+
+/// Find the /dev node of the SLASH QDMA control device for board @p boardBdf by
+/// looking in /sys/class/misc: the "slash_qdma_ctl_<bdf>.1" entry is preferred,
+/// otherwise the lowest-sorting "slash_qdma_ctl_<bdf>.*" entry is used (with a
+/// warning when several exist).  @throws std::runtime_error when nothing matches.
+std::string resolveQdmaDevicePath(const std::string& boardBdf) {
+    static const std::filesystem::path MISC_PATH{"/sys/class/misc"};
+    const std::string prefix = "slash_qdma_ctl_" + boardBdf + ".";
+
+    const auto preferred = MISC_PATH / (prefix + "1");
+    if (std::filesystem::exists(preferred)) {
+        return readDevNameFromUevent(preferred);
+    }
+
+    std::vector<std::filesystem::path> candidates;
+    for (const auto& entry : std::filesystem::directory_iterator(MISC_PATH)) {
+        if (entry.path().filename().string().starts_with(prefix)) {
+            candidates.push_back(entry.path());
+        }
+    }
+    if (candidates.empty()) {
+        throw std::runtime_error(
+            "No QDMA misc device found for board " + boardBdf +
+            " (looked for /sys/class/misc/" + prefix + "*)");
+    }
+
+    const auto& chosen = *std::min_element(candidates.begin(), candidates.end());
+    if (candidates.size() > 1) {
+        std::cerr << "Warning: multiple QDMA devices found for " << boardBdf
+                  << "; using " << chosen.filename().string() << std::endl;
+    }
+    return readDevNameFromUevent(chosen);
+}
+
+/// Owning handle for a slash_qdma device opened with slash_qdma_open(); the
+/// handle is closed with slash_qdma_close() on destruction.  Copying is disabled.
+class RawQdmaDevice {
+public:
+    /// Open the device at @p path; reports failure through throwSystemError().
+    explicit RawQdmaDevice(const std::string& path) : qdma_{slash_qdma_open(path.c_str())} {
+        if (qdma_ == nullptr) {
+            throwSystemError("Failed to open QDMA device " + path);
+        }
+    }
+
+    RawQdmaDevice(const RawQdmaDevice&) = delete;
+    RawQdmaDevice& operator=(const RawQdmaDevice&) = delete;
+
+    RawQdmaDevice(RawQdmaDevice&& other) noexcept
+        : qdma_{std::exchange(other.qdma_, nullptr)} {}
+
+    RawQdmaDevice& operator=(RawQdmaDevice&& other) noexcept {
+        if (this != &other) {
+            cleanup();
+            qdma_ = std::exchange(other.qdma_, nullptr);
+        }
+        return *this;
+    }
+
+    ~RawQdmaDevice() { cleanup(); }
+
+    /// Borrow the underlying handle; ownership stays with this object.
+    slash_qdma* get() const { return qdma_; }
+
+private:
+    /// Close the handle if one is held; a second call does nothing.
+    void cleanup() {
+        if (qdma_ == nullptr) {
+            return;
+        }
+        (void)slash_qdma_close(qdma_);
+        qdma_ = nullptr;
+    }
+
+    slash_qdma* qdma_ = nullptr;
+};
+
+/// One host DMA buffer bound to one device address, with its own MM queue pair.
+/// Construction creates the buffer via slash_qdma_buffer_create() and then adds,
+/// starts and opens a queue pair; destruction undoes those steps.  Copying is
+/// disabled; moving hands every resource over to the destination object.
+class RawTransferBuffer {
+public:
+    /// @param qdma          open device handle (not closed by this class)
+    /// @param physAddr      device address that buffer offset 0 maps to
+    /// @param size          buffer size in bytes
+    /// @param mmChannel     channel requested for the queue pair
+    /// @param ringSizeIndex ring-size index applied to the H2C, C2H and CMPT rings
+    RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size,
+                      slash_qdma_mm_channel mmChannel,
+                      uint32_t ringSizeIndex)
+        : qdma_{qdma}, physAddr_{physAddr}, size_{size},
+          mmChannel_{mmChannel}, ringSizeIndex_{ringSizeIndex} {
+        try {
+            createBuffer();
+            createQpair();
+        } catch (...) {
+            cleanup();  // release whatever was acquired before the failure
+            throw;
+        }
+    }
+
+    RawTransferBuffer(const RawTransferBuffer&) = delete;
+    RawTransferBuffer& operator=(const RawTransferBuffer&) = delete;
+
+    RawTransferBuffer(RawTransferBuffer&& other) noexcept { moveFrom(other); }
+
+    RawTransferBuffer& operator=(RawTransferBuffer&& other) noexcept {
+        if (this == &other) {
+            return *this;
+        }
+        cleanup();
+        moveFrom(other);
+        return *this;
+    }
+
+    ~RawTransferBuffer() { cleanup(); }
+
+    /// CPU-visible pointer to the buffer contents.
+    void* data() { return data_; }
+
+    /// Buffer size in bytes.
+    uint64_t getSize() const { return size_; }
+
+    /// H2C transfer of @p size bytes starting at buffer offset @p offset.
+    void syncToDevice(uint64_t offset, uint64_t size) {
+        validateSyncRange(offset, size);
+        transfer(offset, size, /*toDevice=*/true);
+    }
+
+    /// C2H transfer of @p size bytes starting at buffer offset @p offset.
+    void syncFromDevice(uint64_t offset, uint64_t size) {
+        validateSyncRange(offset, size);
+        transfer(offset, size, /*toDevice=*/false);
+    }
+
+private:
+    /// Take over the members of @p other and reset its resource-tracking fields.
+    void moveFrom(RawTransferBuffer& other) noexcept {
+        qdma_ = std::exchange(other.qdma_, nullptr);
+        fd_ = std::exchange(other.fd_, -1);
+        qid_ = std::exchange(other.qid_, 0u);
+        qpairCreated_ = std::exchange(other.qpairCreated_, false);
+        qpairStarted_ = std::exchange(other.qpairStarted_, false);
+        buf_ = std::exchange(other.buf_, slash_qdma_buffer{});
+        data_ = std::exchange(other.data_, nullptr);
+        physAddr_ = std::exchange(other.physAddr_, 0);
+        size_ = std::exchange(other.size_, 0);
+        transferStepSize_ = std::exchange(other.transferStepSize_, 0);
+        mmChannel_ = other.mmChannel_;
+        ringSizeIndex_ = std::exchange(other.ringSizeIndex_, QDMA_RING_SZ_IDX);
+    }
+
+    /// Create the kernel-owned DMA buffer and touch it once per transfer step.
+    void createBuffer() {
+        // The kernel owns the DMA buffer (pages + SGL + DMA map built once at
+        // create time); buf_.addr is our mapping of it for CPU access.
+        const auto rc = slash_qdma_buffer_create(qdma_, size_, &buf_);
+        if (rc != 0) {
+            throwSystemError("Failed to create raw transfer DMA buffer");
+        }
+        data_ = buf_.addr;
+        transferStepSize_ = smi::raw::BASE_TRANSFER_STEP_SIZE;
+
+        // Write one byte per step so page faults happen here, outside the timed
+        // transfer loop.
+        auto* bytes = static_cast<volatile uint8_t*>(data_);
+        for (uint64_t pos = 0; pos < size_; pos += transferStepSize_) {
+            bytes[pos] = 0;
+        }
+    }
+
+    /// Add, start and open the MM queue pair used by this buffer.
+    void createQpair() {
+        if (qdma_ == nullptr || size_ == 0) {
+            throw std::invalid_argument("Invalid raw transfer buffer arguments");
+        }
+
+        struct slash_qdma_qpair_add req{};
+        req.size = sizeof(req);
+        req.mode = QDMA_Q_MODE_MM;
+        req.dir_mask = QDMA_DIR_H2C | QDMA_DIR_C2H;
+        req.mm_channel = mmChannel_;
+        req.h2c_ring_sz = ringSizeIndex_;
+        req.c2h_ring_sz = ringSizeIndex_;
+        req.cmpt_ring_sz = ringSizeIndex_;
+
+        if (slash_qdma_qpair_add(qdma_, &req) != 0) {
+            throwSystemError("Failed to add raw transfer QDMA queue pair");
+        }
+        qid_ = req.qid;
+        qpairCreated_ = true;  // from here on cleanup() deletes the pair
+
+        if (slash_qdma_qpair_start(qdma_, qid_) != 0) {
+            throwSystemError("Failed to start raw transfer QDMA queue pair");
+        }
+        qpairStarted_ = true;  // ... and stops it first
+
+        fd_ = slash_qdma_qpair_get_fd(qdma_, qid_, O_CLOEXEC);
+        if (fd_ < 0) {
+            throwSystemError("Failed to get raw transfer QDMA queue fd");
+        }
+    }
+
+    /// Range check delegated to smi::raw::validateSyncRange().
+    void validateSyncRange(uint64_t offset, uint64_t size) const {
+        smi::raw::validateSyncRange(offset, size, size_, physAddr_, transferStepSize_);
+    }
+
+    /// Move @p size bytes between buffer offset @p offset and physAddr_ + offset.
+    void transfer(uint64_t offset, uint64_t size, bool toDevice) {
+        // A batch of exactly one sub-transfer on this buffer's queue pair
+        // (qpair_index 0).  Channel parallelism for the bandwidth test comes
+        // from running many buffers concurrently, each pinned to a channel
+        // by mm_channel (see the channel-allocation knobs).
+        struct slash_qdma_subxfer xfer{};
+        xfer.qpair_index = 0;
+        xfer.direction = toDevice ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H;
+        xfer.buf_fd = buf_.fd;
+        xfer.buf_offset = offset;
+        xfer.dev_addr = physAddr_ + offset;
+        xfer.length = size;
 
-/// Buffer size for each allocation (64 MB — one allocator subregion).
-static constexpr uint64_t BUFFER_SIZE = 64ULL * 1024 * 1024;
+        const ssize_t moved = slash_qdma_qpair_transfer_batch(fd_, &xfer, 1);
+        if (moved < 0) {
+            throwSystemError(toDevice ? "Raw QDMA write failed" : "Raw QDMA read failed");
+        }
+        if (static_cast<uint64_t>(moved) != size) {
+            throw std::runtime_error("Raw QDMA transfer moved fewer bytes than requested");
+        }
+    }
+
+    /// Release the queue fd, the queue pair (stop, then delete) and the DMA buffer.
+    void cleanup() {
+        if (fd_ >= 0) {
+            (void)close(fd_);
+            fd_ = -1;
+        }
+        if (qpairStarted_) {
+            (void)slash_qdma_qpair_stop(qdma_, qid_);
+            qpairStarted_ = false;
+        }
+        if (qpairCreated_) {
+            (void)slash_qdma_qpair_del(qdma_, qid_);
+            qpairCreated_ = false;
+        }
+        if (buf_.addr != nullptr) {
+            (void)slash_qdma_buffer_destroy(&buf_);
+            buf_ = slash_qdma_buffer{};
+        }
+        data_ = nullptr;
+    }
+
+    slash_qdma* qdma_ = nullptr;
+    int fd_ = -1;
+    uint32_t qid_ = 0;
+    bool qpairCreated_ = false;
+    bool qpairStarted_ = false;
+    slash_qdma_buffer buf_{};
+    void* data_ = nullptr;
+    uint64_t physAddr_ = 0;
+    uint64_t size_ = 0;
+    uint64_t transferStepSize_ = 0;
+    slash_qdma_mm_channel mmChannel_ = SLASH_QDMA_MM_CHANNEL_AUTO;
+    uint32_t ringSizeIndex_ = QDMA_RING_SZ_IDX;
+};
 
 /// Fill @p buf with a deterministic pattern seeded by @p seed.
 static void fillPattern(void* buf, uint64_t size, uint32_t seed) {
@@ -74,12 +785,19 @@ static bool verifyPattern(const void* buf, uint64_t size, uint32_t seed) {
 
 /// Run data integrity on every buffer: write pattern → sync to device →
 /// clear host → sync from device → verify.
+///
+/// Output policy: per-buffer FAIL lines are printed as failures occur; OK
+/// buffers are silent.  A single summary line ("N/N OK" or "M/N OK, K
+/// FAIL") is printed at the end.
+///
 /// @return true if all buffers pass.
-static bool testDataIntegrity(std::vector<vrtd::Buffer>& buffers,
+template<typename Buffer>
+static bool testDataIntegrity(std::vector<Buffer>& buffers,
                               const std::string& label) {
-    bool allPassed = true;
+    const size_t total = buffers.size();
+    size_t passed = 0;
 
-    for (size_t i = 0; i < buffers.size(); ++i) {
+    for (size_t i = 0; i < total; ++i) {
         auto& buf = buffers[i];
         uint32_t seed = static_cast<uint32_t>(i);
         uint64_t size = buf.getSize();
@@ -90,73 +808,726 @@ static bool testDataIntegrity(std::vector<vrtd::Buffer>& buffers,
         std::memset(buf.data(), 0, size);
         buf.syncFromDevice(0, size);
 
-        bool ok = verifyPattern(buf.data(), size, seed);
-        std::cout << "    " << label << i << ": "
-                  << (ok ? "OK" : "FAIL") << std::endl;
-
-        if (!ok) {
-            allPassed = false;
+        if (verifyPattern(buf.data(), size, seed)) {
+            ++passed;
+        } else {
+            std::cout << "    " << label << i << ": FAIL" << std::endl;
         }
     }
 
-    return allPassed;
+    if (passed == total) {
+        std::cout << "    " << total << "/" << total << " OK" << std::endl;
+    } else {
+        std::cout << "    " << passed << "/" << total << " OK, "
+                  << (total - passed) << " FAIL" << std::endl;
+    }
+
+    return passed == total;
+}
+
+/// Throughput in MiB/s for @p bytes moved during @p elapsed; 0 if @p elapsed is not positive.
+static double mbPerSecond(uint64_t bytes, std::chrono::duration<double> elapsed) {
+    return elapsed.count() > 0.0 ? static_cast<double>(bytes) / (1024.0 * 1024.0) / elapsed.count() : 0.0;
+}
+
+static void printBandwidthMetric(const char* label, double mbps) {  // prints one line: "    <label>: <mbps> MB/s"
+    std::cout << "    " << label << ": " << std::fixed << std::setprecision(2)  // fixed/precision remain set on std::cout afterwards
+              << mbps << " MB/s" << std::endl;
+}
+
+struct BandwidthRepeatOptions {  // How often a bandwidth measurement repeats its transfers.
+    uint64_t iterations = 1;                      ///< Passes per buffer when no duration is set.
+    std::chrono::duration<double> duration{0.0};  ///< Time budget; a non-positive value disables duration mode.
+
+    /// True when a positive duration was requested.
+    bool durationMode() const { return duration.count() > 0.0; }
+
+    /// True when more than one pass may run, by duration or by iteration count.
+    bool isRepeated() const {
+        return iterations > 1 || durationMode();
+    }
+};
+
+static BandwidthRepeatOptions repeatOptionsFromValidate(const Validate::Options& options) {  // validate options -> repeat settings
+    BandwidthRepeatOptions repeat;
+    repeat.iterations = std::max<uint64_t>(1, options.bandwidthIterations);  // clamp so at least one pass runs
+    repeat.duration = std::chrono::duration<double>(options.bandwidthDuration);  // value is taken as seconds (duration<double> default period)
+    return repeat;
 }
 
-/// Measure aggregate write and read bandwidth across all buffers in parallel
-/// (one std::thread per buffer).
-static void testBandwidth(std::vector<vrtd::Buffer>& buffers) {
+static void printBandwidthRepeatMode(const BandwidthRepeatOptions& repeat) {  // announces the repeat mode; silent for a plain single pass
+    if (repeat.durationMode()) {  // a positive duration is reported in preference to the iteration count
+        std::cout << "Bandwidth mode: duration " << std::fixed << std::setprecision(3)
+                  << repeat.duration.count() << " s" << std::endl;
+    } else if (repeat.iterations > 1) {
+        std::cout << "Bandwidth mode: " << repeat.iterations << " iterations" << std::endl;
+    }
+}
+
+template<typename Buffer>
+static uint64_t fillBuffers(std::vector<Buffer>& buffers, int value) {  // memset every buffer's data(); returns the bytes filled
     uint64_t totalBytes = 0;
     for (auto& buf : buffers) {
-        std::memset(buf.data(), 0xAB, buf.getSize());
+        std::memset(buf.data(), value, buf.getSize());  // memset writes value converted to unsigned char
         totalBytes += buf.getSize();
     }
+    return totalBytes;  // sum of getSize() over all buffers
+}
 
-    // -- Write (H2C) bandwidth --
-    auto writeStart = std::chrono::steady_clock::now();
-    {
-        std::vector<std::thread> threads;
-        threads.reserve(buffers.size());
-        for (auto& buf : buffers) {
-            threads.emplace_back([&buf] {
-                buf.syncToDevice(0, buf.getSize());
-            });
+/// Append one worker thread per buffer to @p threads (the caller joins them); a worker's
+/// exception is stored in errors[errorOffset + i] rather than escaping the thread.
+template<typename Buffer>
+static void launchTransferThreads(std::vector<Buffer>& buffers, bool toDevice, std::vector<std::thread>& threads,
+                                  std::vector<std::exception_ptr>& errors, size_t errorOffset) {
+    for (size_t idx = 0; idx != buffers.size(); ++idx) {
+        threads.emplace_back([&buffers, &errors, toDevice, idx, slot = errorOffset + idx] {
+            try {
+                Buffer& target = buffers[idx];
+                if (!toDevice) {
+                    target.syncFromDevice(0, target.getSize());
+                } else {
+                    target.syncToDevice(0, target.getSize());
+                }
+            } catch (...) {
+                errors[slot] = std::current_exception();
+            }
+        });
+    }
+}
+
+template<typename Buffer>
+static void runTransfers(std::vector<Buffer>& buffers, bool toDevice) {  // one full transfer per buffer, in parallel; blocks until all finish
+    std::vector<std::thread> threads;
+    std::vector<std::exception_ptr> errors(buffers.size());  // one slot per worker, null when it succeeded
+    threads.reserve(buffers.size());
+
+    launchTransferThreads(buffers, toDevice, threads, errors, 0);
+
+    for (auto& t : threads) {  // join everything before looking at errors
+        t.join();
+    }
+    for (auto& error : errors) {
+        if (error) {
+            std::rethrow_exception(error);  // only the lowest-indexed failure is reported
         }
-        for (auto& t : threads) {
-            t.join();
+    }
+}
+
+static uint64_t joinRepeatedTransferThreads(std::vector<std::thread>& threads,  // joins all workers, rethrows a stored failure, else sums bytes
+                                            std::vector<std::exception_ptr>& errors,
+                                            const std::vector<uint64_t>& bytes) {
+    for (auto& t : threads) {  // join everything before looking at errors
+        t.join();
+    }
+    for (auto& error : errors) {
+        if (error) {
+            std::rethrow_exception(error);  // only the lowest-indexed failure is reported
         }
     }
-    auto writeEnd = std::chrono::steady_clock::now();
 
-    // -- Read (C2H) bandwidth --
-    auto readStart = std::chrono::steady_clock::now();
-    {
-        std::vector<std::thread> threads;
-        threads.reserve(buffers.size());
+    uint64_t totalBytes = 0;
+    for (uint64_t value : bytes) {
+        totalBytes += value;
+    }
+    return totalBytes;  // reached only when no worker stored an exception
+}
+
+/// Repeat a full-buffer transfer on every buffer concurrently, one worker thread per buffer.
+/// In duration mode a worker keeps transferring until the shared deadline has passed;
+/// otherwise it performs repeat.iterations passes.
+/// @param buffers  Buffers to move; each pass covers offset 0 through getSize().
+/// @param toDevice true selects syncToDevice(), false selects syncFromDevice().
+/// @param repeat   Duration or iteration-count policy applied by each worker.
+/// @return {bytes moved by all workers, wall time from just before launch to just after join}.
+/// @throws The lowest-indexed stored worker exception, rethrown once all workers have joined.
+template<typename Buffer>
+static std::pair<uint64_t, std::chrono::duration<double>>
+runRepeatedTransfers(std::vector<Buffer>& buffers, bool toDevice, const BandwidthRepeatOptions& repeat) {
+    using Clock = std::chrono::steady_clock;
+    const size_t workerCount = buffers.size();
+    std::vector<std::exception_ptr> errors(workerCount);
+    std::vector<uint64_t> bytes(workerCount, 0);
+    std::vector<std::thread> threads;
+    threads.reserve(workerCount);
+
+    const auto start = Clock::now();
+    const auto deadline = start + repeat.duration;
+
+    for (size_t i = 0; i < workerCount; ++i) {
+        threads.emplace_back([&buffers, &errors, &bytes, i, toDevice, repeat, deadline] {
+            try {
+                Buffer& buf = buffers[i];
+                const uint64_t size = buf.getSize();
+                const auto transferOnce = [&buf, size, toDevice] {
+                    if (toDevice) {
+                        buf.syncToDevice(0, size);
+                    } else {
+                        buf.syncFromDevice(0, size);
+                    }
+                };
+                uint64_t completed = 0;
+                if (repeat.durationMode()) {
+                    for (; Clock::now() < deadline; ++completed) { transferOnce(); }
+                } else {
+                    for (; completed < repeat.iterations; ++completed) { transferOnce(); }
+                }
+                bytes[i] = completed * size;  // stays 0 if any pass threw
+            } catch (...) {
+                errors[i] = std::current_exception();
+            }
+        });
+    }
+
+    const uint64_t totalBytes = joinRepeatedTransferThreads(threads, errors, bytes);
+    const auto end = Clock::now();
+    return {totalBytes, end - start};
+}
+
+template<typename Buffer>
+static double testSingleDirectionBandwidth(std::vector<Buffer>& buffers,  // measures one direction; returns the aggregate rate from mbPerSecond()
+                                           bool toDevice,
+                                           const BandwidthRepeatOptions& repeat = {}) {
+    (void)fillBuffers(buffers, toDevice ? 0xAB : 0xCD);  // distinct fill byte per direction
+
+    if (!toDevice) {  // read test: write the pattern to the device first (untimed), then clear the host copy
+        runTransfers(buffers, /*toDevice=*/true);
         for (auto& buf : buffers) {
-            threads.emplace_back([&buf] {
-                buf.syncFromDevice(0, buf.getSize());
-            });
+            std::memset(buf.data(), 0, buf.getSize());
         }
-        for (auto& t : threads) {
-            t.join();
+    }
+
+    const auto [totalBytes, elapsed] = runRepeatedTransfers(buffers, toDevice, repeat);  // only this call is timed
+
+    return mbPerSecond(totalBytes, elapsed);
+}
+
+template<typename Buffer>
+static void testBidirectionalBandwidth(std::vector<Buffer>& writeBuffers,  // runs H2C and C2H workers concurrently and prints Read/Write/Total
+                                       std::vector<Buffer>& readBuffers,
+                                       const BandwidthRepeatOptions& repeat = {}) {
+    (void)fillBuffers(writeBuffers, 0xAB);
+    (void)fillBuffers(readBuffers, 0xCD);
+
+    // Prime device memory before timing so the C2H side reads initialized data.
+    runTransfers(readBuffers, /*toDevice=*/true);
+    for (auto& buf : readBuffers) {
+        std::memset(buf.data(), 0, buf.getSize());
+    }
+
+    std::vector<std::thread> threads;
+    std::vector<std::exception_ptr> errors(writeBuffers.size() + readBuffers.size());  // write workers use slots [0, W), read workers [W, W+R)
+    std::vector<uint64_t> writeThreadBytes(writeBuffers.size(), 0);
+    std::vector<uint64_t> readThreadBytes(readBuffers.size(), 0);
+    threads.reserve(errors.size());
+
+    const auto start = std::chrono::steady_clock::now();
+    const auto deadline = start + repeat.duration;  // only consulted in duration mode
+
+    for (size_t i = 0; i < writeBuffers.size(); ++i) {
+        threads.emplace_back([&writeBuffers, &errors, &writeThreadBytes, i, repeat, deadline] {
+            try {
+                const uint64_t size = writeBuffers[i].getSize();
+                uint64_t completed = 0;
+
+                if (repeat.durationMode()) {
+                    while (std::chrono::steady_clock::now() < deadline) {
+                        writeBuffers[i].syncToDevice(0, size);
+                        ++completed;
+                    }
+                } else {
+                    for (uint64_t iter = 0; iter < repeat.iterations; ++iter) {
+                        writeBuffers[i].syncToDevice(0, size);
+                        ++completed;
+                    }
+                }
+
+                writeThreadBytes[i] = completed * size;  // stays 0 if any pass threw
+            } catch (...) {
+                errors[i] = std::current_exception();
+            }
+        });
+    }
+    for (size_t i = 0; i < readBuffers.size(); ++i) {
+        threads.emplace_back([&readBuffers, &errors, &readThreadBytes, i,
+                              repeat, deadline, errorOffset = writeBuffers.size()] {
+            try {
+                const uint64_t size = readBuffers[i].getSize();
+                uint64_t completed = 0;
+
+                if (repeat.durationMode()) {
+                    while (std::chrono::steady_clock::now() < deadline) {
+                        readBuffers[i].syncFromDevice(0, size);
+                        ++completed;
+                    }
+                } else {
+                    for (uint64_t iter = 0; iter < repeat.iterations; ++iter) {
+                        readBuffers[i].syncFromDevice(0, size);
+                        ++completed;
+                    }
+                }
+
+                readThreadBytes[i] = completed * size;  // stays 0 if any pass threw
+            } catch (...) {
+                errors[errorOffset + i] = std::current_exception();
+            }
+        });
+    }
+
+    for (auto& t : threads) {
+        t.join();
+    }
+    const auto end = std::chrono::steady_clock::now();  // taken after every worker has joined
+
+    for (auto& error : errors) {
+        if (error) {
+            std::rethrow_exception(error);
         }
     }
-    auto readEnd = std::chrono::steady_clock::now();
 
-    double writeSec = std::chrono::duration<double>(writeEnd - writeStart).count();
-    double readSec  = std::chrono::duration<double>(readEnd - readStart).count();
-    double totalMB  = static_cast<double>(totalBytes) / (1024.0 * 1024.0);
+    const auto elapsed = end - start;  // both directions are rated over this one shared interval
+    uint64_t writeBytes = 0;
+    uint64_t readBytes = 0;
+    for (uint64_t value : writeThreadBytes) {
+        writeBytes += value;
+    }
+    for (uint64_t value : readThreadBytes) {
+        readBytes += value;
+    }
+    const double writeMBps = mbPerSecond(writeBytes, elapsed);
+    const double readMBps = mbPerSecond(readBytes, elapsed);
+
+    printBandwidthMetric("Read", readMBps);
+    printBandwidthMetric("Write", writeMBps);
+    printBandwidthMetric("Total", readMBps + writeMBps);
+}
+
+/// Announce and run the read test, then the write test, over the same buffers, printing each rate.
+template<typename Buffer>
+static void testBandwidthSuite(std::vector<Buffer>& singleDirectionBuffers, const std::string& label,
+                               const std::string& backendSuffix, const BandwidthRepeatOptions& repeat = {}) {
+    const auto runOne = [&](const char* banner, const char* metric, bool toDevice) {
+        std::cout << "Testing " << label << banner << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." << std::endl;
+        const double mbps = testSingleDirectionBandwidth(singleDirectionBuffers, toDevice, repeat);
+        printBandwidthMetric(metric, mbps);
+    };
+    // Read goes first; the read test writes its own pattern to the device before timing.
+    runOne(" read bandwidth (", "Read", /*toDevice=*/false);
+    runOne(" write bandwidth (", "Write", /*toDevice=*/true);
+}
+
+/// Announce the bidirectional test (one thread per write buffer plus one per read buffer)
+/// and run it; testBidirectionalBandwidth() prints the resulting rates.
+template<typename Buffer>
+static void testBidirectionalBandwidthSuite(std::vector<Buffer>& bidirectionalWriteBuffers,
+                                            std::vector<Buffer>& bidirectionalReadBuffers,
+                                            const std::string& label, const std::string& backendSuffix,
+                                            const BandwidthRepeatOptions& repeat = {}) {
+    const auto threadCount = bidirectionalWriteBuffers.size() + bidirectionalReadBuffers.size();
+    std::cout << "Testing " << label << " bidirectional bandwidth (" << threadCount << " threads" << backendSuffix << ")..." << std::endl;
+    testBidirectionalBandwidth(bidirectionalWriteBuffers, bidirectionalReadBuffers, repeat);
+}
+
+/// Open the bidirectional HBM buffer for slot @p position: a raw buffer at the computed
+/// address when placement is explicit, otherwise HBM region @p position via openHbmBuffer().
+static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device,
+                                          const Validate::Options& options,
+                                          uint64_t position) {
+    const auto direction = vrtd::BufferAllocDir::Bidirectional;
+    if (!options.placementExplicit) {
+        return device.openHbmBuffer(static_cast<uint32_t>(position), options.bufferSize, direction,
+                                    vrtdMmChannel(options, position));
+    }
+    return device.openRawBuffer(addressFor(HBM_BASE, options, position), options.bufferSize, direction,
+                                vrtdMmChannel(options, position));
+}
+
+static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device,  // opens the bidirectional DDR buffer for slot `position`
+                                          const Validate::Options& options,
+                                          uint64_t position) {
+    if (options.placementExplicit) {  // explicit placement: raw buffer at the address computed for this slot
+        return device.openRawBuffer(addressFor(DDR_BASE, options, position),
+                                    options.bufferSize, vrtd::BufferAllocDir::Bidirectional,
+                                    vrtdMmChannel(options, position));
+    }
 
-    std::cout << "    Write: " << std::fixed << std::setprecision(2)
-              << (totalMB / writeSec) << " MB/s" << std::endl;
-    std::cout << "    Read:  " << std::fixed << std::setprecision(2)
-              << (totalMB / readSec) << " MB/s" << std::endl;
+    return device.openDdrBuffer(options.bufferSize, vrtd::BufferAllocDir::Bidirectional,  // here position only selects the MM channel
+                                vrtdMmChannel(options, position));
+}
+
+static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) {  // returns 0 on success, 1 if an integrity check fails
+    const unsigned N = options.threads;
+    const BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options);
+
+    if (!options.noReset) {  // a reset was requested, but this mode never performs one
+        std::cout << "Raw transfer mode skips reset; continuing without VRTD reset." << std::endl;
+    }
+    warnIfNotRoot("SLASH raw transfer mode");
+
+    const std::string qdmaPath = resolveQdmaDevicePath(bdf);
+    std::cout << "Using raw QDMA device " << qdmaPath << "..." << std::endl;
+    printChannelAllocation(options);
+    printMmChannel(options);
+    printRingSizeIndex(options);
+    printBandwidthRepeatMode(repeat);
+
+    RawQdmaDevice qdma(qdmaPath);
+    const uint32_t ringSizeIndex = options.ringSizeIndex.value_or(QDMA_RING_SZ_IDX);  // default ring size index when none was given
+
+    if (!options.ddrOnly) {
+        std::cout << "Testing HBM data integrity (" << N << " regions, raw QDMA)..." << std::endl;
+        {  // scope ends these buffers' lifetime before the bidirectional set is built
+            std::vector<RawTransferBuffer> hbmBuffers;
+            hbmBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i),
+                                        options.bufferSize,
+                                        slashMmChannel(options, i), ringSizeIndex);
+            }
+
+            if (!testDataIntegrity(hbmBuffers, "HBM")) {
+                std::cerr << "HBM data integrity check failed" << std::endl;
+                return 1;  // bandwidth is not measured on memory that failed integrity
+            }
+
+            testBandwidthSuite(hbmBuffers, "HBM", ", raw QDMA", repeat);
+        }
+        {
+            // Bidirectional HBM: positions interleave R/W across regions
+            // 0..2N-1.  Reads land on even regions, writes on odd regions.
+            std::vector<RawTransferBuffer> hbmWriteBuffers;
+            std::vector<RawTransferBuffer> hbmReadBuffers;
+            hbmWriteBuffers.reserve(N);
+            hbmReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                hbmReadBuffers.emplace_back(qdma.get(),
+                                            rawAddressFor(HBM_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            slashMmChannel(options, 2 * i), ringSizeIndex);
+                hbmWriteBuffers.emplace_back(qdma.get(),
+                                             rawAddressFor(HBM_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             slashMmChannel(options, 2 * i + 1), ringSizeIndex);
+            }
+
+            testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA", repeat);
+        }
+    }
+
+    if (!options.hbmOnly) {
+        std::cout << "Testing DDR data integrity (" << N << " buffers, raw QDMA)..." << std::endl;
+        {  // scope ends these buffers' lifetime before the bidirectional set is built
+            std::vector<RawTransferBuffer> ddrBuffers;
+            ddrBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i),
+                                        options.bufferSize,
+                                        slashMmChannel(options, i), ringSizeIndex);
+            }
+
+            if (!testDataIntegrity(ddrBuffers, "DDR")) {
+                std::cerr << "DDR data integrity check failed" << std::endl;
+                return 1;  // bandwidth is not measured on memory that failed integrity
+            }
+
+            testBandwidthSuite(ddrBuffers, "DDR", ", raw QDMA", repeat);
+        }
+        {
+            // Bidirectional DDR: positions interleave R/W across slot indices
+            // 0..2N-1 of the DDR address space.
+            std::vector<RawTransferBuffer> ddrWriteBuffers;
+            std::vector<RawTransferBuffer> ddrReadBuffers;
+            ddrWriteBuffers.reserve(N);
+            ddrReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                ddrReadBuffers.emplace_back(qdma.get(),
+                                            rawAddressFor(DDR_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            slashMmChannel(options, 2 * i), ringSizeIndex);
+                ddrWriteBuffers.emplace_back(qdma.get(),
+                                             rawAddressFor(DDR_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             slashMmChannel(options, 2 * i + 1), ringSizeIndex);
+            }
+
+            testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA", repeat);
+        }
+    }
+
+    if (!options.ddrOnly && !options.hbmOnly) {  // combined HBM+DDR runs only when neither memory is excluded
+        {
+            std::vector<RawTransferBuffer> parBuffers;
+            parBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {
+                parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i),
+                                        options.bufferSize,
+                                        slashMmChannel(options, i), ringSizeIndex);
+            }
+            for (unsigned i = 0; i < N; ++i) {
+                parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i),
+                                        options.bufferSize,
+                                        slashMmChannel(options, i), ringSizeIndex);
+            }
+
+            testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA", repeat);
+        }
+        {
+            // Bidirectional HBM+DDR: 4N positions total.  Positions 0..2N-1
+            // are HBM (interleaved R/W across regions 0..2N-1); positions
+            // 2N..4N-1 are DDR (interleaved R/W across DDR slots 0..2N-1).
+            // Channel = (p / 2) & 1 throughout.
+            std::vector<RawTransferBuffer> parWriteBuffers;
+            std::vector<RawTransferBuffer> parReadBuffers;
+            parWriteBuffers.reserve(2 * N);
+            parReadBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {
+                parReadBuffers.emplace_back(qdma.get(),
+                                            rawAddressFor(HBM_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            slashMmChannel(options, 2 * i), ringSizeIndex);
+                parWriteBuffers.emplace_back(qdma.get(),
+                                             rawAddressFor(HBM_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             slashMmChannel(options, 2 * i + 1), ringSizeIndex);
+            }
+            for (unsigned i = 0; i < N; ++i) {  // DDR half: the helpers receive the DDR-local index 0..2N-1, not 2N..4N-1
+                parReadBuffers.emplace_back(qdma.get(),
+                                            rawAddressFor(DDR_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            slashMmChannel(options, 2 * i), ringSizeIndex);
+                parWriteBuffers.emplace_back(qdma.get(),
+                                             rawAddressFor(DDR_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             slashMmChannel(options, 2 * i + 1), ringSizeIndex);
+            }
+
+            testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA", repeat);
+        }
+    }
+
+    return 0;
+}
+
+/// Raw integrity + bandwidth test driven over the off-the-shelf Xilinx QDMA
+/// driver instead of SLASH.  smi provisions queues itself (qmax + netlink
+/// add/start) and transfers over the per-queue char devices.
+static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& options) {  // returns 0 on success, 1 on integrity failure or unsupported build
+#ifndef SMI_ENABLE_QDMA_DRIVER_BACKEND
+    (void)bdf;
+    (void)options;
+    std::cerr << "validate: this v80-smi build was configured without "
+              << "--use-qdma-driver support. Rebuild with "
+              << "-DSMI_ENABLE_QDMA_DRIVER_BACKEND=ON." << std::endl;
+    return 1;
+#else
+    const unsigned N = options.threads;
+    const BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options);
+
+    if (!options.noReset) {  // a reset was requested, but this mode never performs one
+        std::cout << "QDMA-driver raw mode skips reset; continuing without VRTD reset." << std::endl;
+    }
+    warnIfNotRoot("QDMA-driver raw mode");
+
+    const bool runParallel = !options.ddrOnly && !options.hbmOnly;  // combined HBM+DDR runs only when neither memory is excluded
+
+    std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." << std::endl;
+    printChannelAllocation(options);
+    printMmChannel(options);
+    printRingSizeIndex(options);
+    printBandwidthRepeatMode(repeat);
+    smi::qdma_driver::QdmaDriverDevice qdma(bdf, options.ringSizeIndex);
+    std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl;
+    qdma.ensureQmax(runParallel ? 4 * N : 2 * N);  // largest index passed below is 4N-1 (combined run) or 2N-1
+
+    const unsigned mmChannels = qdma.mmChannelMax();
+    if (mmChannels > 1) {
+        std::cout << "Distributing queues across " << mmChannels
+                  << " MM channels (channel = qid % " << mmChannels << ")." << std::endl;
+    } else {
+        std::cout << "Device exposes a single MM channel; all queues on channel 0." << std::endl;
+    }
+
+    if (!options.ddrOnly) {
+        std::cout << "Testing HBM data integrity (" << N << " regions, QDMA driver)..." << std::endl;
+        {  // scope ends these buffers' lifetime before the bidirectional set is built
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> hbmBuffers;
+            hbmBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i),  // NOTE(review): 2nd argument is presumably the queue id — confirm
+                                        options.bufferSize,
+                                        qdmaDriverMmChannel(options, i));
+            }
+
+            if (!testDataIntegrity(hbmBuffers, "HBM")) {
+                std::cerr << "HBM data integrity check failed" << std::endl;
+                return 1;  // bandwidth is not measured on memory that failed integrity
+            }
+
+            testBandwidthSuite(hbmBuffers, "HBM", ", QDMA driver", repeat);
+        }
+        {
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> hbmWriteBuffers;
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> hbmReadBuffers;
+            hbmWriteBuffers.reserve(N);
+            hbmReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {  // reads take indices 0..N-1 on even positions, writes N..2N-1 on odd positions
+                hbmReadBuffers.emplace_back(qdma, i,
+                                            rawAddressFor(HBM_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            qdmaDriverMmChannel(options, 2 * i));
+                hbmWriteBuffers.emplace_back(qdma, N + i,
+                                             rawAddressFor(HBM_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             qdmaDriverMmChannel(options, 2 * i + 1));
+            }
+
+            testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver", repeat);
+        }
+    }
+
+    if (!options.hbmOnly) {
+        std::cout << "Testing DDR data integrity (" << N << " buffers, QDMA driver)..." << std::endl;
+        {  // scope ends these buffers' lifetime before the bidirectional set is built
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> ddrBuffers;
+            ddrBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i),
+                                        options.bufferSize,
+                                        qdmaDriverMmChannel(options, i));
+            }
+
+            if (!testDataIntegrity(ddrBuffers, "DDR")) {
+                std::cerr << "DDR data integrity check failed" << std::endl;
+                return 1;  // bandwidth is not measured on memory that failed integrity
+            }
+
+            testBandwidthSuite(ddrBuffers, "DDR", ", QDMA driver", repeat);
+        }
+        {
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> ddrWriteBuffers;
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> ddrReadBuffers;
+            ddrWriteBuffers.reserve(N);
+            ddrReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {  // reads take indices 0..N-1 on even positions, writes N..2N-1 on odd positions
+                ddrReadBuffers.emplace_back(qdma, i,
+                                            rawAddressFor(DDR_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            qdmaDriverMmChannel(options, 2 * i));
+                ddrWriteBuffers.emplace_back(qdma, N + i,
+                                             rawAddressFor(DDR_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             qdmaDriverMmChannel(options, 2 * i + 1));
+            }
+
+            testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver", repeat);
+        }
+    }
+
+    if (runParallel) {
+        {
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> parBuffers;
+            parBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {  // HBM half: indices 0..N-1
+                parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i),
+                                        options.bufferSize,
+                                        qdmaDriverMmChannel(options, i));
+            }
+            for (unsigned i = 0; i < N; ++i) {  // DDR half: indices N..2N-1
+                parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i),
+                                        options.bufferSize,
+                                        qdmaDriverMmChannel(options, i));
+            }
+
+            testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver", repeat);
+        }
+        {
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> parWriteBuffers;
+            std::vector<smi::qdma_driver::QdmaDriverBuffer> parReadBuffers;
+            parWriteBuffers.reserve(2 * N);
+            parReadBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {  // HBM: reads use indices 0..N-1, writes 2N..3N-1
+                parReadBuffers.emplace_back(qdma, i,
+                                            rawAddressFor(HBM_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            qdmaDriverMmChannel(options, 2 * i));
+                parWriteBuffers.emplace_back(qdma, 2 * N + i,
+                                             rawAddressFor(HBM_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             qdmaDriverMmChannel(options, 2 * i + 1));
+            }
+            for (unsigned i = 0; i < N; ++i) {  // DDR: reads use indices N..2N-1, writes 3N..4N-1
+                parReadBuffers.emplace_back(qdma, N + i,
+                                            rawAddressFor(DDR_BASE, options, 2 * i),
+                                            options.bufferSize,
+                                            qdmaDriverMmChannel(options, 2 * i));
+                parWriteBuffers.emplace_back(qdma, 3 * N + i,
+                                             rawAddressFor(DDR_BASE, options, 2 * i + 1),
+                                             options.bufferSize,
+                                             qdmaDriverMmChannel(options, 2 * i + 1));
+            }
+
+            testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver", repeat);
+        }
+    }
+
+    return 0;
+#endif
+}
+
+} // namespace
+
+uint64_t Validate::parseByteSizeOption(const std::string& text) {  // forwards unchanged to parseByteSizeText()
+    return parseByteSizeText(text);
+}
+
+/// Parse a comma-separated MM-channel list ("auto", "0", "1"; trimmed, case-insensitive).
+/// @throws std::invalid_argument for a blank spec or an unrecognized entry.
+std::vector<Validate::Options::MmChannel> Validate::parseMmChannelSpec(const std::string& text) {
+    if (trim(text).empty()) {  // must run first: the loop below always pushes or throws
+        throw std::invalid_argument("mm-channel spec must not be empty");
+    }
+    std::vector<Options::MmChannel> result;
+    for (size_t start = 0;;) {
+        const size_t comma = text.find(',', start);
+        std::string token = trim(comma == std::string::npos ? text.substr(start)
+                                                            : text.substr(start, comma - start));
+        std::transform(token.begin(), token.end(), token.begin(),
+                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+        if (token == "auto") {
+            result.push_back(Options::MmChannel::Auto);
+        } else if (token == "0") {
+            result.push_back(Options::MmChannel::Ch0);
+        } else if (token == "1") {
+            result.push_back(Options::MmChannel::Ch1);
+        } else {
+            throw std::invalid_argument("mm-channel entries must be auto, 0, or 1");
+        }
+        if (comma == std::string::npos) {
+            return result;  // last token consumed; result holds at least one entry
+        }
+        start = comma + 1;
+    }
 }
 
 int Validate::run(const Options& options) {
     std::string bdf = resolveBoardBdf(options.bdf, "validate");
     unsigned N = options.threads;
 
+    if (!validatePlacement(options)) {
+        return 1;
+    }
+
+    if (!checkHostMemoryBudget(options)) {
+        return 1;
+    }
+
+    if (options.rawTransferTest) {
+        return runRawTransferTest(bdf, options);
+    }
+
+    if (options.useQdmaDriver) {
+        return runQdmaDriverTest(bdf, options);
+    }
+
     // -- Step 1: (Optional) Reset the device via vrtd --
     if (!options.noReset) {
         std::cout << "Resetting device " << bdf << "..." << std::endl;
@@ -171,43 +1542,117 @@ int Validate::run(const Options& options) {
     vrtd::Session session;
     auto device = session.getDeviceByBdf(bdf);
 
+    printMmChannel(options);
+
     // -- Step 2: HBM — integrity then bandwidth --
-    std::cout << "Testing HBM data integrity (" << N << " regions)..." << std::endl;
-    {
-        std::vector<vrtd::Buffer> hbmBuffers;
-        hbmBuffers.reserve(N);
-        for (unsigned i = 0; i < N; ++i) {
-            hbmBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE));
-        }
+    if (!options.ddrOnly) {
+        std::cout << "Testing HBM data integrity (" << N << " regions)..." << std::endl;
+        {
+            std::vector<vrtd::Buffer> hbmBuffers;
+            hbmBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                hbmBuffers.push_back(openValidateHbmBuffer(device, options, i));
+            }
+
+            if (!testDataIntegrity(hbmBuffers, "HBM")) {
+                std::cerr << "HBM data integrity check failed" << std::endl;
+                return 1;
+            }
 
-        if (!testDataIntegrity(hbmBuffers, "HBM")) {
-            std::cerr << "HBM data integrity check failed" << std::endl;
-            return 1;
+            testBandwidthSuite(hbmBuffers, "HBM", "");
         }
+        // HBM buffers released.
+        {
+            std::vector<vrtd::Buffer> hbmWriteBuffers;
+            std::vector<vrtd::Buffer> hbmReadBuffers;
+            hbmWriteBuffers.reserve(N);
+            hbmReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                hbmReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i));
+                hbmWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1));
+            }
 
-        std::cout << "Testing HBM bandwidth (" << N << " threads)..." << std::endl;
-        testBandwidth(hbmBuffers);
+            testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", "");
+        }
+        // Bidirectional HBM buffers released.
     }
-    // HBM buffers released.
 
     // -- Step 3: DDR — integrity then bandwidth --
-    std::cout << "Testing DDR data integrity (" << N << " buffers)..." << std::endl;
-    {
-        std::vector<vrtd::Buffer> ddrBuffers;
-        ddrBuffers.reserve(N);
-        for (unsigned i = 0; i < N; ++i) {
-            ddrBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE));
+    if (!options.hbmOnly) {
+        std::cout << "Testing DDR data integrity (" << N << " buffers)..." << std::endl;
+        {
+            std::vector<vrtd::Buffer> ddrBuffers;
+            ddrBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                ddrBuffers.push_back(openValidateDdrBuffer(device, options, i));
+            }
+
+            if (!testDataIntegrity(ddrBuffers, "DDR")) {
+                std::cerr << "DDR data integrity check failed" << std::endl;
+                return 1;
+            }
+
+            testBandwidthSuite(ddrBuffers, "DDR", "");
+        }
+        // DDR buffers released.
+        {
+            std::vector<vrtd::Buffer> ddrWriteBuffers;
+            std::vector<vrtd::Buffer> ddrReadBuffers;
+            ddrWriteBuffers.reserve(N);
+            ddrReadBuffers.reserve(N);
+            for (unsigned i = 0; i < N; ++i) {
+                if (options.placementExplicit) {
+                    ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i));
+                    ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1));
+                } else {
+                    ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, i));
+                    ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, i));
+                }
+            }
+
+            testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", "");
         }
+        // Bidirectional DDR buffers released.
+    }
+
+    // -- Step 4: HBM + DDR in parallel --
+    if (!options.ddrOnly && !options.hbmOnly) {
+        {
+            std::vector<vrtd::Buffer> parBuffers;
+            parBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {
+                parBuffers.push_back(openValidateHbmBuffer(device, options, i));
+            }
+            for (unsigned i = 0; i < N; ++i) {
+                parBuffers.push_back(openValidateDdrBuffer(device, options, i));
+            }
 
-        if (!testDataIntegrity(ddrBuffers, "DDR")) {
-            std::cerr << "DDR data integrity check failed" << std::endl;
-            return 1;
+            testBandwidthSuite(parBuffers, "HBM+DDR", "");
         }
+        // Parallel single-direction buffers released.
+        {
+            std::vector<vrtd::Buffer> parWriteBuffers;
+            std::vector<vrtd::Buffer> parReadBuffers;
+            parWriteBuffers.reserve(2 * N);
+            parReadBuffers.reserve(2 * N);
+            for (unsigned i = 0; i < N; ++i) {
+                parReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i));
+                parWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1));
+            }
+            for (unsigned i = 0; i < N; ++i) {
+                if (options.placementExplicit) {
+                    parReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i));
+                    parWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1));
+                } else {
+                    parWriteBuffers.push_back(openValidateDdrBuffer(device, options, i));
+                    parReadBuffers.push_back(openValidateDdrBuffer(device, options, i));
+                }
+            }
 
-        std::cout << "Testing DDR bandwidth (" << N << " threads)..." << std::endl;
-        testBandwidth(ddrBuffers);
+            testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", "");
+        }
+        // Parallel bidirectional buffers released.
     }
-    // DDR buffers released.
 
     return 0;
 }
diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp
index 2e5d1f8e..ef15a174 100644
--- a/smi/src/validate.hpp
+++ b/smi/src/validate.hpp
@@ -24,11 +24,15 @@
 /// @file validate.hpp
 /// @brief Declaration of the Validate command.
 ///
-/// The Validate command resets a V80 board and then exercises DDR and HBM
-/// memory via PCIe by running data integrity checks followed by parallel
-/// bandwidth measurements.
+/// The Validate command optionally resets a V80 board and then exercises DDR
+/// and HBM memory via PCIe by running data integrity checks followed by
+/// parallel bandwidth measurements. Raw transfer modes skip reset and bypass
+/// the default VRTD buffer path.
 
+#include <cstdint>
+#include <optional>
 #include <string>
+#include <vector>
 
 /// @brief Static entry-point for the validate command.
 ///
@@ -39,15 +43,69 @@ class Validate {
 public:
     /// @brief Options parsed from the CLI for the validate command.
     struct Options {
+        /// @brief How raw-transfer buffers map QDMA MM/NoC channels onto memory.
+        ///
+        /// On CPM5 the host-side NoC ingress port (NMU) is selected per queue by
+        /// the SW-context mm-channel/host_id (SLASH uses qid&1), while the
+        /// memory-side NoC egress endpoint (NSU / pseudo-channel) is selected by
+        /// the device address.  Sustaining both NMUs requires also spreading
+        /// across two NSUs; otherwise both ports converge on one memory endpoint
+        /// and bandwidth caps at a single path.  This mirrors the off-the-shelf
+        /// dma-perf knobs offset_ch0/offset_ch1.
+        enum class ChannelAllocation {
+            Auto,    ///< Interleaved: driver picks mm-channel (qid&1), addresses linear. Default; current behaviour.
+            Paired,  ///< Couple mm-channel to a distinct memory region: even positions -> region 0, odd -> region 1.
+        };
+
+        /// @brief Per-queue AXI-MM/NoC channel selection for a buffer.
+        ///
+        /// Auto lets the driver stripe by qid&1; Ch0/Ch1 pin the queue to a
+        /// single AXI-MM channel (and hence NoC channel).  Applies to the VRTD,
+        /// raw SLASH, and off-the-shelf QDMA-driver backends.
+        enum class MmChannel {
+            Auto, ///< Driver stripes by qid&1 (default).
+            Ch0,  ///< Pin to AXI-MM/NoC channel 0.
+            Ch1,  ///< Pin to AXI-MM/NoC channel 1.
+        };
+
         std::string bdf;           ///< BDF (Bus:Device.Function) address of the target device.
         unsigned threads = 8;      ///< Number of parallel buffers/threads (1-64).
         bool noReset = false;      ///< Skip the device reset step before running memory tests.
+        bool ddrOnly = false;      ///< Skip HBM phase (mutually exclusive with hbmOnly).
+        bool hbmOnly = false;      ///< Skip DDR phase (mutually exclusive with ddrOnly).
+        bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers.
+        bool useQdmaDriver = false;   ///< Run the raw test over the off-the-shelf Xilinx QDMA driver.
+        /// Per-buffer AXI-MM channel selection, indexed by buffer position
+        /// modulo size (a single entry applies to every buffer). Default auto.
+        std::vector<MmChannel> mmChannels{MmChannel::Auto};
+        uint64_t bufferSize = 512ULL * 1024ULL * 1024ULL; ///< Size of each test buffer.
+        uint64_t offset = 512ULL * 1024ULL * 1024ULL; ///< Distance between logical buffer positions.
+        uint64_t startingOffset = 0; ///< Offset from memory-space base for position 0.
+        bool placementExplicit = false; ///< True when any placement option was provided.
+        /// Raw-transfer NoC channel/memory placement strategy (raw modes only).
+        ChannelAllocation channelAllocation = ChannelAllocation::Auto;
+        /// Paired-mode byte distance between the two per-channel memory regions
+        /// (the NSU / pseudo-channel stride). Default 16 GiB == MEMORY_SPACE_SIZE/2,
+        /// which matches the dma-perf HBM offset_ch1-offset_ch0 spacing.
+        uint64_t channelRegionStride = 16ULL * 1024ULL * 1024ULL * 1024ULL;
+        /// Number of whole-buffer transfers per buffer in raw bandwidth phases.
+        uint64_t bandwidthIterations = 1;
+        /// Raw bandwidth phase duration in seconds. 0 means use fixed iterations.
+        double bandwidthDuration = 0.0;
+        /// Optional descriptor-ring size index for raw QDMA queue creation.
+        std::optional<uint32_t> ringSizeIndex;
     };
 
     /// @brief Executes the validate command.
     /// @param options Populated options struct.
     /// @return Exit code (0 on success).
     static int run(const Options& options);
+
+    /// @brief Parse a byte-size option accepting bare values and k/K/m/M suffixes.
+    static uint64_t parseByteSizeOption(const std::string& text);
+
+    /// @brief Parse an --mm-channel spec: a single auto|0|1 or a comma-separated list.
+    static std::vector<Options::MmChannel> parseMmChannelSpec(const std::string& text);
 };
 
 #endif // SMI_VALIDATE_HPP
diff --git a/vrt/src/qdma/qdma_intf.cpp b/vrt/src/qdma/qdma_intf.cpp
index 780fb766..66454c60 100644
--- a/vrt/src/qdma/qdma_intf.cpp
+++ b/vrt/src/qdma/qdma_intf.cpp
@@ -20,6 +20,8 @@
 
 #include <vrt/qdma/qdma_intf.hpp>
 
+#include <cstring>
+
 #include <slash/qdma.h>
 #include <vrtd/device.hpp>
 
@@ -56,61 +58,41 @@ QdmaIntf::~QdmaIntf() {
     }
 }
 
+namespace {
+constexpr uint64_t kQdmaPage = 4096ULL;
+inline uint64_t roundUpToPage(uint64_t v) { return (v + kQdmaPage - 1) & ~(kQdmaPage - 1); }
+}  // namespace
+
 ssize_t QdmaIntf::write_from_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) {
     if (qpairFd < 0) {
         utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
                            "QDMA streaming not initialized");
         return -EIO;
     }
-    int fd = qpairFd;
-    ssize_t rc;
-    uint64_t count = 0;
-    char* buf = buffer;
-    off_t offset = base;
-
-    do { /* Support zero byte transfer */
-        uint64_t bytes = size - count;
-
-        if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE;
-
-        if (offset) {
-            rc = lseek(fd, offset, SEEK_SET);
-            if (rc < 0) {
-                utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                                   "Could not write to {}", fname);
-                return -EIO;
-            }
-            if (rc != offset) {
-                utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                                   "Could not write to {}", fname);
-                return -EIO;
-            }
-        }
-
-        /* write data to file from memory buffer */
-        rc = write(fd, buf, bytes);
-        if (rc < 0) {
-            utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}",
-                               fname);
-            return -EIO;
-        }
-        if (rc != bytes) {
-            utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}",
-                               fname);
-            return -EIO;
-        }
-
-        count += bytes;
-        buf += bytes;
-        offset += bytes;
-    } while (count < size);
-
-    if (count != size) {
+    if (size == 0) {
+        return 0;
+    }
+
+    // The kernel buffer owns its DMA-mapped pages; stage the caller's data into the mapping,
+    // then transfer whole pages: the device also receives the `aligned - size` padding bytes.
+    const uint64_t aligned = roundUpToPage(size);
+    struct slash_qdma_buffer buf{};
+    if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) {
+        utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
+                           "Could not create QDMA write buffer for {}", fname);
+        return -EIO;
+    }
+    std::memcpy(buf.addr, buffer, size);
+
+    ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned,
+                                           SLASH_QDMA_XFER_H2C);
+    (void)slash_qdma_buffer_destroy(&buf);
+    if (rc != (ssize_t)aligned) {
         utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}",
                            fname);
         return -EIO;
     }
-    return count;
+    return (ssize_t)size;
 }
 
 ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) {
@@ -119,55 +101,30 @@ ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size,
                            "QDMA streaming not initialized");
         return -EIO;
     }
-    int fd = qpairFd;
-    ssize_t rc;
-    uint64_t count = 0;
-    char* buf = buffer;
-    off_t offset = base;
-
-    do { /* Support zero byte transfer */
-        uint64_t bytes = size - count;
-
-        if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE;
-
-        if (offset) {
-            rc = lseek(fd, offset, SEEK_SET);
-            if (rc < 0) {
-                utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                                   "Could not read from {}", fname);
-                return -EIO;
-            }
-            if (rc != offset) {
-                utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                                   "Could not read from {}", fname);
-                return -EIO;
-            }
-        }
-
-        /* read data from file into memory buffer */
-        rc = read(fd, buf, bytes);
-        if (rc < 0) {
-            utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                               "Could not read from {}", fname);
-            return -EIO;
-        }
-        if (rc != bytes) {
-            utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
-                               "Could not read from {}", fname);
-            return -EIO;
-        }
-
-        count += bytes;
-        buf += bytes;
-        offset += bytes;
-    } while (count < size);
-
-    if (count != size) {
+    if (size == 0) {
+        return 0;
+    }
+
+    const uint64_t aligned = roundUpToPage(size);
+    struct slash_qdma_buffer buf{};
+    if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) {
+        utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__,
+                           "Could not create QDMA read buffer for {}", fname);
+        return -EIO;
+    }
+
+    ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned,
+                                           SLASH_QDMA_XFER_C2H);
+    if (rc == (ssize_t)aligned) {
+        std::memcpy(buffer, buf.addr, size);
+    }
+    (void)slash_qdma_buffer_destroy(&buf);
+    if (rc != (ssize_t)aligned) {
         utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not read from {}",
                            fname);
         return -EIO;
     }
-    return count;
+    return (ssize_t)size;
 }
 
 void QdmaIntf::write_buff(char* buffer, uint64_t start_addr, uint64_t size) {
diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h
index 4749afd4..a543b815 100644
--- a/vrt/vrtd/include/vrtd/wire.h
+++ b/vrt/vrtd/include/vrtd/wire.h
@@ -276,8 +276,9 @@ struct vrtd_resp_qdma_qpair_op {
 /**
  * @brief Request a read/write file descriptor for a QDMA qpair.
  *
- * The qpair FD is sent out-of-band via SCM_RIGHTS when
- * @ref vrtd_resp_header::ret == VRTD_RET_OK.
+ * One or more qpair FDs are sent out-of-band via SCM_RIGHTS when
+ * @ref vrtd_resp_header::ret == VRTD_RET_OK.  The response body reports the
+ * number of descriptors attached.
  */
 struct vrtd_req_qdma_qpair_get_fd {
     uint32_t dev_number; ///< Device index (0-based).
@@ -299,6 +300,7 @@ struct vrtd_req_buffer_open {
     uint32_t dev_number; ///< Device index (0-based).
     uint32_t alloc_type; ///< One of enum vrtd_alloc_type.
     uint32_t alloc_dir;  ///< One of enum vrtd_alloc_dir.
+    uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel).
     uint64_t alloc_arg;  ///< Allocation argument (HBM region index for HBM).
     uint64_t size;       ///< Requested size in bytes.
 } __attribute__((packed));
@@ -306,6 +308,15 @@ struct vrtd_req_buffer_open {
 struct vrtd_resp_buffer_open {
     uint64_t size; ///< Allocated size in bytes (rounded up to subregion).
     uint64_t phys_addr; ///< Device physical address of the allocation.
+    /**
+     * Number of QDMA queue pairs (AXI-MM/NoC channels) owned by the single
+     * transfer FD sent via SCM_RIGHTS (1 or 2).  When two qpairs are bound
+     * (an mm_channel == AUTO request), their qpair_index ordering is fixed:
+     * index 0 is pinned to channel 0 and index 1 to channel 1, so the client
+     * can apply the V80 placement policy deterministically.  Exactly one FD is
+     * always sent regardless of this count.
+     */
+    uint32_t qpair_count;
 } __attribute__((packed));
 
 /**
@@ -327,18 +338,25 @@ struct vrtd_resp_buffer_close {
  * Bypasses the allocator entirely — the caller is responsible for ensuring the
  * address is valid and not in use.  Requires the @c raw-mem-access permission.
  *
- * The qpair FD is sent out-of-band via SCM_RIGHTS when
- * @ref vrtd_resp_header::ret == VRTD_RET_OK.
+ * A single transfer FD is sent out-of-band via SCM_RIGHTS when
+ * @ref vrtd_resp_header::ret == VRTD_RET_OK.  The response body reports how
+ * many queue pairs (channels) that FD owns.
  */
 struct vrtd_req_buffer_open_raw {
     uint32_t dev_number; ///< Device index (0-based).
     uint32_t alloc_dir;  ///< One of enum vrtd_alloc_dir.
+    uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel).
     uint64_t phys_addr;  ///< Caller-specified device physical address (bypasses allocator).
     uint64_t size;       ///< Size in bytes.
 } __attribute__((packed));
 
 struct vrtd_resp_buffer_open_raw {
-    uint8_t zero; ///< Placeholder; all data is carried via SCM_RIGHTS.
+    /**
+     * Number of QDMA queue pairs (channels) owned by the single transfer FD
+     * sent via SCM_RIGHTS (1 or 2).  Same qpair_index-to-channel ordering as
+     * @ref vrtd_resp_buffer_open.
+     */
+    uint32_t qpair_count;
 } __attribute__((packed));
 
 /**
diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h
index 76cf2541..0e6ce0ec 100644
--- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h
+++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h
@@ -54,6 +54,18 @@ extern "C" {
 
 struct vrtd_buffer;
 
+/**
+ * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair.
+ *
+ * Sent to the daemon, which forwards it to the SLASH driver's qpair-add ioctl
+ * (the values mirror enum slash_qdma_mm_channel).
+ */
+enum vrtd_mm_channel {
+    VRTD_MM_CHANNEL_AUTO = 0, ///< Stripe across channels by (qid & 1).
+    VRTD_MM_CHANNEL_0    = 1, ///< Pin to AXI-MM/NoC channel 0.
+    VRTD_MM_CHANNEL_1    = 2, ///< Pin to AXI-MM/NoC channel 1.
+};
+
 
 /**
  * @brief Connect to the vrtd UNIX domain socket.
@@ -291,9 +303,10 @@ enum vrtd_ret vrtd_qdma_qpair_del(
 );
 
 /**
- * @brief Obtain a read/write file descriptor for a QDMA qpair.
+ * @brief Obtain an ioctl-only file descriptor for a QDMA qpair.
  *
- * The descriptor can be used with read()/write() for C2H/H2C data transfer.
+ * The descriptor can be used with registered-buffer transfer ioctls for
+ * C2H/H2C data transfer.
  *
  * @param fd        Connected vrtd socket file descriptor.
  * @param dev       Device index (0‑based).
@@ -325,6 +338,7 @@ enum vrtd_ret vrtd_qdma_qpair_get_fd(
  * @param alloc_dir  QDMA direction (one of enum vrtd_alloc_dir).
  * @param alloc_arg  Allocation argument (HBM region index for HBM).
  * @param size_in     Requested size in bytes.
+ * @param mm_channel  AXI-MM/NoC channel selection (one of enum vrtd_mm_channel).
  * @param buffer_out  Output pointer to receive the allocated buffer handle.
  *
  * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code.
@@ -338,6 +352,7 @@ enum vrtd_ret vrtd_buffer_open(
     uint32_t alloc_dir,
     uint64_t alloc_arg,
     uint64_t size_in,
+    enum vrtd_mm_channel mm_channel,
     struct vrtd_buffer **buffer_out
 );
 
@@ -352,6 +367,7 @@ enum vrtd_ret vrtd_buffer_open(
  * @param phys_addr   Caller-specified device physical address.
  * @param size        Size in bytes.
  * @param alloc_dir   One of #vrtd_alloc_dir.
+ * @param mm_channel  AXI-MM/NoC channel selection (one of enum vrtd_mm_channel).
  * @param buffer_out  Output parameter set to the new buffer handle on success.
  *
  * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code.
@@ -364,6 +380,7 @@ enum vrtd_ret vrtd_buffer_open_raw(
     uint64_t phys_addr,
     uint64_t size,
     uint32_t alloc_dir,
+    enum vrtd_mm_channel mm_channel,
     struct vrtd_buffer **buffer_out
 );
 
@@ -514,8 +531,17 @@ struct vrtd_buffer {
 
     uint64_t size;
     uint64_t phys_addr;
+    /* Single transfer fd that owns @qpair_count queue pairs (channels). */
     int qpair_fd;
+    /* Number of queue pairs (channels) the fd owns; selects 1- or 2-way split. */
+    uint32_t qpair_count;
+    /* Kernel-owned DMA buffer fd backing @buf (from slash_qdma_qpair_buffer_create). */
+    int buffer_fd;
+    enum slash_qdma_transfer_hint transfer_hint;
+    /* CPU mapping of the kernel buffer (mmap of @buffer_fd). */
     void *buf;
+    /* Internal DMA granule for the host mapping (4 KiB base pages). */
+    uint64_t transfer_step_size;
 };
 
 enum vrtd_ret vrtd_buffer_create_raw(
@@ -527,9 +553,25 @@ enum vrtd_ret vrtd_buffer_create_raw(
     uint64_t size,
     uint64_t phys_addr,
     int qpair_fd,
+    uint32_t qpair_count,
     struct vrtd_buffer **buffer_out
 );
 
+/**
+ * @brief Synchronize a range from the local host buffer to the device.
+ *
+ * The requested range may be smaller than the QDMA transfer granule. libvrtd
+ * handles any required internal alignment. Bidirectional buffers preserve
+ * device bytes outside the requested range with an internal read-modify-write;
+ * host-to-device-only buffers keep the historical behavior of expanding the
+ * transfer to the backing DMA granule.
+ */
+enum vrtd_ret vrtd_buffer_sync_to_device(
+    struct vrtd_buffer *buffer,
+    uint64_t offset,
+    uint64_t size
+);
+
 /**
  * @brief Destroy a local buffer handle.
  *
@@ -540,12 +582,13 @@ enum vrtd_ret vrtd_buffer_destroy(
     struct vrtd_buffer *buffer
 );
 
-enum vrtd_ret vrtd_buffer_sync_to_device(
-    struct vrtd_buffer *buffer,
-    uint64_t offset,
-    uint64_t size
-);
-
+/**
+ * @brief Synchronize a range from the device into the local host buffer.
+ *
+ * The requested range may be smaller than the QDMA transfer granule. libvrtd
+ * handles any required internal alignment and preserves bytes outside the
+ * requested host range.
+ */
 enum vrtd_ret vrtd_buffer_sync_from_device(
     struct vrtd_buffer *buffer,
     uint64_t offset,
diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c
index b810de2c..87573074 100644
--- a/vrt/vrtd/libvrtd/src/buffer.c
+++ b/vrt/vrtd/libvrtd/src/buffer.c
@@ -24,14 +24,14 @@
  * DMA buffer lifecycle management for the vrtd C client library.
  *
  * Buffers are host-side memory regions used for DMA transfers to/from
- * the FPGA.  Each buffer is backed by an anonymous mmap (preferring
- * 2 MB hugepages for TLB efficiency, with automatic fallback to
- * regular pages) and associated with a QDMA queue pair fd for
- * performing the actual H2C / C2H transfers.
+ * the FPGA.  Each buffer is backed by a kernel-owned DMA buffer of 4 KiB base
+ * pages (no hugepages), mmap()ed from the fd that slash_qdma_qpair_buffer_create()
+ * returns, and associated with a QDMA queue pair fd for H2C / C2H transfers.
  *
- * Sync operations (sync_to_device / sync_from_device) transfer data
- * between the host buffer and FPGA memory in TRANSFER_STEP_SIZE (4 KB)
- * chunks using positional I/O on the QDMA qpair fd.
+ * Sync operations (sync_to_device / sync_from_device) accept arbitrary
+ * in-buffer ranges. Internally, the QDMA fd requires page-aligned transfer
+ * ranges, so libvrtd expands partial requests to the mapping granule and uses
+ * a staging buffer when needed to preserve host-side partial-range semantics.
  *
  * Buffer lifecycle:
  *   1. vrtd_buffer_open()          -- daemon allocates, returns qpair fd
@@ -44,25 +44,179 @@
 
 #include <vrtd/vrtd.h>
 
+#include <slash/qdma.h>
+
+#include "v80_policy.h"
+
 #include <assert.h>
 #include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
 #include <sys/mman.h>
+#include <syslog.h>
+#include <time.h>
 #include <unistd.h>
 
 
 #include <stdio.h>
 
-#ifndef MAP_HUGE_SHIFT
-#define MAP_HUGE_SHIFT 26
+#define BASE_TRANSFER_STEP_SIZE (4ULL * 1024ULL)              // 4K
+
+/*
+ * Per-sync timing instrumentation.
+ *
+ * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with
+ * -DSLASH_QDMA_TIMING=1), the sync_to/from_device paths log the wall-clock
+ * cost of each transfer ioctl plus the aggregate per-sync time and
+ * effective bandwidth.  This is the userspace counterpart to the kernel's
+ * SLASH_QDMA_TIMING breakdown.
+ */
+#ifndef SLASH_QDMA_TIMING
+#define SLASH_QDMA_TIMING 0
 #endif
 
-#ifndef MAP_HUGE_2MB
-#define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT)
+#if SLASH_QDMA_TIMING
+static inline uint64_t vrtd_now_ns(void) {
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
+}
 #endif
 
-#define TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K
+/*
+ * Issue a buffer transfer of [offset, offset + size) as a single batched ioctl
+ * per round, fanning the range across the fd's queue pairs (channels) according
+ * to the placement policy so both NoC channels run concurrently in-kernel.
+ *
+ * The QDMA transfer descriptor's length is a 32-bit byte count, so each
+ * segment is chunked to stay within that limit while preserving step alignment;
+ * every chunk round issues one ioctl covering all active channels.  Returns 0
+ * on success, -EINVAL on bad arguments or an unusable plan, -EIO on ioctl failure.
+ */
+static int vrtd_transfer_registered(
+    int qpair_fd,
+    uint32_t qpair_count,
+    enum slash_qdma_transfer_hint transfer_hint,
+    int buf_fd,
+    uint64_t phys_addr,
+    uint64_t offset,
+    uint64_t size,
+    uint64_t step,
+    bool to_device
+) {
+    uint32_t direction = to_device ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H;
+
+    if (size == 0) {
+        return 0;
+    }
+
+    if (qpair_fd < 0 || qpair_count == 0) {
+        return -EINVAL;
+    }
+
+    if (step == 0 || (offset % step) != 0 || (size % step) != 0) {
+        return -EINVAL;
+    }
+
+    /*
+     * Decide how the transfer maps onto the available queue pairs.  V80 applies
+     * the placement-aware policy (DDR halved, HBM routed by the half-memory
+     * boundary); any other hint keeps everything on the primary qpair.
+     */
+    struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint32_t nseg;
+
+    if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_V80) {
+        nseg = vrtd_plan_v80(phys_addr, offset, size, step, qpair_count, segs);
+    } else {
+        segs[0].qpair_index = 0;
+        segs[0].offset = offset;
+        segs[0].size = size;
+        nseg = 1;
+    }
+    if (nseg == 0 || nseg > VRTD_V80_MAX_SEGS) {
+        return -EINVAL; /* a non-empty range must plan to 1..VRTD_V80_MAX_SEGS segments */
+    }
+
+    /* Clamp any planned qpair_index to the qpairs the fd actually owns. */
+    for (uint32_t i = 0; i < nseg; ++i) {
+        if (segs[i].qpair_index >= qpair_count) {
+            segs[i].qpair_index = 0;
+        }
+    }
+
+    /* Per-channel descriptor length is 32-bit; keep chunks step-aligned. */
+    uint64_t max_chunk = 0xFFFFF000ULL;
+    max_chunk -= max_chunk % step;
+    if (max_chunk == 0) {
+        return -EINVAL;
+    }
+
+    uint64_t done[VRTD_V80_MAX_SEGS] = {0};
+    for (;;) {
+        struct slash_qdma_subxfer xfers[VRTD_V80_MAX_SEGS];
+        uint32_t map_seg[VRTD_V80_MAX_SEGS];
+        uint32_t count = 0;
+
+        for (uint32_t i = 0; i < nseg; ++i) {
+            uint64_t remaining = segs[i].size - done[i];
+            if (remaining == 0) {
+                continue;
+            }
+            uint64_t chunk = remaining > max_chunk ? max_chunk : remaining;
+            uint64_t xfer_offset = segs[i].offset + done[i];
+            memset(&xfers[count], 0, sizeof(xfers[count]));
+            xfers[count].qpair_index = segs[i].qpair_index;
+            xfers[count].direction = direction;
+            xfers[count].buf_fd = buf_fd;
+            xfers[count].buf_offset = xfer_offset;
+            xfers[count].dev_addr = phys_addr + xfer_offset;
+            xfers[count].length = chunk;
+            map_seg[count] = i;
+            count++;
+        }
+
+        if (count == 0) {
+            break;
+        }
+
+        ssize_t ret = slash_qdma_qpair_transfer_batch(qpair_fd, xfers, count);
+        if (ret < 0) {
+            return -EIO;
+        }
+
+        for (uint32_t c = 0; c < count; ++c) { /* NOTE(review): assumes no short transfers */
+            done[map_seg[c]] += xfers[c].length;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Move the first @size bytes of the staging buffer @bounce to or from device
+ * address @phys_addr over @buffer's queue pairs (partial-range RMW staging).
+ */
+static int vrtd_bounce_transfer(
+    const struct vrtd_buffer *buffer,
+    const struct slash_qdma_buffer *bounce,
+    uint64_t phys_addr,
+    uint64_t size,
+    bool to_device
+) {
+    const bool usable = buffer != NULL && bounce != NULL &&
+                        buffer->qpair_count != 0 && buffer->qpair_fd >= 0;
+
+    if (!usable) {
+        return -EINVAL;
+    }
+    return vrtd_transfer_registered(buffer->qpair_fd, buffer->qpair_count,
+                                    buffer->transfer_hint, bounce->fd, phys_addr,
+                                    0, size, BASE_TRANSFER_STEP_SIZE, to_device);
+}
 
 enum vrtd_ret vrtd_buffer_create_raw(
     int sock_fd,
@@ -73,6 +227,7 @@ enum vrtd_ret vrtd_buffer_create_raw(
     uint64_t size,
     uint64_t phys_addr,
     int qpair_fd,
+    uint32_t qpair_count,
     struct vrtd_buffer **buffer_out
 ) {
     if (buffer_out == NULL) {
@@ -84,31 +239,44 @@ enum vrtd_ret vrtd_buffer_create_raw(
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    buffer->buf = mmap(
-        NULL, /* address (let the kernel choose) */
-        size,
-        PROT_READ | PROT_WRITE,
-        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE,
-        -1, /* fd */
-        0   /* offset */
-    );
-    if (buffer->buf == MAP_FAILED) {
-        // Huge pages are an optimization, not a hard requirement.
-        // Fall back to normal anonymous mapping when hugepage mmap fails.
-        buffer->buf = mmap(
-            NULL, /* address (let the kernel choose) */
-            size,
-            PROT_READ | PROT_WRITE,
-            MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
-            -1, /* fd */
-            0   /* offset */
-        );
-        if (buffer->buf == MAP_FAILED) {
-            free(buffer);
-            return VRTD_RET_INTERNAL_ERROR;
-        }
+    buffer->buf = NULL;
+    buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE;
+    buffer->qpair_fd = -1;
+    buffer->qpair_count = 0;
+    buffer->buffer_fd = -1;
+    buffer->transfer_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR;
+
+    if (qpair_fd < 0 || qpair_count == 0 || qpair_count > 2) {
+        free(buffer);
+        return VRTD_RET_BAD_LIB_CALL;
     }
 
+    /*
+     * The kernel owns the DMA buffer: it allocates 4 KiB base pages, builds the
+     * SGL, and DMA-maps everything once at create time, then hands back a
+     * mappable fd.  We mmap that fd for CPU access (buffer->buf).
+     */
+    struct slash_qdma_buffer sbuf;
+    memset(&sbuf, 0, sizeof(sbuf));
+    if (slash_qdma_qpair_buffer_create(qpair_fd, size, &sbuf) != 0) {
+        free(buffer);
+        return VRTD_RET_INTERNAL_ERROR;
+    }
+
+    buffer->buf           = sbuf.addr;
+    buffer->buffer_fd     = sbuf.fd;
+    buffer->transfer_hint = sbuf.transfer_hint;
+    buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE;
+#if SLASH_QDMA_TIMING
+    syslog(
+        LOG_INFO,
+        "libvrtd: buffer kernel mapping size=%llu phys_addr=0x%llx step=%llu",
+        (unsigned long long)size,
+        (unsigned long long)phys_addr,
+        (unsigned long long)buffer->transfer_step_size
+    );
+#endif
+
     buffer->sock_fd    = sock_fd;
     buffer->dev        = dev;
     buffer->alloc_type = alloc_type;
@@ -116,13 +284,69 @@ enum vrtd_ret vrtd_buffer_create_raw(
     buffer->alloc_arg  = alloc_arg;
     buffer->size       = size;
     buffer->phys_addr  = phys_addr;
-    buffer->qpair_fd   = qpair_fd;
+    buffer->qpair_fd    = qpair_fd;
+    buffer->qpair_count = qpair_count;
 
     *buffer_out = buffer;
 
     return VRTD_RET_OK;
 }
 
+/*
+ * Validate [offset, offset + size) against @buffer and widen it outward to
+ * transfer_step_size boundaries.  *needs_bounce_out is set when widening moved
+ * either edge.  An empty range succeeds with an aligned size of 0.
+ */
+static enum vrtd_ret vrtd_buffer_prepare_sync_range(
+    const struct vrtd_buffer *buffer,
+    uint64_t offset,
+    uint64_t size,
+    uint64_t *aligned_offset_out,
+    uint64_t *aligned_size_out,
+    bool *needs_bounce_out
+) {
+    if (buffer == NULL || aligned_offset_out == NULL ||
+        aligned_size_out == NULL || needs_bounce_out == NULL) {
+        return VRTD_RET_BAD_LIB_CALL;
+    }
+
+    const uint64_t granule = buffer->transfer_step_size;
+    if (granule == 0) {
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
+    /* Subtracting first keeps offset + size from wrapping before the check. */
+    if (offset > buffer->size || size > buffer->size - offset) {
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
+    if (size == 0) {
+        *aligned_offset_out = offset;
+        *aligned_size_out = 0;
+        *needs_bounce_out = false;
+        return VRTD_RET_OK;
+    }
+
+    if ((buffer->size % granule) != 0 || (buffer->phys_addr % granule) != 0) {
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
+    const uint64_t range_end = offset + size;
+    const uint64_t lo = offset - (offset % granule);
+    if (range_end > UINT64_MAX - (granule - 1)) {
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+    const uint64_t hi = ((range_end + granule - 1) / granule) * granule;
+    if (hi > buffer->size) {
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
+    *aligned_offset_out = lo;
+    *aligned_size_out = hi - lo;
+    *needs_bounce_out = (lo != offset || hi != range_end);
+    return VRTD_RET_OK;
+}
+
 enum vrtd_ret vrtd_buffer_destroy(
     struct vrtd_buffer *buffer
 ) {
@@ -130,12 +354,19 @@ enum vrtd_ret vrtd_buffer_destroy(
         return VRTD_RET_BAD_LIB_CALL;
     }
 
-    if (buffer->qpair_fd >= 0) {
-        (void) close(buffer->qpair_fd);
+    if (buffer->buf != NULL && buffer->size != 0) {
+        (void) munmap(buffer->buf, buffer->size);
+        buffer->buf = NULL;
     }
 
-    if (buffer->buf != NULL) {
-        (void) munmap(buffer->buf, buffer->size);
+    if (buffer->buffer_fd >= 0) {
+        (void) close(buffer->buffer_fd);
+        buffer->buffer_fd = -1;
+    }
+
+    if (buffer->qpair_fd >= 0) {
+        (void) close(buffer->qpair_fd);
+        buffer->qpair_fd = -1;
     }
 
     free(buffer);
@@ -189,31 +420,77 @@ enum vrtd_ret vrtd_buffer_sync_to_device(
         return VRTD_RET_INVALID_ARGUMENT;
     }
 
+    assert(buffer->qpair_count > 0);
     assert(buffer->qpair_fd >= 0);
     assert(buffer->buf != NULL);
-    assert(buffer->size % TRANSFER_STEP_SIZE == 0);
-    assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0);
+    uint64_t aligned_offset = 0;
+    uint64_t aligned_size = 0;
+    bool needs_bounce = false;
+    enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range(
+        buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce);
+    if (range_ret != VRTD_RET_OK) {
+        return range_ret;
+    }
+    if (aligned_size == 0) {
+        return VRTD_RET_OK;
+    }
 
-    uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE);
-    uint64_t end_offset = offset + size;
+    uint64_t step = buffer->transfer_step_size;
+#if SLASH_QDMA_TIMING
+    uint64_t sync_start_ns = vrtd_now_ns();
+#endif
 
-    off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET);
-    if (ret == -1) {
+    int transfer_ret;
+    if (needs_bounce && buffer->alloc_dir == VRTD_ALLOC_DIR_BIDIRECTIONAL) {
+        struct slash_qdma_buffer bounce;
+        memset(&bounce, 0, sizeof(bounce));
+        if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size,
+                                           &bounce) != 0) {
+            return VRTD_RET_INTERNAL_ERROR;
+        }
+
+        transfer_ret = vrtd_bounce_transfer(
+            buffer, &bounce, buffer->phys_addr + aligned_offset,
+            aligned_size, false);
+        if (transfer_ret == 0) {
+            memcpy(
+                (uint8_t *)bounce.addr + (offset - aligned_offset),
+                (uint8_t *)buffer->buf + offset,
+                size
+            );
+            transfer_ret = vrtd_bounce_transfer(
+                buffer, &bounce, buffer->phys_addr + aligned_offset,
+                aligned_size, true);
+        }
+        (void) slash_qdma_buffer_destroy(&bounce);
+    } else {
+        /*
+         * Host-to-device-only buffers cannot read the surrounding device
+         * granule for a read-modify-write, so keep the historical behavior:
+         * expand partial syncs to the backing DMA granule.
+         */
+        transfer_ret = vrtd_transfer_registered(
+            buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint,
+            buffer->buffer_fd, buffer->phys_addr,
+            aligned_offset, aligned_size, step, true);
+    }
+    if (transfer_ret != 0) {
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) {
-        ssize_t bytes_written = 0;
-        while (bytes_written < TRANSFER_STEP_SIZE) {
-            ssize_t bw = write(buffer->qpair_fd,
-                               (uint8_t *) buffer->buf + curr_offset + bytes_written,
-                               TRANSFER_STEP_SIZE - bytes_written);
-            if (bw == -1) {
-                return VRTD_RET_INTERNAL_ERROR;
-            }
-            bytes_written += bw;
-        }
+#if SLASH_QDMA_TIMING
+    {
+        uint64_t total_ns = vrtd_now_ns() - sync_start_ns;
+        double mb = (double) size / (1024.0 * 1024.0);
+        double sec = (double) total_ns / 1e9;
+        syslog(LOG_INFO,
+               "libvrtd: timing H2C sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)",
+               (unsigned long long) offset, (unsigned long long) size,
+               (unsigned long long) aligned_offset, (unsigned long long) aligned_size,
+               (unsigned long long) step, (unsigned long long) total_ns,
+               sec > 0.0 ? mb / sec : 0.0);
     }
+#endif
 
     return VRTD_RET_OK;
 }
@@ -231,31 +508,69 @@ enum vrtd_ret vrtd_buffer_sync_from_device(
         return VRTD_RET_INVALID_ARGUMENT;
     }
 
+    assert(buffer->qpair_count > 0);
     assert(buffer->qpair_fd >= 0);
     assert(buffer->buf != NULL);
-    assert(buffer->size % TRANSFER_STEP_SIZE == 0);
-    assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0);
+    uint64_t aligned_offset = 0;
+    uint64_t aligned_size = 0;
+    bool needs_bounce = false;
+    enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range(
+        buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce);
+    if (range_ret != VRTD_RET_OK) {
+        return range_ret;
+    }
+    if (aligned_size == 0) {
+        return VRTD_RET_OK;
+    }
+
+    uint64_t step = buffer->transfer_step_size;
+#if SLASH_QDMA_TIMING
+    uint64_t sync_start_ns = vrtd_now_ns();
+#endif
 
-    uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE);
-    uint64_t end_offset = offset + size;
+    int transfer_ret;
+    if (needs_bounce) {
+        struct slash_qdma_buffer bounce;
+        memset(&bounce, 0, sizeof(bounce));
+        if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size,
+                                           &bounce) != 0) {
+            return VRTD_RET_INTERNAL_ERROR;
+        }
 
-    off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET);
-    if (ret == -1) {
+        transfer_ret = vrtd_bounce_transfer(
+            buffer, &bounce, buffer->phys_addr + aligned_offset,
+            aligned_size, false);
+        if (transfer_ret == 0) {
+            memcpy(
+                (uint8_t *)buffer->buf + offset,
+                (uint8_t *)bounce.addr + (offset - aligned_offset),
+                size
+            );
+        }
+        (void) slash_qdma_buffer_destroy(&bounce);
+    } else {
+        transfer_ret = vrtd_transfer_registered(
+            buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint,
+            buffer->buffer_fd, buffer->phys_addr,
+            aligned_offset, aligned_size, step, false);
+    }
+    if (transfer_ret != 0) {
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) {
-        ssize_t bytes_read = 0;
-        while (bytes_read < TRANSFER_STEP_SIZE) {
-            ssize_t br = read(buffer->qpair_fd,
-                              (uint8_t *) buffer->buf + curr_offset + bytes_read,
-                              TRANSFER_STEP_SIZE - bytes_read);
-            if (br == -1) {
-                return VRTD_RET_INTERNAL_ERROR;
-            }
-            bytes_read += br;
-        }
+#if SLASH_QDMA_TIMING
+    {
+        uint64_t total_ns = vrtd_now_ns() - sync_start_ns;
+        double mb = (double) size / (1024.0 * 1024.0);
+        double sec = (double) total_ns / 1e9;
+        syslog(LOG_INFO,
+               "libvrtd: timing C2H sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)",
+               (unsigned long long) offset, (unsigned long long) size,
+               (unsigned long long) aligned_offset, (unsigned long long) aligned_size,
+               (unsigned long long) step, (unsigned long long) total_ns,
+               sec > 0.0 ? mb / sec : 0.0);
     }
+#endif
 
     return VRTD_RET_OK;
 }
diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c
index b03c863a..d56c2a47 100644
--- a/vrt/vrtd/libvrtd/src/requests.c
+++ b/vrt/vrtd/libvrtd/src/requests.c
@@ -56,13 +56,13 @@
 #include <vrtd/vrtd.h>
 
 /**
- * vrtd_recv_response() - Receive a response message from the daemon.
+ * vrtd_recv_response_fds() - Receive a response message from the daemon.
  * @fd:            Connection socket.
  * @resp_body_buf: Buffer for the response body (may be NULL if no body expected).
  * @resp_bufsz:    Size of @resp_body_buf.
- * @resp_fd:       If non-NULL, receives an out-of-band file descriptor
- *                 sent by the daemon via SCM_RIGHTS (e.g. a BAR fd or
- *                 QDMA qpair fd).  Set to -1 if no fd was received.
+ * @resp_fds:      Optional array receiving out-of-band file descriptors.
+ * @max_resp_fds:  Capacity of @resp_fds.
+ * @resp_fd_count: Optional output count of received fds.
  *
  * Uses recvmsg() with scatter-gather I/O: the header and body are read
  * into separate buffers in a single system call.  MSG_CMSG_CLOEXEC
@@ -70,11 +70,13 @@
  *
  * Return: VRTD_RET_OK on success, or an error code.
  */
-static enum vrtd_ret vrtd_recv_response(
+static enum vrtd_ret vrtd_recv_response_fds(
     int fd,
     void *resp_body_buf,
     size_t resp_bufsz,
-    int *resp_fd
+    int *resp_fds,
+    uint32_t max_resp_fds,
+    uint32_t *resp_fd_count
 )
 {
     struct vrtd_resp_header rh = {0};
@@ -85,16 +87,21 @@ static enum vrtd_ret vrtd_recv_response(
     riov[1].iov_base = resp_body_buf;
     riov[1].iov_len  = resp_bufsz;
 
-    char cbuf[CMSG_SPACE(sizeof(int))];
+    char cbuf[CMSG_SPACE(2 * sizeof(int))];
     struct msghdr rmsg = {
         .msg_iov        = riov,
         .msg_iovlen     = resp_bufsz ? 2 : 1,
-        .msg_control    = resp_fd ? cbuf : NULL,
-        .msg_controllen = resp_fd ? sizeof(cbuf) : 0,
+        .msg_control    = resp_fds ? cbuf : NULL,
+        .msg_controllen = resp_fds ? sizeof(cbuf) : 0,
     };
 
-    if (resp_fd) {
-        *resp_fd = -1;
+    if (resp_fd_count) {
+        *resp_fd_count = 0;
+    }
+    if (resp_fds) {
+        for (uint32_t i = 0; i < max_resp_fds; ++i) {
+            resp_fds[i] = -1;
+        }
     }
 
     ssize_t rn = recvmsg(fd, &rmsg, MSG_CMSG_CLOEXEC);
@@ -118,11 +125,19 @@ static enum vrtd_ret vrtd_recv_response(
         return VRTD_RET_BAD_CONN;
     }
 
-    /* Extract file descriptor from SCM_RIGHTS ancillary data, if any. */
+    /* Extract file descriptors from SCM_RIGHTS ancillary data, if any. */
     for (struct cmsghdr *c = CMSG_FIRSTHDR(&rmsg); c != NULL; c = CMSG_NXTHDR(&rmsg, c)) {
         if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS && c->cmsg_len >= CMSG_LEN(sizeof(int))) {
-            assert(resp_fd != NULL);
-            memcpy(resp_fd, CMSG_DATA(c), sizeof(int));
+            assert(resp_fds != NULL);
+            size_t payload = c->cmsg_len - CMSG_LEN(0);
+            uint32_t n = (uint32_t)(payload / sizeof(int));
+            if (n > max_resp_fds) {
+                n = max_resp_fds;
+            }
+            memcpy(resp_fds, CMSG_DATA(c), n * sizeof(int));
+            if (resp_fd_count) {
+                *resp_fd_count = n;
+            }
             break;
         }
     }
@@ -130,6 +145,22 @@ static enum vrtd_ret vrtd_recv_response(
     return (enum vrtd_ret) rh.ret;
 }
 
+/* Single-fd form of vrtd_recv_response_fds(); *resp_fd is -1 if no fd arrived. */
+static enum vrtd_ret vrtd_recv_response(
+    int fd, void *resp_body_buf, size_t resp_bufsz, int *resp_fd
+)
+{
+    const uint32_t capacity = (resp_fd != NULL) ? 1u : 0u;
+    uint32_t received = 0;
+    enum vrtd_ret status = vrtd_recv_response_fds(
+        fd, resp_body_buf, resp_bufsz, resp_fd, capacity, &received);
+
+    if (resp_fd != NULL && received == 0) {
+        *resp_fd = -1;
+    }
+    return status;
+}
+
 int vrtd_connect(const char *path)
 {
     if (path == NULL) {
@@ -232,6 +263,60 @@ enum vrtd_ret vrtd_raw_request(
     return vrtd_recv_response(fd, resp_body_buf, resp_bufsz, resp_fd);
 }
 
+/**
+ * vrtd_raw_request_fds() - Send a request and receive a multi-fd response.
+ * @req_fd: Optional fd to pass via SCM_RIGHTS; NULL or negative sends none.
+ * Return: VRTD_RET_OK on success, or an error code (errno=EMSGSIZE if oversized).
+ */
+static enum vrtd_ret vrtd_raw_request_fds(
+    int fd,
+    uint16_t opcode,
+    const void *req_body, uint16_t req_size,
+    void *resp_body_buf, size_t resp_bufsz,
+    int *resp_fds, uint32_t max_resp_fds, uint32_t *resp_fd_count,
+    const int *req_fd
+)
+{
+    if (req_size > VRTD_MSG_MAX_SIZE - sizeof(struct vrtd_req_header)) {
+        errno = EMSGSIZE;
+        return VRTD_RET_BAD_LIB_CALL;  /* a real vrtd_ret code, not a bare -1 */
+    }
+
+    struct vrtd_req_header h = { .size = req_size, .opcode = opcode, .seqno = 1 };
+    struct iovec siov[2] = {
+        { .iov_base = &h,                .iov_len = sizeof(h) },
+        { .iov_base = (void *) req_body, .iov_len = req_size },
+    };
+
+    /* The union gives the control buffer struct cmsghdr alignment (cmsg(3)). */
+    union {
+        char buf[CMSG_SPACE(sizeof(int))];
+        struct cmsghdr align;
+    } ctl = {0};
+    struct msghdr smsg = {
+        .msg_iov    = siov,
+        .msg_iovlen = req_size ? 2 : 1,
+    };
+
+    if (req_fd && *req_fd >= 0) {
+        smsg.msg_control = ctl.buf;
+        smsg.msg_controllen = sizeof(ctl.buf);
+        struct cmsghdr *cmsg = CMSG_FIRSTHDR(&smsg);
+        cmsg->cmsg_level = SOL_SOCKET;
+        cmsg->cmsg_type  = SCM_RIGHTS;
+        cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
+        memcpy(CMSG_DATA(cmsg), req_fd, sizeof(int));
+    }
+
+    ssize_t sn = sendmsg(fd, &smsg, MSG_NOSIGNAL);
+    if (sn == -1 || (size_t) sn != sizeof(h) + req_size) {
+        return VRTD_RET_BAD_CONN;
+    }
+
+    return vrtd_recv_response_fds(fd, resp_body_buf, resp_bufsz,
+                                  resp_fds, max_resp_fds, resp_fd_count);
+}
+
 
 enum vrtd_ret vrtd_get_num_devices(int fd, uint32_t *out)
 {
@@ -468,6 +553,7 @@ enum vrtd_ret vrtd_buffer_open(
     uint32_t alloc_dir,
     uint64_t alloc_arg,
     uint64_t size_in,
+    enum vrtd_mm_channel mm_channel,
     struct vrtd_buffer **buffer_out
 )
 {
@@ -480,21 +566,28 @@ enum vrtd_ret vrtd_buffer_open(
         .dev_number = dev,
         .alloc_type = alloc_type,
         .alloc_dir = alloc_dir,
+        .mm_channel = mm_channel,
         .alloc_arg = alloc_arg,
         .size = size_in,
     };
     struct vrtd_resp_buffer_open resp = {0};
 
+    /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. */
     int qpair_fd = -1;
-    int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN,
-                               &req, sizeof(req),
-                               &resp, sizeof(resp),
-                               &qpair_fd, NULL);
+    uint32_t fd_count = 0;
+    int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN,
+                                   &req, sizeof(req),
+                                   &resp, sizeof(resp),
+                                   &qpair_fd, 1, &fd_count, NULL);
     if (ret != VRTD_RET_OK) {
         return ret;
     }
 
-    if (qpair_fd < 0) {
+    if (fd_count != 1 || qpair_fd < 0 ||
+        resp.qpair_count == 0 || resp.qpair_count > 2) {
+        if (qpair_fd >= 0) {
+            (void) close(qpair_fd);
+        }
         return VRTD_RET_INTERNAL_ERROR;
     }
 
@@ -507,6 +600,7 @@ enum vrtd_ret vrtd_buffer_open(
         resp.size,
         resp.phys_addr,
         qpair_fd,
+        resp.qpair_count,
         buffer_out
     );
     if (ret != VRTD_RET_OK) {
@@ -523,6 +617,7 @@ enum vrtd_ret vrtd_buffer_open_raw(
     uint64_t phys_addr,
     uint64_t size,
     uint32_t alloc_dir,
+    enum vrtd_mm_channel mm_channel,
     struct vrtd_buffer **buffer_out
 )
 {
@@ -534,21 +629,28 @@ enum vrtd_ret vrtd_buffer_open_raw(
     struct vrtd_req_buffer_open_raw req = {
         .dev_number = dev,
         .alloc_dir = alloc_dir,
+        .mm_channel = mm_channel,
         .phys_addr = phys_addr,
         .size = size,
     };
     struct vrtd_resp_buffer_open_raw resp = {0};
 
+    /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. */
     int qpair_fd = -1;
-    int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN_RAW,
-                               &req, sizeof(req),
-                               &resp, sizeof(resp),
-                               &qpair_fd, NULL);
+    uint32_t fd_count = 0;
+    int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN_RAW,
+                                   &req, sizeof(req),
+                                   &resp, sizeof(resp),
+                                   &qpair_fd, 1, &fd_count, NULL);
     if (ret != VRTD_RET_OK) {
         return ret;
     }
 
-    if (qpair_fd < 0) {
+    if (fd_count != 1 || qpair_fd < 0 ||
+        resp.qpair_count == 0 || resp.qpair_count > 2) {
+        if (qpair_fd >= 0) {
+            (void) close(qpair_fd);
+        }
         return VRTD_RET_INTERNAL_ERROR;
     }
 
@@ -561,6 +663,7 @@ enum vrtd_ret vrtd_buffer_open_raw(
         size,
         phys_addr,
         qpair_fd,
+        resp.qpair_count,
         buffer_out
     );
     if (ret != VRTD_RET_OK) {
diff --git a/vrt/vrtd/libvrtd/src/v80_policy.h b/vrt/vrtd/libvrtd/src/v80_policy.h
new file mode 100644
index 00000000..2205aacf
--- /dev/null
+++ b/vrt/vrtd/libvrtd/src/v80_policy.h
@@ -0,0 +1,144 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * @file v80_policy.h
+ * @brief Client-side V80 placement-aware channel policy for QDMA transfers.
+ *
+ * The kernel returns the opaque SLASH_QDMA_TRANSFER_HINT_V80 marker on buffer
+ * registration; the actual decision of how to spread a transfer across the
+ * available QDMA queues lives here, where the buffer's device address is known.
+ *
+ * On the V80 a transfer takes two independent NoC paths: the host-side ingress
+ * master (NMU) is chosen by the queue's mm-channel, while the memory-side
+ * egress endpoint (NSU / HBM pseudo-channel) is chosen by the device address.
+ * Sustaining both NMUs requires also spreading across two NSUs.  The policy:
+ *
+ *   - DDR (single NSU): split the range in half so both NMUs stay busy.
+ *   - HBM below the 16 GiB half-boundary: channel 0 only.
+ *   - HBM at/above the half-boundary:     channel 1 only.
+ *   - HBM spanning the boundary:          split there (below -> ch0, above -> ch1).
+ *
+ * The qpair-to-channel mapping is the wire contract from vrtd: qpair_index 0 is
+ * pinned to channel 0 and qpair_index 1 to channel 1 (see vrtd_resp_buffer_open).
+ */
+
+#ifndef VRTD_V80_POLICY_H
+#define VRTD_V80_POLICY_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ * V80 device-memory geometry (mirrors vrt/vrtd/src/allocator.h and the
+ * memory-model docs).  HBM and DDR are each 64 x 512 MiB = 32 GiB; the HBM
+ * half-boundary at +16 GiB separates the two NoC slave-unit (NSU) regions.
+ */
+#define VRTD_V80_HBM_BASE 0x4000000000ULL
+#define VRTD_V80_HBM_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL)
+#define VRTD_V80_HBM_HALF (VRTD_V80_HBM_SIZE / 2ULL)
+#define VRTD_V80_DDR_BASE 0x60000000000ULL
+#define VRTD_V80_DDR_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL)
+
+/** @brief Maximum segments a transfer is split into (one per mm-channel). */
+#define VRTD_V80_MAX_SEGS 2u
+
+/** @brief One contiguous sub-transfer of a planned range, routed to one qpair. */
+struct vrtd_xfer_seg {
+    uint32_t qpair_index;  /**< Index into the fd's bound qpairs (== channel). */
+    uint64_t offset;       /**< Buffer-relative byte offset where the segment starts. */
+    uint64_t size;         /**< Segment length in bytes. */
+};
+
+/**
+ * @brief Compute the V80 transfer plan for a buffer range.
+ *
+ * Plans the transfer of [@p offset, @p offset + @p size) within a buffer based
+ * at device address @p phys_addr across @p qpair_count available queue pairs
+ * (qpair_index 0 == channel 0, qpair_index 1 == channel 1).  The split length
+ * is rounded down to a multiple of @p step.  With fewer than two queue pairs
+ * (or a zero step) the whole range is assigned to qpair_index 0.  A non-DDR
+ * range that lies wholly on one side of the HBM half-boundary is never split,
+ * whatever its size.
+ *
+ * @param phys_addr   Device base address of the buffer; selects DDR vs. HBM.
+ * @param offset      Buffer-relative start of the transfer.
+ * @param size        Transfer length in bytes (assumed a multiple of step).
+ * @param step        Transfer/page granule used to align split points.
+ * @param qpair_count Number of available queue pairs (1 or 2).
+ * @param segs        [out] Receives up to VRTD_V80_MAX_SEGS segments.
+ * @return Number of segments written to @p segs (1 or 2).
+ */
+static inline uint32_t vrtd_plan_v80(uint64_t phys_addr,
+                                     uint64_t offset,
+                                     uint64_t size,
+                                     uint64_t step,
+                                     uint32_t qpair_count,
+                                     struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS])
+{
+    /* Default plan: the whole range as one segment; only the qpair may vary. */
+    uint32_t whole_qpair = 0u;
+    uint64_t lo_len = 0u;
+
+    if (qpair_count >= 2u && step != 0u) {
+        uint64_t start = phys_addr + offset;
+        uint64_t end = start + size;
+        bool is_ddr = (phys_addr >= VRTD_V80_DDR_BASE &&
+                       phys_addr < VRTD_V80_DDR_BASE + VRTD_V80_DDR_SIZE);
+        /* HBM: route by the 16 GiB half-memory boundary (NSU split). */
+        uint64_t boundary = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF;
+
+        if (is_ddr) {
+            /* Single DDR NSU: just split the range to drive both NMUs. */
+            lo_len = size / 2u;
+        } else if (end <= boundary) {
+            /*
+             * Entirely in the lower half -> ch0.  Leaving lo_len at 0 keeps the
+             * range whole even when size is not a multiple of step; trimming
+             * it here would push the unaligned tail onto ch1.
+             */
+            lo_len = 0u;
+        } else if (start >= boundary) {
+            whole_qpair = 1u;          /* entirely in the upper half -> ch1 */
+        } else {
+            lo_len = boundary - start; /* spans the boundary: split there */
+        }
+
+        lo_len -= lo_len % step;       /* keep the split point step-aligned */
+    }
+
+    /* Nothing to split: emit the range unchanged on the chosen qpair. */
+    if (lo_len == 0u || lo_len >= size) {
+        segs[0].qpair_index = whole_qpair;
+        segs[0].offset = offset;
+        segs[0].size = size;
+        return 1u;
+    }
+
+    /* First lo_len bytes go to ch0, the remainder to ch1. */
+    segs[0].qpair_index = 0u;
+    segs[0].offset = offset;
+    segs[0].size = lo_len;
+    segs[1].qpair_index = 1u;
+    segs[1].offset = offset + lo_len;
+    segs[1].size = size - lo_len;
+    return 2u;
+}
+
+#endif /* VRTD_V80_POLICY_H */
diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp
index 8748d379..b2569d91 100644
--- a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp
+++ b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp
@@ -48,6 +48,18 @@ enum class BufferAllocDir : uint32_t {
     DeviceToHost   = VRTD_ALLOC_DIR_DEVICE_TO_HOST,
 };
 
+/**
+ * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair.
+ *
+ * Mirrors the C enum @c vrtd_mm_channel; the explicit values below must stay in
+ * sync with it. @c Auto stripes by (qid & 1); @c Ch0 / @c Ch1 pin one channel.
+ */
+enum class MmChannel : uint32_t {
+    Auto = 0, ///< Stripe across channels by (qid & 1).
+    Ch0  = 1, ///< Pin to AXI-MM/NoC channel 0.
+    Ch1  = 2, ///< Pin to AXI-MM/NoC channel 1.
+};
+
 /**
  * @brief RAII wrapper for a vrtd buffer allocation.
  *
diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp
index 8123a7f1..5a220075 100644
--- a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp
+++ b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp
@@ -158,19 +158,22 @@ class Device {
      * @param size      Requested size in bytes.
      * @param allocArg  Allocation argument (HBM region index for HBM).
      * @param allocDir  QDMA transfer direction.
+     * @param mmChannel AXI-MM/NoC channel selection (defaults to auto).
      * @return An owning @c Buffer.
      * @throws vrtd::Error on error.
      */
     Buffer openBuffer(BufferAllocType allocType,
                       uint64_t size,
                       uint64_t allocArg = 0,
-                      BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const;
+                      BufferAllocDir allocDir = BufferAllocDir::Bidirectional,
+                      MmChannel mmChannel = MmChannel::Auto) const;
 
     /**
      * @brief Convenience helper for DDR allocations.
      */
-    Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const {
-        return openBuffer(BufferAllocType::Ddr, size, 0, allocDir);
+    Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional,
+                         MmChannel mmChannel = MmChannel::Auto) const {
+        return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, mmChannel);
     }
 
     /**
@@ -178,16 +181,18 @@ class Device {
      */
     Buffer openHbmBuffer(uint32_t region,
                          uint64_t size,
-                         BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const {
-        return openBuffer(BufferAllocType::Hbm, size, region, allocDir);
+                         BufferAllocDir allocDir = BufferAllocDir::Bidirectional,
+                         MmChannel mmChannel = MmChannel::Auto) const {
+        return openBuffer(BufferAllocType::Hbm, size, region, allocDir, mmChannel);
     }
 
     /**
      * @brief Convenience helper for HBM VNOC allocations.
      */
     Buffer openHbmVnocBuffer(uint64_t size,
-                             BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const {
-        return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir);
+                             BufferAllocDir allocDir = BufferAllocDir::Bidirectional,
+                             MmChannel mmChannel = MmChannel::Auto) const {
+        return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, mmChannel);
     }
 
     /**
@@ -199,12 +204,14 @@ class Device {
      * @param phys_addr Device physical address.
      * @param size      Size in bytes.
      * @param allocDir  QDMA transfer direction.
+     * @param mmChannel AXI-MM/NoC channel selection (defaults to auto).
      * @return An owning @c Buffer.
      * @throws vrtd::Error on error.
      */
     Buffer openRawBuffer(uint64_t phys_addr,
                          uint64_t size,
-                         BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const;
+                         BufferAllocDir allocDir = BufferAllocDir::Bidirectional,
+                         MmChannel mmChannel = MmChannel::Auto) const;
 
     /**
      * @brief Perform a PCIe hotplug operation for this device.
@@ -351,8 +358,8 @@ class Device {
            uint16_t subsystemDeviceId,
            std::function<Bar(const Device&, uint8_t)> fGetBar,
            std::function<QdmaQpair(const Device&, const struct slash_qdma_qpair_add&)> fCreateQdmaQpair,
-           std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir)> fOpenBuffer,
-           std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir)> fOpenBufferRaw,
+           std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBuffer,
+           std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBufferRaw,
            std::function<void(const Device&, HotplugOp, uint8_t)> fHotplugOp,
            std::function<void(const Device&, int)> fDesignWrite,
            std::function<void(const Device&, std::string_view)> fDesignWriteFile,
@@ -370,8 +377,8 @@ class Device {
 
     std::function<Bar(const Device&, uint8_t)> fGetBar;
     std::function<QdmaQpair(const Device&, const struct slash_qdma_qpair_add&)> fCreateQdmaQpair;
-    std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir)> fOpenBuffer;
-    std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir)> fOpenBufferRaw;
+    std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBuffer;
+    std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBufferRaw;
     std::function<void(const Device&, HotplugOp, uint8_t)> fHotplugOp;
     std::function<void(const Device&, int)> fDesignWrite;
     std::function<void(const Device&, std::string_view)> fDesignWriteFile;
diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp
index e2dbbac2..422160c9 100644
--- a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp
+++ b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp
@@ -190,6 +190,7 @@ class Session {
      * @param size      Requested size in bytes.
      * @param allocArg  Allocation argument (HBM region index for HBM).
      * @param allocDir  QDMA transfer direction.
+     * @param mmChannel AXI-MM/NoC channel selection for the queue pair.
      * @return An owning @c Buffer.
      * @throws vrtd::Error on error.
      */
@@ -198,7 +199,8 @@ class Session {
         BufferAllocType allocType,
         uint64_t size,
         uint64_t allocArg,
-        BufferAllocDir allocDir
+        BufferAllocDir allocDir,
+        MmChannel mmChannel
     ) const;
 
     /**
@@ -208,6 +210,7 @@ class Session {
      * @param phys_addr Caller-specified device physical address (bypasses allocator).
      * @param size      Size in bytes.
      * @param allocDir  QDMA transfer direction.
+     * @param mmChannel AXI-MM/NoC channel selection for the queue pair.
      * @return An owning @c Buffer.
      * @throws vrtd::Error on error.
      */
@@ -215,7 +218,8 @@ class Session {
         const Device& device,
         uint64_t phys_addr,
         uint64_t size,
-        BufferAllocDir allocDir
+        BufferAllocDir allocDir,
+        MmChannel mmChannel
     ) const;
 
     /**
diff --git a/vrt/vrtd/libvrtdpp/src/buffer.cpp b/vrt/vrtd/libvrtdpp/src/buffer.cpp
index 0c68ff22..756170a6 100644
--- a/vrt/vrtd/libvrtdpp/src/buffer.cpp
+++ b/vrt/vrtd/libvrtdpp/src/buffer.cpp
@@ -151,24 +151,12 @@ bool Buffer::isClosed() const noexcept
 
 std::fstream Buffer::fstream(std::ios_base::openmode mode) const
 {
+    (void)mode;
     if (isClosed()) {
         throw std::runtime_error("Buffer is closed");
     }
 
-    int fd = getFd();
-    if (fd < 0) {
-        throw std::runtime_error("Buffer FD is invalid");
-    }
-
-    std::string path = "/proc/self/fd/" + std::to_string(fd);
-
-    std::fstream stream;
-    stream.open(path, mode);
-    if (!stream.is_open()) {
-        throw std::runtime_error("Failed to open fstream for buffer");
-    }
-
-    return stream;
+    throw std::runtime_error("Buffer qpair fds are ioctl-only; use syncToDevice/syncFromDevice");
 }
 
 void Buffer::syncToDevice(uint64_t offset, uint64_t size)
diff --git a/vrt/vrtd/libvrtdpp/src/device.cpp b/vrt/vrtd/libvrtdpp/src/device.cpp
index f45cda24..6fa00791 100644
--- a/vrt/vrtd/libvrtdpp/src/device.cpp
+++ b/vrt/vrtd/libvrtdpp/src/device.cpp
@@ -31,8 +31,8 @@ Device::Device(uint32_t num,
                uint16_t subsystemDeviceId,
                std::function<Bar(const Device&, uint8_t)> fGetBar,
                std::function<QdmaQpair(const Device&, const struct slash_qdma_qpair_add&)> fCreateQdmaQpair,
-               std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir)> fOpenBuffer,
-               std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir)> fOpenBufferRaw,
+               std::function<Buffer(const Device&, BufferAllocType, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBuffer,
+               std::function<Buffer(const Device&, uint64_t, uint64_t, BufferAllocDir, MmChannel)> fOpenBufferRaw,
                std::function<void(const Device&, HotplugOp, uint8_t)> fHotplugOp,
                std::function<void(const Device&, int)> fDesignWrite,
                std::function<void(const Device&, std::string_view)> fDesignWriteFile,
@@ -97,14 +97,16 @@ QdmaQpair Device::createQdmaQpair(const struct slash_qdma_qpair_add& cfg) const
 Buffer Device::openBuffer(BufferAllocType allocType,
                           uint64_t size,
                           uint64_t allocArg,
-                          BufferAllocDir allocDir) const {
-    return fOpenBuffer(*this, allocType, size, allocArg, allocDir);
+                          BufferAllocDir allocDir,
+                          MmChannel mmChannel) const {
+    return fOpenBuffer(*this, allocType, size, allocArg, allocDir, mmChannel);
 }
 
 Buffer Device::openRawBuffer(uint64_t phys_addr,
                              uint64_t size,
-                             BufferAllocDir allocDir) const {
-    return fOpenBufferRaw(*this, phys_addr, size, allocDir);
+                             BufferAllocDir allocDir,
+                             MmChannel mmChannel) const {
+    return fOpenBufferRaw(*this, phys_addr, size, allocDir, mmChannel);
 }
 
 void Device::hotplugOp(HotplugOp op, uint8_t function) const {
diff --git a/vrt/vrtd/libvrtdpp/src/session.cpp b/vrt/vrtd/libvrtdpp/src/session.cpp
index d2e69fd9..7bbda0bc 100644
--- a/vrt/vrtd/libvrtdpp/src/session.cpp
+++ b/vrt/vrtd/libvrtdpp/src/session.cpp
@@ -132,11 +132,11 @@ Device Session::getDevice(size_t i) const {
         info.pci.subsystem_device_id,
         [&](const Device& device, uint8_t num) { return getBar(device, num); },
         [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); },
-        [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir) {
-            return openBuffer(device, type, size, arg, dir);
+        [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) {
+            return openBuffer(device, type, size, arg, dir, mm);
         },
-        [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) {
-            return openBufferRaw(device, phys_addr, size, dir);
+        [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) {
+            return openBufferRaw(device, phys_addr, size, dir, mm);
         },
         [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); },
         [&](const Device& device, int input_fd) { return designWrite(device, input_fd); },
@@ -197,11 +197,11 @@ Device Session::getDeviceByBdf(std::string_view bdf) const {
         info.pci.subsystem_device_id,
         [&](const Device& device, uint8_t num) { return getBar(device, num); },
         [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); },
-        [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir) {
-            return openBuffer(device, type, size, arg, dir);
+        [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) {
+            return openBuffer(device, type, size, arg, dir, mm);
         },
-        [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) {
-            return openBufferRaw(device, phys_addr, size, dir);
+        [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) {
+            return openBufferRaw(device, phys_addr, size, dir, mm);
         },
         [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); },
         [&](const Device& device, int input_fd) { return designWrite(device, input_fd); },
@@ -289,7 +289,8 @@ Buffer Session::openBuffer(
     BufferAllocType allocType,
     uint64_t size,
     uint64_t allocArg,
-    BufferAllocDir allocDir
+    BufferAllocDir allocDir,
+    MmChannel mmChannel
 ) const {
     if (isClosed()) {
         throw Error(VRTD_RET_BAD_LIB_CALL);
@@ -304,6 +305,7 @@ Buffer Session::openBuffer(
         static_cast<uint32_t>(allocDir),
         allocArg,
         size,
+        static_cast<vrtd_mm_channel>(static_cast<uint32_t>(mmChannel)),
         &raw
     );
     if (ret != VRTD_RET_OK) {
@@ -321,7 +323,8 @@ Buffer Session::openBufferRaw(
     const Device& device,
     uint64_t phys_addr,
     uint64_t size,
-    BufferAllocDir allocDir
+    BufferAllocDir allocDir,
+    MmChannel mmChannel
 ) const {
     if (isClosed()) {
         throw Error(VRTD_RET_BAD_LIB_CALL);
@@ -335,6 +338,7 @@ Buffer Session::openBufferRaw(
         phys_addr,
         size,
         static_cast<uint32_t>(allocDir),
+        static_cast<vrtd_mm_channel>(static_cast<uint32_t>(mmChannel)),
         &raw
     );
     if (ret != VRTD_RET_OK) {
diff --git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c
index 5a30076e..a5194fc7 100644
--- a/vrt/vrtd/src/buffer.c
+++ b/vrt/vrtd/src/buffer.c
@@ -52,7 +52,38 @@
 #define VRTD_QDMA_Q_MODE_MM 0u          /* Memory-mapped (MM) mode */
 #define VRTD_QDMA_DIR_H2C (1u << 0)     /* Host-to-Card direction */
 #define VRTD_QDMA_DIR_C2H (1u << 1)     /* Card-to-Host direction */
-#define VRTD_QDMA_RING_SZ_IDX 0u        /* Default ring size index */
+/*
+ * TODO: make this a vrtd.conf setting.  Index 15 is the largest QDMA descriptor
+ * ring and gives the best sustained transfer speed, but it consumes more
+ * host-side DMA-coherent memory per queue.
+ */
+#define VRTD_QDMA_RING_SZ_IDX 15u       /* Default ring size index */
+
+/**
+ * Decide how many qpairs back a buffer and which AXI-MM channel each one uses.
+ *
+ * A request of SLASH_QDMA_MM_CHANNEL_AUTO is expanded into two qpairs --
+ * qpair 0 pinned to channel 0 and qpair 1 to channel 1 -- so the client can
+ * apply the V80 placement policy with a deterministic qpair-to-channel mapping.
+ * An explicit channel request pins a single qpair to that channel (no split).
+ *
+ * @param mm_channel  Requested channel (enum slash_qdma_mm_channel).
+ * @param channels    [out] Per-qpair channel value, indexed by qpair number.
+ * @return Number of qpairs to create (1 or VRTD_BUFFER_MAX_QPAIR_FDS).
+ */
+static uint32_t buffer_plan_qpair_channels(
+    uint32_t mm_channel,
+    uint32_t channels[VRTD_BUFFER_MAX_QPAIR_FDS])
+{
+    if (mm_channel == SLASH_QDMA_MM_CHANNEL_AUTO) {
+        channels[0] = SLASH_QDMA_MM_CHANNEL_0;
+        channels[1] = SLASH_QDMA_MM_CHANNEL_1;
+        return VRTD_BUFFER_MAX_QPAIR_FDS;
+    }
+
+    channels[0] = mm_channel;
+    return 1u;
+}
 
 /**
  * Initialise a buffer: allocate device memory, create a QDMA queue pair,
@@ -81,6 +112,7 @@ static int buffer_init(struct buffer *buf,
                        uint64_t size,
                        uint64_t alloc_arg,
                        uint64_t client_id,
+                       uint32_t mm_channel,
                        const struct slash_qdma_qpair_add *qpair_params)
 {
     if (buf == NULL) {
@@ -100,7 +132,8 @@ static int buffer_init(struct buffer *buf,
         .client_id = client_id,
         .addr = 0,
         .size = 0,
-        .qid = 0,
+        .qpair_count = 0,
+        .qids = {0},
         .fd = -1,
         .allocation_valid = false,
         .qpair_created = false,
@@ -171,46 +204,57 @@ static int buffer_init(struct buffer *buf,
     buf->size = alloc_size;
     buf->allocation_valid = true;
 
-    /* Step 2: Configure and create a QDMA queue pair.  If the caller
-     * supplied custom qpair parameters (e.g. streaming mode), use those;
-     * otherwise default to memory-mapped mode with the smallest ring size. */
-    struct slash_qdma_qpair_add qpair = {0};
-    if (qpair_params != NULL) {
-        qpair = *qpair_params;
-    } else {
-        qpair.mode = VRTD_QDMA_Q_MODE_MM;
-        qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-        qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-        qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-    }
-    qpair.dir_mask = dir_mask;
-    qpair.size = sizeof(qpair);
+    /* Steps 2-4: create and start the queue pairs.  An AUTO request yields
+     * two qpairs -- qids[0] on channel 0, qids[1] on channel 1 -- so the
+     * client's V80 placement policy has a deterministic index-to-channel
+     * mapping; an explicit channel pins a single qpair. */
+    uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS];
+    uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels);
+
+    for (uint32_t i = 0; i < num_qpairs; ++i) {
+        struct slash_qdma_qpair_add qpair = {0};
+
+        if (qpair_params != NULL) {
+            qpair = *qpair_params;
+        } else {
+            qpair.mode = VRTD_QDMA_Q_MODE_MM;
+            qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+            qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+            qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+        }
+        qpair.dir_mask = dir_mask;
+        qpair.mm_channel = qpair_channels[i];
+        qpair.size = sizeof(qpair);
 
-    if (slash_qdma_qpair_add(qdma, &qpair) != 0) {
-        LOG(LOG_ERR, "Failed to add buffer qpair: %m");
-        goto fail;
-    }
+        if (slash_qdma_qpair_add(qdma, &qpair) != 0) {
+            LOG(LOG_ERR, "Failed to add buffer qpair %u: %m", (unsigned int)i);
+            goto fail;
+        }
 
-    buf->qid = qpair.qid;
-    buf->qpair_created = true;
+        buf->qids[i] = qpair.qid;
+        buf->qpair_count = i + 1;
+        buf->qpair_created = true;
 
-    /* Step 3: Start the queue pair so DMA transfers can be issued. */
-    if (slash_qdma_qpair_start(qdma, buf->qid) != 0) {
-        LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", buf->qid);
-        goto fail;
+        if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) {
+            LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", qpair.qid);
+            goto fail;
+        }
     }
 
-    /* Step 4: Obtain a file descriptor for the queue.  The client will use
-     * this fd (passed over the Unix socket via SCM_RIGHTS) to perform
-     * read/write/mmap against the QDMA queue. */
-    int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC);
-    if (fd < 0) {
-        LOG(LOG_ERR, "Failed to get fd for buffer qpair %u: %m", buf->qid);
+    /* Step 5: bind every started qpair into a single transfer fd so one
+     * transfer ioctl can fan across both channels.  The qids array index
+     * becomes the qpair_index used by the client's sub-transfers. */
+    buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count,
+                                            O_CLOEXEC);
+    if (buf->fd < 0) {
+        LOG(LOG_ERR, "Failed to get combined fd for %u buffer qpairs: %m",
+            (unsigned int)buf->qpair_count);
         goto fail;
     }
-    buf->fd = fd;
 
-    LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid);
+    LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qpairs=%u",
+        (unsigned long long)buf->addr, (unsigned long long)buf->size,
+        (unsigned int)buf->qpair_count);
     return 0;
 
 fail:
@@ -235,6 +279,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma,
                              uint64_t size,
                              uint64_t alloc_arg,
                              uint64_t client_id,
+                             uint32_t mm_channel,
                              const struct slash_qdma_qpair_add *qpair_params)
 {
     struct buffer *buf = calloc(1, sizeof(*buf));
@@ -243,7 +288,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma,
         return NULL;
     }
 
-    if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, qpair_params) != 0) {
+    if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, mm_channel, qpair_params) != 0) {
         LOG(LOG_ERR, "Failed to initialize buffer: %m");
         return NULL;
     }
@@ -263,7 +308,9 @@ struct buffer *buffer_create(struct slash_qdma *qdma,
 struct buffer *buffer_create_raw(struct slash_qdma *qdma,
                                  uint64_t phys_addr,
                                  uint64_t size,
-                                 enum vrtd_alloc_dir alloc_dir)
+                                 enum vrtd_alloc_dir alloc_dir,
+                                 uint64_t client_id,
+                                 uint32_t mm_channel)
 {
     if (qdma == NULL || size == 0) {
         errno = EINVAL;
@@ -299,48 +346,60 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma,
         .alloc_type = 0,
         .alloc_arg = 0,
         .alloc_dir = alloc_dir,
-        .client_id = 0,
+        .client_id = client_id,
         .addr = phys_addr,
         .size = size,
-        .qid = 0,
+        .qpair_count = 0,
+        .qids = {0},
         .fd = -1,
         .allocation_valid = false, /* no allocator reservation to free */
         .qpair_created = false,
     };
 
-    struct slash_qdma_qpair_add qpair = {0};
-    qpair.mode = VRTD_QDMA_Q_MODE_MM;
-    qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-    qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-    qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX;
-    qpair.dir_mask = dir_mask;
-    qpair.size = sizeof(qpair);
-
-    if (slash_qdma_qpair_add(qdma, &qpair) != 0) {
-        LOG(LOG_ERR, "buffer_create_raw: failed to add qpair: %m");
-        free(buf);
-        return NULL;
-    }
+    uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS];
+    uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels);
 
-    buf->qid = qpair.qid;
-    buf->qpair_created = true;
+    for (uint32_t i = 0; i < num_qpairs; ++i) {
+        struct slash_qdma_qpair_add qpair = {0};
 
-    if (slash_qdma_qpair_start(qdma, buf->qid) != 0) {
-        LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", buf->qid);
-        cleanup_buffer(buf);
-        return NULL;
+        qpair.mode = VRTD_QDMA_Q_MODE_MM;
+        qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+        qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+        qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX;
+        qpair.dir_mask = dir_mask;
+        qpair.mm_channel = qpair_channels[i];
+        qpair.size = sizeof(qpair);
+
+        if (slash_qdma_qpair_add(qdma, &qpair) != 0) {
+            LOG(LOG_ERR, "buffer_create_raw: failed to add qpair %u: %m", (unsigned int)i);
+            cleanup_buffer(buf);
+            return NULL;
+        }
+
+        buf->qids[i] = qpair.qid;
+        buf->qpair_count = i + 1;
+        buf->qpair_created = true;
+
+        if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) {
+            LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", qpair.qid);
+            cleanup_buffer(buf);
+            return NULL;
+        }
     }
 
-    int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC);
-    if (fd < 0) {
-        LOG(LOG_ERR, "buffer_create_raw: failed to get fd for qpair %u: %m", buf->qid);
+    /* Bind every started qpair into a single transfer fd. */
+    buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count,
+                                            O_CLOEXEC);
+    if (buf->fd < 0) {
+        LOG(LOG_ERR, "buffer_create_raw: failed to get combined fd for %u qpairs: %m",
+            (unsigned int)buf->qpair_count);
         cleanup_buffer(buf);
         return NULL;
     }
-    buf->fd = fd;
 
-    LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qid=%u",
-        (unsigned long long)phys_addr, (unsigned long long)size, buf->qid);
+    LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qpairs=%u",
+        (unsigned long long)phys_addr, (unsigned long long)size,
+        (unsigned int)buf->qpair_count);
     return buf;
 }
 
@@ -348,12 +407,12 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma,
  * Tear down a buffer and release all associated resources.
  *
  * Resources are released in reverse acquisition order:
- *  1. Close the file descriptor (if open).
- *  2. Stop and delete the QDMA queue pair (if created).
+ *  1. Close the combined transfer fd (if open).
+ *  2. Stop and delete the QDMA queue pairs (if created).
  *  3. Free the device memory allocation (if valid).
  *  4. Zero all fields and free the struct.
  *
- * Each step is guarded by its corresponding flag (fd >= 0,
+ * Each step is guarded by its corresponding flag (fd >= 0,
  * qpair_created, allocation_valid) so this function is safe to call
  * after partial initialisation.  NULL-safe.
  */
@@ -363,31 +422,31 @@ void cleanup_buffer(struct buffer *buf)
         return;
     }
 
-    LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid);
+    LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qpairs=%u",
+        (unsigned long long)buf->addr, (unsigned long long)buf->size,
+        (unsigned int)buf->qpair_count);
 
-    /* Close the QDMA queue fd first, before stopping the queue. */
+    /* Close the combined QDMA transfer fd first, before stopping the queues. */
     if (buf->fd >= 0) {
         (void) close(buf->fd);
         buf->fd = -1;
     }
 
-    /* Stop and delete the QDMA queue pair.  Errors are logged but
+    /* Stop and delete the QDMA queue pairs.  Errors are logged but
      * otherwise ignored -- we are on the teardown path and must continue
      * releasing remaining resources. */
     if (buf->qpair_created && buf->qdma != NULL) {
-        if (slash_qdma_qpair_stop(buf->qdma, buf->qid) != 0) {
-            LOG(
-                LOG_WARNING,
-                "Error stopping buffer qpair %u: %m (ignored)",
-                buf->qid
-            );
-        }
-        if (slash_qdma_qpair_del(buf->qdma, buf->qid) != 0) {
-            LOG(
-                LOG_WARNING,
-                "Error deleting buffer qpair %u: %m (ignored)",
-                buf->qid
-            );
+        for (uint32_t i = 0; i < buf->qpair_count; ++i) {
+            if (slash_qdma_qpair_stop(buf->qdma, buf->qids[i]) != 0) {
+                LOG(LOG_WARNING,
+                    "Error stopping buffer qpair %u: %m (ignored)",
+                    buf->qids[i]);
+            }
+            if (slash_qdma_qpair_del(buf->qdma, buf->qids[i]) != 0) {
+                LOG(LOG_WARNING,
+                    "Error deleting buffer qpair %u: %m (ignored)",
+                    buf->qids[i]);
+            }
         }
     }
 
@@ -417,7 +476,8 @@ void cleanup_buffer(struct buffer *buf)
     buf->allocation_valid = false;
     buf->addr = 0;
     buf->size = 0;
-    buf->qid = 0;
+    buf->qpair_count = 0;
+    memset(buf->qids, 0, sizeof(buf->qids));
     buf->fd = -1;
 
     free(buf);
diff --git a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h
index 6834222b..167bf98a 100644
--- a/vrt/vrtd/src/buffer.h
+++ b/vrt/vrtd/src/buffer.h
@@ -48,6 +48,8 @@
 #include "array.h"
 #include "vrtd/wire.h"
 
+#define VRTD_BUFFER_MAX_QPAIR_FDS 2u
+
 /**
  * @brief A single DMA buffer allocated on a SLASH FPGA device.
  *
@@ -72,10 +74,13 @@ struct buffer {
     uint64_t addr;
     /** @brief Size of the allocated memory region in bytes (rounded up to subregion granularity). */
     uint64_t size;
-    /** @brief QDMA queue ID assigned to this buffer's queue pair. */
-    uint32_t qid;
-    /** @brief File descriptor for the QDMA queue pair character device.
-     *  Passed to the client via SCM_RIGHTS for direct data transfer. */
+    /** @brief Number of QDMA queue pairs created for this buffer (1 or 2). */
+    uint32_t qpair_count;
+    /** @brief QDMA queue IDs assigned to this buffer's queue pairs. */
+    uint32_t qids[VRTD_BUFFER_MAX_QPAIR_FDS];
+    /** @brief Single transfer fd that owns all @c qpair_count queue pairs.
+     *  Passed to the client via SCM_RIGHTS for direct data transfer; the client
+     *  selects a channel per sub-transfer by qpair_index. -1 when not created. */
     int fd;
     /** @brief True if the address-space allocation in the memory map is valid and must be freed. */
     bool allocation_valid;
@@ -96,6 +101,7 @@ struct buffer {
  * @param size          Requested buffer size in bytes (may be rounded up).
  * @param alloc_arg     Type-specific argument (HBM region index for non-VNOC HBM).
  * @param client_id     Connection ID of the owning client.
+ * @param mm_channel    AXI-MM/NoC channel selection (enum slash_qdma_mm_channel).
  * @param qpair_params  QDMA queue pair configuration parameters.
  * @return Heap-allocated buffer on success, NULL on failure.
  */
@@ -106,6 +112,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma,
                              uint64_t size,
                              uint64_t alloc_arg,
                              uint64_t client_id,
+                             uint32_t mm_channel,
                              const struct slash_qdma_qpair_add *qpair_params);
 
 /**
@@ -120,12 +127,17 @@ struct buffer *buffer_create(struct slash_qdma *qdma,
  * @param phys_addr  Caller-specified device physical address.
  * @param size       Size in bytes.
  * @param alloc_dir  DMA transfer direction.
+ * @param client_id  Connection ID of the owning client (for ownership checks
+ *                   and automatic cleanup on disconnect; must be non-zero).
+ * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel).
  * @return Heap-allocated buffer on success, NULL on failure (errno set).
  */
 struct buffer *buffer_create_raw(struct slash_qdma *qdma,
                                  uint64_t phys_addr,
                                  uint64_t size,
-                                 enum vrtd_alloc_dir alloc_dir);
+                                 enum vrtd_alloc_dir alloc_dir,
+                                 uint64_t client_id,
+                                 uint32_t mm_channel);
 
 /**
  * @brief Release all resources owned by a buffer.
diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c
index c11c4d32..4559f82a 100644
--- a/vrt/vrtd/src/serve.c
+++ b/vrt/vrtd/src/serve.c
@@ -761,7 +761,7 @@ static int client_handle_in(struct client *client)
      * Allocate a cmsg buffer large enough for one fd.
      * CMSG_SPACE includes alignment padding required by the kernel.
      */
-    char cbuf[CMSG_SPACE(sizeof(int))];
+    char cbuf[CMSG_SPACE(2 * sizeof(int))];
     struct msghdr msg = {
         .msg_name       = NULL,
         .msg_namelen    = 0,
@@ -895,17 +895,24 @@ static int client_handle_out(struct client *client)
      * The cbuf is zeroed to satisfy kernel expectations about padding.
      */
     if (client->have_out_fd) {
+        uint32_t fd_count = client->out_fd_count ? client->out_fd_count : 1;
+
+        if (fd_count > 2) {
+            LOG(LOG_ERR, "Invalid outbound fd count %u", (unsigned int)fd_count);
+            return -1;
+        }
+
         memset(cbuf, 0, sizeof cbuf);
 
         msg.msg_control = cbuf;
-        msg.msg_controllen = sizeof cbuf;
+        msg.msg_controllen = CMSG_SPACE(fd_count * sizeof(int));
 
         struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
         cmsg->cmsg_level = SOL_SOCKET;
         cmsg->cmsg_type  = SCM_RIGHTS;
-        cmsg->cmsg_len   = CMSG_LEN(sizeof(int));
+        cmsg->cmsg_len   = CMSG_LEN(fd_count * sizeof(int));
 
-        memcpy(CMSG_DATA(cmsg), &client->out_fd, sizeof(int));
+        memcpy(CMSG_DATA(cmsg), client->out_fds, fd_count * sizeof(int));
     }
 
     ssize_t n;
@@ -937,6 +944,7 @@ static int client_handle_out(struct client *client)
     /* Response sent -- clear state so the client can send a new request. */
     client->have_response = false;
     client->have_out_fd = false;
+    client->out_fd_count = 0;
 
     return 0;
 }
@@ -1040,7 +1048,7 @@ static int client_handle_request(struct client *client)
                 req_header->size,
                 CLIENT_OUT_BODY(*client, vrtd_resp_get_bar_fd),
                 &size,
-                &client->out_fd,
+                &client->out_fds[0],
                 &client->have_out_fd
             );
         break;
@@ -1082,7 +1090,7 @@ static int client_handle_request(struct client *client)
                 req_header->size,
                 CLIENT_OUT_BODY(*client, vrtd_resp_qdma_qpair_get_fd),
                 &size,
-                &client->out_fd,
+                &client->out_fds[0],
                 &client->have_out_fd
             );
         break;
@@ -1094,7 +1102,7 @@ static int client_handle_request(struct client *client)
                 req_header->size,
                 CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open),
                 &size,
-                &client->out_fd,
+                &client->out_fds[0],
                 &client->have_out_fd
             );
         break;
@@ -1106,7 +1114,7 @@ static int client_handle_request(struct client *client)
                 req_header->size,
                 CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open_raw),
                 &size,
-                &client->out_fd,
+                &client->out_fds[0],
                 &client->have_out_fd
             );
         break;
@@ -1966,6 +1974,12 @@ static uint16_t client_handle_request_buffer_open(
         return VRTD_RET_INVALID_ARGUMENT;
     }
 
+    if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) {
+        LOG(LOG_WARNING, "Received buffer open request with invalid mm_channel %u",
+            (unsigned int)req_body->mm_channel);
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
     struct device *d = client->state->devices.d[req_body->dev_number];
     if (d == NULL || d->qdma == NULL || d->memory_map == NULL) {
         LOG(LOG_WARNING, "Received buffer open request for non-existent or non-functional device");
@@ -1992,6 +2006,7 @@ static uint16_t client_handle_request_buffer_open(
         req_body->size,
         req_body->alloc_arg,
         client_id,
+        req_body->mm_channel,
         NULL
     );
     if (buf == NULL) {
@@ -2009,14 +2024,16 @@ static uint16_t client_handle_request_buffer_open(
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    if (buf->fd < 0) {
-        LOG(LOG_ERR, "Buffer created without valid fd");
+    if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS ||
+        buf->fd < 0) {
+        LOG(LOG_ERR, "Buffer created without valid qpair fd");
         return VRTD_RET_INTERNAL_ERROR;
     }
 
     uint64_t real_size = buf->size;
-    int fd = buf->fd;
     uint64_t phys_addr = buf->addr;
+    uint32_t qpair_count = buf->qpair_count;
+    int fd = buf->fd;
 
     /*
      * Transfer ownership of the buffer into the device's buffer list.
@@ -2030,6 +2047,11 @@ static uint16_t client_handle_request_buffer_open(
 
     resp_body->size = real_size;
     resp_body->phys_addr = phys_addr;
+    resp_body->qpair_count = qpair_count;
+    /* A single transfer fd owns all qpairs; the client selects channels by
+     * qpair_index per sub-transfer. */
+    client->out_fds[0] = fd;
+    client->out_fd_count = 1;
     *out_fd = fd;
     *have_out_fd = true;
     *resp_size = sizeof(*resp_body);
@@ -2104,18 +2126,32 @@ static uint16_t client_handle_request_buffer_open_raw(
         return VRTD_RET_INVALID_ARGUMENT;
     }
 
+    if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) {
+        LOG(LOG_WARNING, "Received raw buffer open request with invalid mm_channel %u",
+            (unsigned int)req_body->mm_channel);
+        return VRTD_RET_INVALID_ARGUMENT;
+    }
+
     struct device *d = client->state->devices.d[req_body->dev_number];
     if (d == NULL || d->qdma == NULL) {
         LOG(LOG_WARNING, "Received raw buffer open request for non-existent or non-functional device");
         return VRTD_RET_NOEXIST;
     }
 
+    uint64_t client_id = client->conn_id;
+    if (client_id == 0) {
+        LOG(LOG_ERR, "Invalid client connection id");
+        return VRTD_RET_INTERNAL_ERROR;
+    }
+
     _cleanup_(cleanup_bufferp)
     struct buffer *buf = buffer_create_raw(
         d->qdma,
         req_body->phys_addr,
         req_body->size,
-        (enum vrtd_alloc_dir) req_body->alloc_dir
+        (enum vrtd_alloc_dir) req_body->alloc_dir,
+        client_id,
+        req_body->mm_channel
     );
     if (buf == NULL) {
         if (errno == EINVAL) {
@@ -2126,11 +2162,13 @@ static uint16_t client_handle_request_buffer_open_raw(
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    if (buf->fd < 0) {
-        LOG(LOG_ERR, "Raw buffer created without valid fd");
+    if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS ||
+        buf->fd < 0) {
+        LOG(LOG_ERR, "Raw buffer created without valid qpair fd");
         return VRTD_RET_INTERNAL_ERROR;
     }
 
+    uint32_t qpair_count = buf->qpair_count;
     int fd = buf->fd;
 
     if (buffer_ptr_array_push_move(&d->buffers, &buf) != 0) {
@@ -2138,7 +2176,9 @@ static uint16_t client_handle_request_buffer_open_raw(
         return VRTD_RET_INTERNAL_ERROR;
     }
 
-    resp_body->zero = 0;
+    resp_body->qpair_count = qpair_count;
+    client->out_fds[0] = fd;
+    client->out_fd_count = 1;
     *out_fd = fd;
     *have_out_fd = true;
     *resp_size = sizeof(*resp_body);
@@ -2211,8 +2251,15 @@ static uint16_t client_handle_request_buffer_close(
         return VRTD_RET_NOEXIST;
     }
 
-    /* Search for the buffer by physical address. */
+    /*
+     * Search for the caller's buffer by physical address.  Raw buffers bypass
+     * the allocator and use caller-specified addresses, so distinct clients can
+     * hold buffers at the same address; scan all matches and pick the one owned
+     * by this connection rather than rejecting on the first address match.
+     */
     struct buffer *found = NULL;
+    bool addr_size_match_foreign = false; /* same addr+size, owned by another conn */
+    bool addr_match_size_mismatch = false; /* same addr, different size */
     for (size_t i = 0; i < d->buffers.len; ++i) {
         struct buffer *buf = d->buffers.d[i];
         if (buf == NULL) {
@@ -2221,15 +2268,20 @@ static uint16_t client_handle_request_buffer_close(
         if (buf->addr != req_body->phys_addr) {
             continue;
         }
-        /* Found a buffer at the right address -- verify size. */
         if (buf->size != req_body->size) {
-            LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (expected %llu, got %llu)",
-                (unsigned long long)req_body->phys_addr,
-                (unsigned long long)buf->size, (unsigned long long)req_body->size);
-            return VRTD_RET_INVALID_ARGUMENT;
+            addr_match_size_mismatch = true;
+            continue;
         }
-        /* Verify ownership: only the client that opened the buffer may close it. */
         if (buf->client_id != client->conn_id) {
+            addr_size_match_foreign = true;
+            continue;
+        }
+        found = buf;
+        break;
+    }
+
+    if (found == NULL) {
+        if (addr_size_match_foreign) {
             char pwbuf[1024];
             LOG(
                 LOG_WARNING,
@@ -2239,11 +2291,12 @@ static uint16_t client_handle_request_buffer_close(
             );
             return VRTD_RET_AUTH_ERROR;
         }
-        found = buf;
-        break;
-    }
-
-    if (found == NULL) {
+        if (addr_match_size_mismatch) {
+            LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (got %llu)",
+                (unsigned long long)req_body->phys_addr,
+                (unsigned long long)req_body->size);
+            return VRTD_RET_INVALID_ARGUMENT;
+        }
         LOG(LOG_NOTICE, "buffer_close: no buffer at addr=0x%llx on device %u",
             (unsigned long long)req_body->phys_addr, (unsigned int)req_body->dev_number);
         return VRTD_RET_NOEXIST;
diff --git a/vrt/vrtd/src/serve.h b/vrt/vrtd/src/serve.h
index 55cdd9ba..8d4e728c 100644
--- a/vrt/vrtd/src/serve.h
+++ b/vrt/vrtd/src/serve.h
@@ -73,9 +73,11 @@ struct client {
     /** @brief True when @c in_fd contains a valid received file descriptor. */
     bool have_in_fd;
 
-    /** @brief File descriptor to send back to the client via SCM_RIGHTS ancillary data. */
-    int out_fd;
-    /** @brief True when @c out_fd contains a valid file descriptor to transmit. */
+    /** @brief File descriptors to send back to the client via SCM_RIGHTS ancillary data. */
+    int out_fds[2];
+    /** @brief Number of valid descriptors in @c out_fds. */
+    uint32_t out_fd_count;
+    /** @brief True when @c out_fds contains at least one valid file descriptor to transmit. */
     bool have_out_fd;
 
     /** @brief True when a complete request has been read into @c inb and is awaiting dispatch. */
diff --git a/vrt/vrtd/tests/CMakeLists.txt b/vrt/vrtd/tests/CMakeLists.txt
index f5197f45..241f01d4 100644
--- a/vrt/vrtd/tests/CMakeLists.txt
+++ b/vrt/vrtd/tests/CMakeLists.txt
@@ -29,5 +29,6 @@ add_vrtd_test(hotplug_test hotplug_test.cpp)
 add_vrtd_test(config_test config_test.cpp)
 add_vrtd_test(auth_test auth_test.cpp)
 add_vrtd_test(buffer_test buffer_test.cpp)
+add_vrtd_test(v80_policy_test v80_policy_test.cpp)
 add_vrtd_test(design_writer_test design_writer_test.cpp)
 add_vrtd_test(device_test device_test.cpp)
diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp
index 078f5819..1038678b 100644
--- a/vrt/vrtd/tests/buffer_test.cpp
+++ b/vrt/vrtd/tests/buffer_test.cpp
@@ -36,6 +36,27 @@ static constexpr const char    *REAL_QDMA_PATH   = "/dev/slash_qdma_ctl0";
 static constexpr uint64_t       XFER_SIZE        = 4096;
 static constexpr uint64_t       CLIENT_ID        = 42;
 
+static void qpair_fd_round_trip(int fd, uint64_t addr, const uint8_t *src, uint8_t *dst)
+{   // Test helper: send XFER_SIZE bytes from src to device address addr through fd, then read them back into dst.
+    struct slash_qdma_buffer write_buf{};  // staging buffer for the H2C leg
+    struct slash_qdma_buffer read_buf{};   // landing buffer for the C2H leg
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &write_buf), 0);  // a fatal failure returns from this helper only, not from the calling test
+    ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &read_buf), 0);  // NOTE(review): write_buf is not destroyed if this assertion fails
+    std::memcpy(write_buf.addr, src, XFER_SIZE);
+    // Host-to-card: push the staged bytes to addr.
+    ssize_t written = slash_qdma_qpair_transfer(
+        fd, write_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_H2C);  // 0: presumably the offset into the host buffer (confirm against the API)
+    EXPECT_EQ(written, static_cast<ssize_t>(XFER_SIZE));
+    // Card-to-host: read the same device range back.
+    ssize_t read_bytes = slash_qdma_qpair_transfer(
+        fd, read_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_C2H);
+    EXPECT_EQ(read_bytes, static_cast<ssize_t>(XFER_SIZE));
+    std::memcpy(dst, read_buf.addr, XFER_SIZE);  // copied even when a transfer check above failed (EXPECT_* is non-fatal)
+    // Release both buffers; a destroy failure is reported but does not abort.
+    EXPECT_EQ(slash_qdma_buffer_destroy(&write_buf), 0);
+    EXPECT_EQ(slash_qdma_buffer_destroy(&read_buf), 0);
+}
+
 // ─── Null / argument validation (no hardware needed, always run) ──────────────
 
 TEST(BufferNullTest, NullQdma) {
@@ -43,7 +64,7 @@ TEST(BufferNullTest, NullQdma) {
     ASSERT_NE(map, nullptr);
     struct buffer *buf = buffer_create(nullptr, map, ALLOCATION_TYPE_DDR,
                                        VRTD_ALLOC_DIR_HOST_TO_DEVICE,
-                                       XFER_SIZE, 0, CLIENT_ID, nullptr);
+                                       XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     EXPECT_EQ(buf, nullptr);
     device_memory_map_cleanup(map);
 }
@@ -53,7 +74,7 @@ TEST(BufferNullTest, NullMap) {
     ASSERT_NE(qdma, nullptr);
     struct buffer *buf = buffer_create(qdma, nullptr, ALLOCATION_TYPE_DDR,
                                        VRTD_ALLOC_DIR_HOST_TO_DEVICE,
-                                       XFER_SIZE, 0, CLIENT_ID, nullptr);
+                                       XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     EXPECT_EQ(buf, nullptr);
     slash_qdma_close(qdma);
 }
@@ -65,7 +86,7 @@ TEST(BufferNullTest, ZeroSize) {
     ASSERT_NE(map, nullptr);
     struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR,
                                        VRTD_ALLOC_DIR_HOST_TO_DEVICE,
-                                       0, 0, CLIENT_ID, nullptr);
+                                       0, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     EXPECT_EQ(buf, nullptr);
     device_memory_map_cleanup(map);
     slash_qdma_close(qdma);
@@ -78,7 +99,7 @@ TEST(BufferNullTest, ZeroClientId) {
     ASSERT_NE(map, nullptr);
     struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR,
                                        VRTD_ALLOC_DIR_HOST_TO_DEVICE,
-                                       XFER_SIZE, 0, 0, nullptr);
+                                       XFER_SIZE, 0, 0, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     EXPECT_EQ(buf, nullptr);
     device_memory_map_cleanup(map);
     slash_qdma_close(qdma);
@@ -91,7 +112,7 @@ TEST(BufferNullTest, InvalidDirection) {
     ASSERT_NE(map, nullptr);
     struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR,
                                        static_cast<vrtd_alloc_dir>(99),
-                                       XFER_SIZE, 0, CLIENT_ID, nullptr);
+                                       XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     EXPECT_EQ(buf, nullptr);
     device_memory_map_cleanup(map);
     slash_qdma_close(qdma);
@@ -103,7 +124,8 @@ TEST(BufferNullTest, CleanupNull) {
 
 TEST(BufferNullTest, RawNullQdma) {
     struct buffer *buf = buffer_create_raw(nullptr, DDR_START_ADDRESS, XFER_SIZE,
-                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE);
+                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID,
+                                           SLASH_QDMA_MM_CHANNEL_AUTO);
     EXPECT_EQ(buf, nullptr);
     EXPECT_EQ(errno, EINVAL);
 }
@@ -112,7 +134,8 @@ TEST(BufferNullTest, RawZeroSize) {
     struct slash_qdma *qdma = slash_qdma_open("@mock");
     ASSERT_NE(qdma, nullptr);
     struct buffer *buf = buffer_create_raw(qdma, DDR_START_ADDRESS, 0,
-                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE);
+                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID,
+                                           SLASH_QDMA_MM_CHANNEL_AUTO);
     EXPECT_EQ(buf, nullptr);
     EXPECT_EQ(errno, EINVAL);
     slash_qdma_close(qdma);
@@ -154,20 +177,17 @@ class BufferTest : public ::testing::TestWithParam<bool> {
 TEST_P(BufferTest, LifecycleBidirectional) {
     struct buffer *buf = buffer_create(qdma_, map_, ALLOCATION_TYPE_DDR,
                                        VRTD_ALLOC_DIR_BIDIRECTIONAL,
-                                       XFER_SIZE, 0, CLIENT_ID, nullptr);
+                                       XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr);
     ASSERT_NE(buf, nullptr);
+    ASSERT_GE(buf->qpair_count, 1u);
     EXPECT_GE(buf->fd, 0);
 
     uint8_t src[XFER_SIZE];
     for (size_t i = 0; i < XFER_SIZE; ++i)
         src[i] = static_cast<uint8_t>(i & 0xFF);
 
-    ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast<off_t>(buf->addr));
-    EXPECT_EQ(written, static_cast<ssize_t>(XFER_SIZE));
-
     uint8_t dst[XFER_SIZE]{};
-    ssize_t read_bytes = pread(buf->fd, dst, XFER_SIZE, static_cast<off_t>(buf->addr));
-    EXPECT_EQ(read_bytes, static_cast<ssize_t>(XFER_SIZE));
+    qpair_fd_round_trip(buf->fd, buf->addr, src, dst);
     EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0);
 
     cleanup_buffer(buf);
@@ -175,20 +195,19 @@ TEST_P(BufferTest, LifecycleBidirectional) {
 
 TEST_P(BufferTest, RawCreateAndIO) {
     struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE,
-                                           VRTD_ALLOC_DIR_BIDIRECTIONAL);
+                                           VRTD_ALLOC_DIR_BIDIRECTIONAL, CLIENT_ID,
+                                           SLASH_QDMA_MM_CHANNEL_AUTO);
     ASSERT_NE(buf, nullptr);
+    ASSERT_GE(buf->qpair_count, 1u);
     EXPECT_GE(buf->fd, 0);
     EXPECT_EQ(buf->addr, DDR_START_ADDRESS);
     EXPECT_FALSE(buf->allocation_valid);
+    EXPECT_EQ(buf->client_id, CLIENT_ID);
 
     uint8_t src[XFER_SIZE];
     std::memset(src, 0xCD, sizeof(src));
-    ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast<off_t>(DDR_START_ADDRESS));
-    EXPECT_EQ(written, static_cast<ssize_t>(XFER_SIZE));
-
     uint8_t dst[XFER_SIZE]{};
-    ssize_t n = pread(buf->fd, dst, XFER_SIZE, static_cast<off_t>(DDR_START_ADDRESS));
-    EXPECT_EQ(n, static_cast<ssize_t>(XFER_SIZE));
+    qpair_fd_round_trip(buf->fd, DDR_START_ADDRESS, src, dst);
     EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0);
 
     cleanup_buffer(buf);
@@ -202,20 +221,22 @@ TEST_P(BufferTest, QueueExhaustion) {
         GTEST_SKIP() << "Queue exhaustion test is mock-only";
     }
 
-    static constexpr int MAX_QUEUES = 64;
+    static constexpr int MAX_BUFFERS = 32; /* two mock queues per buffer */
     std::vector<struct buffer *> bufs;
-    bufs.reserve(MAX_QUEUES);
+    bufs.reserve(MAX_BUFFERS);
 
-    for (int i = 0; i < MAX_QUEUES; ++i) {
+    for (int i = 0; i < MAX_BUFFERS; ++i) {
         struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS + i * XFER_SIZE,
-                                               XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE);
-        ASSERT_NE(buf, nullptr) << "Expected success for queue " << i;
+                                               XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID,
+                                               SLASH_QDMA_MM_CHANNEL_AUTO);
+        ASSERT_NE(buf, nullptr) << "Expected success for buffer " << i;
         bufs.push_back(buf);
     }
 
-    /* 65th allocation must fail */
+    /* 33rd allocation needs queues 65/66 and must fail. */
     struct buffer *overflow = buffer_create_raw(qdma_, DDR_START_ADDRESS,
-                                                XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE);
+                                                XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID,
+                                                SLASH_QDMA_MM_CHANNEL_AUTO);
     EXPECT_EQ(overflow, nullptr);
     EXPECT_EQ(errno, ENOSPC);
 
diff --git a/vrt/vrtd/tests/device_test.cpp b/vrt/vrtd/tests/device_test.cpp
index 36518c10..c66fd9f6 100644
--- a/vrt/vrtd/tests/device_test.cpp
+++ b/vrt/vrtd/tests/device_test.cpp
@@ -149,7 +149,8 @@ TEST(DeviceCleanupTest, CleanupWithBuffers) {
 
     /* Allocate a raw buffer on the mock QDMA and hand ownership to d->buffers. */
     struct buffer *buf = buffer_create_raw(d->qdma, DDR_START_ADDRESS, 4096,
-                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE);
+                                           VRTD_ALLOC_DIR_HOST_TO_DEVICE, /*client_id=*/1,
+                                           SLASH_QDMA_MM_CHANNEL_AUTO);
     ASSERT_NE(buf, nullptr);
 
     int ret = buffer_ptr_array_push_move(&d->buffers, &buf);
diff --git a/vrt/vrtd/tests/v80_policy_test.cpp b/vrt/vrtd/tests/v80_policy_test.cpp
new file mode 100644
index 00000000..068a724b
--- /dev/null
+++ b/vrt/vrtd/tests/v80_policy_test.cpp
@@ -0,0 +1,124 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+
+#include "../libvrtd/src/v80_policy.h"
+
+namespace {
+
+constexpr uint64_t STEP = 4096;
+constexpr uint64_t MiB = 1024ULL * 1024ULL;
+constexpr uint64_t GiB = 1024ULL * MiB;
+
+// A single available queue always carries the whole transfer as one segment on qpair index 0.
+TEST(V80Plan, SingleQueueIsWhole) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 1, segs);  // 5th argument (1): presumably the available-queue count (confirm in v80_policy.h)
+    ASSERT_EQ(n, 1u);  // fatal, so segs[0] is only inspected when exactly one segment was reported
+    EXPECT_EQ(segs[0].qpair_index, 0u);
+    EXPECT_EQ(segs[0].offset, 0u);
+    EXPECT_EQ(segs[0].size, 512 * MiB);
+}
+
+// DDR has a single NSU, so the range is split in half across both channels.
+TEST(V80Plan, DdrSplitsInHalf) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, 512 * MiB, STEP, 2, segs);
+    ASSERT_EQ(n, 2u);  // fatal, so segs[1] is never read unless two segments were reported
+    EXPECT_EQ(segs[0].qpair_index, 0u);  // first half on qpair index 0
+    EXPECT_EQ(segs[0].offset, 0u);
+    EXPECT_EQ(segs[0].size, 256 * MiB);
+    EXPECT_EQ(segs[1].qpair_index, 1u);  // second half on qpair index 1
+    EXPECT_EQ(segs[1].offset, 256 * MiB);  // starts at the midpoint of the 512 MiB range
+    EXPECT_EQ(segs[1].size, 256 * MiB);
+}
+
+// A DDR transfer too small to halve along the step boundary stays on qpair index 0.
+TEST(V80Plan, DdrTinyTransferStaysOnPrimary) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, STEP, STEP, 2, segs);  // size and step are both STEP
+    ASSERT_EQ(n, 1u);
+    EXPECT_EQ(segs[0].qpair_index, 0u);
+    EXPECT_EQ(segs[0].size, STEP);  // the whole transfer stays in the single segment
+}
+
+// An HBM buffer entirely below the half-boundary uses channel 0 only.
+TEST(V80Plan, HbmLowerHalfChannel0) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 2, segs);  // NOTE(review): relies on VRTD_V80_HBM_HALF exceeding 512 MiB; value not visible here
+    ASSERT_EQ(n, 1u);  // two queues offered, yet only one segment is expected
+    EXPECT_EQ(segs[0].qpair_index, 0u);
+    EXPECT_EQ(segs[0].offset, 0u);
+    EXPECT_EQ(segs[0].size, 512 * MiB);
+}
+
+// An HBM buffer entirely at/above the half-boundary uses channel 1 only.
+TEST(V80Plan, HbmUpperHalfChannel1) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF + 4 * GiB;  // starts 4 GiB past the half-boundary
+    uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs);
+    ASSERT_EQ(n, 1u);
+    EXPECT_EQ(segs[0].qpair_index, 1u);
+    EXPECT_EQ(segs[0].offset, 0u);  // offset 0 is expected even though base is far above VRTD_V80_HBM_BASE
+    EXPECT_EQ(segs[0].size, 512 * MiB);
+}
+
+// A buffer sitting exactly on the boundary belongs to the upper half.
+TEST(V80Plan, HbmOnBoundaryIsUpperHalf) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF;  // first address at the half-boundary
+    uint32_t n = vrtd_plan_v80(base, 0, 256 * MiB, STEP, 2, segs);
+    ASSERT_EQ(n, 1u);
+    EXPECT_EQ(segs[0].qpair_index, 1u);  // only the channel choice is checked; offset and size are not asserted here
+}
+
+// An HBM range straddling the boundary splits exactly at it.
+TEST(V80Plan, HbmSpanningSplitsAtBoundary) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF - 256 * MiB;  // starts 256 MiB below the half-boundary
+    uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs);
+    ASSERT_EQ(n, 2u);
+    EXPECT_EQ(segs[0].qpair_index, 0u);  // part below the boundary
+    EXPECT_EQ(segs[0].offset, 0u);
+    EXPECT_EQ(segs[0].size, 256 * MiB);
+    EXPECT_EQ(segs[1].qpair_index, 1u);  // part at and above the boundary
+    EXPECT_EQ(segs[1].offset, 256 * MiB);  // offsets are expected relative to base, not as absolute addresses
+    EXPECT_EQ(segs[1].size, 256 * MiB);
+}
+
+// The split point is computed from the absolute device address, so a non-zero
+// buffer offset that crosses the boundary is honoured.
+TEST(V80Plan, HbmSpanningWithOffset) {
+    vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS];
+    uint64_t offset = VRTD_V80_HBM_HALF - STEP;  // range starts STEP bytes below the half-boundary
+    uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, offset, 2 * STEP, STEP, 2, segs);  // 2 * STEP long, so the boundary falls in the middle
+    ASSERT_EQ(n, 2u);
+    EXPECT_EQ(segs[0].qpair_index, 0u);
+    EXPECT_EQ(segs[0].offset, offset);  // segment offsets are expected in the same buffer-relative terms as the input offset
+    EXPECT_EQ(segs[0].size, STEP);
+    EXPECT_EQ(segs[1].qpair_index, 1u);
+    EXPECT_EQ(segs[1].offset, offset + STEP);  // second segment begins exactly at the half-boundary
+    EXPECT_EQ(segs[1].size, STEP);
+}
+
+}  // namespace