diff --git a/.gitignore b/.gitignore index 5c17336a..e90cb2ef 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ driver/kcompat/.scratch/ # Python test coverage .coverage + +# Project-local scratch space +/tmp/ diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index 6fbc1faa..25045cc5 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -336,8 +336,8 @@ Memory transfers via QDMA: ``/dev/slash_qdma_ctl`` The QDMA device manages DMA queue pairs for bulk data movement between host memory and the card's on-board memory (HBM or DDR). Each queue pair is allocated with a mode (currently only MM) and a direction mask, then started before use. An anon-inode fd obtained from the queue pair serves as -the I/O channel: ``write()`` performs H2C transfers, ``read()`` performs C2H transfers, and the -file position encodes the device-side physical address. +the transfer channel: DMA buffers are created once with ``BUF_CREATE``, and transfer ioctls name the +buffer fd, buffer offset, device-side physical address, length, and direction. - **Device file name:** ``/dev/slash_qdma_ctl`` (e.g. ``/dev/slash_qdma_ctl0``) - **Sysfs name:** ``slash_qdma_ctl_`` (e.g. ``/sys/class/misc/slash_qdma_ctl_0000:61:00.1``) @@ -353,9 +353,9 @@ Usage ----- In order to transfer data via QDMA, a queue pair must be added, started, and an I/O fd needs -to be created. The I/O fd treats the file position as the device-side physical address: -``write()`` performs an H2C (host-to-card) transfer, and ``read()`` performs a C2H (card-to-host) -transfer. Full lifecycle: +to be created. The I/O fd is ioctl-only for data movement: userspace creates a DMA buffer, +then issues transfer ioctls that name the buffer fd, buffer offset, device-side address, +length, and direction. Full lifecycle: .. code-block:: c @@ -381,32 +381,58 @@ transfer.
Full lifecycle: }; int io_fd = ioctl(qdma_fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &fd_req); - /* Step 4: H2C transfer to device address 0x4000000000 */ - pwrite(io_fd, host_buf, nbytes, 0x4000000000LL); + /* Step 4: Create a kernel-owned DMA buffer and mmap it for CPU access. + * The buffer fd is returned by the ioctl; the kernel allocated the pages, + * built the SGL, and DMA-mapped everything once. */ + struct slash_qdma_buf_create bc = { .size = sizeof(bc), .length = nbytes }; + int buf_fd = ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &bc); + void *host_buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, + buf_fd, 0); + + /* Step 5: H2C transfer to device address 0x4000000000. The transfer + * carries an array of per-qpair sub-transfers; a single-channel fd uses + * one sub-transfer with qpair_index 0. */ + struct slash_qdma_transfer xfer = { + .size = sizeof(xfer), + .count = 1, + .xfers[0] = { + .qpair_index = 0, + .direction = SLASH_QDMA_XFER_H2C, + .buf_fd = buf_fd, + .buf_offset = 0, + .dev_addr = 0x4000000000LL, + .length = nbytes, + }, + }; + ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); - /* Step 5: C2H transfer from device address 0x4000000000 */ - pread(io_fd, host_buf, nbytes, 0x4000000000LL); + /* Step 6: C2H transfer from device address 0x4000000000 */ + xfer.xfers[0].direction = SLASH_QDMA_XFER_C2H; + ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); - /* Step 6: Teardown */ + /* Step 7: Teardown — closing the buffer fd (after munmap) releases it. */ + munmap(host_buf, nbytes); + close(buf_fd); close(io_fd); op.op = 1; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* STOP */ op.op = 2; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* DEL */ -The file position can also be set explicitly with ``lseek`` before a plain ``read()``/``write()``: - -.. 
code-block:: c - - lseek(io_fd, 0x1000, SEEK_SET); - write(io_fd, src_buf, nbytes); - -``lseek`` supports all flags ``SEEK_SET``, ``SEEK_CUR``, and ``SEEK_END``, and both ``pread`` and -``pwrite`` are supported. However, the fd does **not** support ``mmap``, ``poll``/``select``, or -``splice``. +The qpair fd does **not** support ``read``, ``write``, ``pread``, ``pwrite``, ``mmap``, +``poll``/``select``, or ``splice`` for data movement. Buffer fds returned by +``SLASH_QDMA_IOCTL_BUF_CREATE`` **are** mappable with ``mmap`` (full length, +offset 0). All transfers are synchronous and block until the transfer completes or times out. The timeout is **10 seconds**; after expiry the call returns ``-ETIME``. Partial transfers are possible; the return value is the number of bytes transferred, and the file position is advanced accordingly. +The userspace buffer address and ``count`` must be page-aligned: the address +must be 4 KiB-aligned and ``count`` must be a non-zero multiple of 4 KiB. The +transfer is backed by 4 KiB base pages, one descriptor per page. Transparent +hugepages are not accepted, so callers using anonymous mappings should apply +``MADV_NOHUGEPAGE`` before faulting pages when they need deterministic +base-page transfers. + Multiple fds can be obtained for the same qpair via multiple ``QPAIR_GET_FD`` calls, including from different processes. Concurrent ``read()``/``write()`` calls on the same qpair (from any fd or thread) are serialized by the kernel and execute one at a time; for parallel I/O, allocate @@ -425,7 +451,7 @@ The following errno values can be returned by ``read()`` and ``write()`` on the * - ``-ENODEV`` - Device shutting down, or the required direction is not enabled for this qpair * - ``-EINVAL`` - - Zero-length transfer (``count`` results in 0 pages) + - Zero-length, unaligned, or non-page-multiple transfer * - ``-ENOMEM`` - SGL allocation failure * - ``-EFAULT`` @@ -667,38 +693,49 @@ removed. 
``SLASH_QDMA_IOCTL_QPAIR_GET_FD`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Creates a new file descriptor for data transfer on an existing queue pair. The returned fd supports -``read``, ``write``, ``pread``, ``pwrite``, and ``lseek``; it does **not** support ``mmap``, -``poll``/``select``, or ``splice``. Multiple fds can be obtained for the same qpair via multiple -calls. The fd is returned as the ``ioctl()`` return value. +Creates a new file descriptor for data transfer. The fd is a **collection of one or two queue +pairs** (typically one per AXI-MM/NoC channel): a transfer issued on it selects a bound queue pair +by index, so one transfer ioctl can fan out across both channels. The returned fd is ioctl-only for +data movement: it supports the buffer-create and transfer ioctls, but not ``read``, +``write``, ``pread``, ``pwrite``, ``mmap``, ``poll``/``select``, or ``splice`` (an optional +``io_uring`` ``uring_cmd`` async transfer path is available on capable kernels). Multiple fds can +be obtained for the same qpair(s) via multiple calls. The fd is returned as the ``ioctl()`` return +value. **Interface:** ..
code-block:: c + #define SLASH_QDMA_FD_MAX_QPAIRS 2u + #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request) struct slash_qdma_qpair_fd_request { - __u32 size; /* [in/out] ABI version */ - __u32 qid; /* [in] Queue pair ID (must exist and be non-empty) */ - __u32 flags; /* [in] fd flags: only O_CLOEXEC is honoured */ + __u32 size; /* [in/out] ABI version */ + __u32 qid; /* [in] Legacy single qpair ID; used when qpair_count == 0 */ + __u32 flags; /* [in] fd flags: only O_CLOEXEC is honoured */ + __u32 qpair_count; /* [in] Number of qpair_ids (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use qid */ + __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /* [in] qpair IDs; index == qpair_index */ }; -**Direction:** ``_IOWR`` — userspace writes ``qid`` and ``flags``; the kernel returns the new fd -as the ``ioctl()`` return value (not as a struct field). +**Direction:** ``_IOWR`` — userspace writes the qpair selection and ``flags``; the kernel returns +the new fd as the ``ioctl()`` return value (not as a struct field). **Preconditions:** -- ``size`` must cover at least ``flags`` (the trailing input field) — otherwise ``-EINVAL`` -- ``qid`` must refer to an existing, non-empty queue pair +- ``size`` must cover at least ``flags`` (the trailing input field of the legacy form) — otherwise ``-EINVAL`` +- The selected queue pairs must exist and be non-empty (``qpair_count == 0`` selects the single ``qid``) +- ``qpair_count`` must not exceed ``SLASH_QDMA_FD_MAX_QPAIRS`` - ``flags & ~O_CLOEXEC == 0`` (any other bits cause ``-EINVAL``) -- The queue pair should be in the started state for I/O to work +- The queue pairs should be in the started state for I/O to work +- Each bound qpair keeps the per-qpair configuration (``mm_channel``, ring sizes, directions) it was + given at ``QPAIR_ADD`` time, so the two channels can be configured independently **Postconditions:** - The return value is a non-negative fd number on success. 
-- The fd holds a reference on both the qpair entry and the device; neither can be freed while - this fd is open. +- The fd holds a reference on the qpair entry, device, and the client context that owns registered + buffers; none of these can be freed while this fd is open. **Return values:** @@ -710,6 +747,146 @@ as the ``ioctl()`` return value (not as a struct field). - ``-ENOMEM`` — allocation failure - Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()`` +``SLASH_QDMA_IOCTL_BUF_CREATE`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Creates a kernel-owned DMA buffer and returns a mappable fd for it. The ioctl may be issued on +either the QDMA control fd or a qpair fd of the same device. The kernel allocates ``length`` bytes +as a set of 4 KiB base pages (not physically contiguous), builds the transfer scatter-gather list, +and DMA-maps every page **once** — so the steady-state transfer path only slices the prebuilt SGL, +syncs the touched pages, and submits. Userspace maps the returned fd with ``mmap`` to obtain a CPU +pointer and passes the fd in ``struct slash_qdma_subxfer`` to move data. The buffer is bound to the +fd's QDMA device; transfers must use a qpair fd of that same device. + +**Interface:** + +.. code-block:: c + + #define SLASH_QDMA_IOCTL_BUF_CREATE _IOWR('v', 0x54, struct slash_qdma_buf_create) + + struct slash_qdma_buf_create { + __u32 size; /* [in/out] ABI version */ + __u32 flags; /* [in] Only O_CLOEXEC is honoured */ + __u64 length; /* [in] Buffer length in bytes (page multiple) */ + __u32 granule; /* [out] Bytes per SGL descriptor (host page size) */ + __u32 transfer_hint; /* [out] enum slash_qdma_transfer_hint */ + }; + +**Direction:** ``_IOWR`` — issued on the control fd or a qpair fd. Userspace writes ``flags`` and +``length``; the kernel writes back ``granule`` and ``transfer_hint`` and returns the new buffer fd +as the ``ioctl()`` return value (same convention as the BAR/queue-pair fd ioctls).
+ +The returned fd: + +- is ``mmap``-able (full length, offset 0, ``MAP_SHARED``) for CPU access to the buffer; +- releases the buffer when it (and any mapping) is closed — there is no explicit unregister ioctl; +- keeps its pages (and DMA mapping) alive as long as either the fd or any mapping exists. + +``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be +best for this buffer on the current hardware. Current SLASH hardware returns +``SLASH_QDMA_TRANSFER_HINT_V80``; userspace may ignore this value. Known values are: + +.. code-block:: c + + enum slash_qdma_transfer_hint { + SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, + SLASH_QDMA_TRANSFER_HINT_V80 = 2, + }; + +``SLASH_QDMA_TRANSFER_HINT_V80`` asks userspace to apply the V80 placement-aware channel policy: +spread a transfer across both AXI-MM channels so each NoC ingress master (NMU) drives an +independent memory endpoint (NSU). The marker is opaque; the client computes the actual split from +the buffer's device address (DDR ranges are halved across the two channels, while HBM ranges are +routed by the 16 GiB half-memory boundary). ``SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR`` keeps all +traffic on a single queue. 
+ +**Preconditions:** + +- ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL`` +- ``flags`` must contain only ``O_CLOEXEC`` +- ``length`` must be a non-zero multiple of the page size + +**Postconditions:** + +- the ``ioctl()`` return value is the new buffer fd (``>= 0``) +- ``granule`` is the per-descriptor page size (4 KiB); ``transfer_hint`` is an advisory topology hint +- the pages stay allocated and DMA-mapped until the fd and all mappings are closed and no transfer + is in flight + +**Return values:** + +- ``>= 0`` — the new buffer fd (success) +- ``-EFAULT`` — copy failure +- ``-EINVAL`` — ``size`` too small, unsupported ``flags`` bits, or misaligned/zero ``length`` +- ``-ENOMEM`` — page allocation or DMA-mapping failure +- ``-ENODEV`` — device shutting down +- Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()`` + +The ``'v'`` ``0x55`` ioctl number is reserved (it was the removed +``SLASH_QDMA_IOCTL_BUF_UNREGISTER``; kernel buffers are now released by closing the fd). + +``SLASH_QDMA_QPAIR_IOCTL_TRANSFER`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Performs a DMA transfer batch using kernel buffers. Unlike ``read``/``write``/``pread``/``pwrite``, +this ioctl is issued on a **queue-pair I/O fd** (from ``SLASH_QDMA_IOCTL_QPAIR_GET_FD``), not the +control device. The transfer carries an array of per-qpair sub-transfers; sub-transfers that target +distinct queue pairs are submitted **concurrently** (all but the last asynchronously, the last +blocking, then awaited), so a single ioctl can drive both NoC channels in parallel. No pages are +allocated or DMA-mapped on this path — that work was amortised at ``BUF_CREATE`` time — so each +sub-transfer syncs and submits the cached, pre-DMA-mapped SGL slice directly. + +**Interface:** + +.. 
code-block:: c + + #define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) + + struct slash_qdma_subxfer { + __u32 qpair_index; /* [in] Index into the fd's bound qpairs */ + __u32 direction; /* [in] 1=H2C (write), 2=C2H (read) */ + __s32 buf_fd; /* [in] Kernel buffer fd from BUF_CREATE */ + __u32 pad0; /* padding */ + __u64 buf_offset; /* [in] Byte offset within the buffer */ + __u64 dev_addr; /* [in] Device-side (endpoint) address */ + __u64 length; /* [in] Number of bytes to transfer */ + }; + + struct slash_qdma_transfer { + __u32 size; /* [in/out] ABI version */ + __u32 count; /* [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS) */ + struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS]; + }; + +**Direction:** ``_IOWR`` — userspace writes all input fields; the total number of bytes transferred +across all sub-transfers is returned as the ``ioctl()`` return value (not as a struct field). + +**Preconditions:** + +- ``size`` must cover at least ``count`` (the trailing header field) — otherwise ``-EINVAL`` +- ``count`` must be in ``[1, SLASH_QDMA_FD_MAX_QPAIRS]`` +- each sub-transfer's ``qpair_index`` must be less than the number of qpairs the fd owns +- each ``direction`` must be 1 (H2C) or 2 (C2H) and must be enabled on the selected queue pair +- each ``buf_fd`` must be a buffer fd (from ``BUF_CREATE``) bound to the same device as this qpair fd +- each ``buf_offset`` and ``length`` must be aligned to the buffer's page granule, ``length`` non-zero + and ``<= UINT_MAX``, and ``buf_offset + length`` must not exceed the buffer length + +**Return values:** + +- ``>= 0`` — total number of bytes transferred (success) +- ``-EFAULT`` — copy failure +- ``-EBADF`` — a ``buf_fd`` is not a valid open fd +- ``-EINVAL`` — ``size``/``count`` invalid, bad ``qpair_index``/``direction``, a ``buf_fd`` that is not + a SLASH buffer or belongs to another device, or an out-of-range / misaligned slice +- ``-ENODEV`` — device shutting down or the
requested direction is not enabled on the qpair +- Other negative errno from libqdma's ``qdma_request_submit()`` (the first sub-transfer error wins) + +An optional asynchronous form of this transfer is exposed via ``io_uring`` ``uring_cmd`` (opcode +``SLASH_QDMA_URING_CMD_TRANSFER``), available only on kernels built with ``CONFIG_IO_URING`` and +``uring_cmd`` support. The SQE inline command carries a single ``__u64`` userspace pointer to a +``struct slash_qdma_transfer``; the completion CQE ``res`` holds the total bytes transferred or a +negative errno. This lets many buffer transfers be kept in flight from a single thread. + Device resets and hotplugging: ``/dev/slash_hotplug`` ===================================================== diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index 563a81b5..2fd1ccb9 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -151,11 +151,27 @@ validate -------- Run memory integrity and bandwidth tests against a board's HBM and DDR -subsystems. +subsystems. For each memory path, bandwidth is reported as single-direction +C2H read, single-direction H2C write, and simultaneous bidirectional +throughput (read, write, and total). After the per-memory phases, a final +parallel phase drives HBM and DDR simultaneously with ``2 * N`` buffers for +single-direction tests and ``4 * N`` threads for bidirectional tests; this +phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. 
code-block:: text - v80-smi validate -d [-j|--threads ] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] + +Requirements by mode: + +* Default mode uses VRTD buffers, requires a running VRTD daemon, and resets + the board unless ``--no-reset`` is given. +* ``--raw-transfer-test`` bypasses VRTD for transfers and requires the SLASH + QDMA driver device node for the board. It skips reset. +* ``--use-qdma-driver`` bypasses both VRTD and SLASH for transfers and requires + the stock ``qdma-pf`` driver to be bound to the board's QDMA PF. This backend + is built only when ``SMI_ENABLE_QDMA_DRIVER_BACKEND`` is enabled at CMake + configure time. .. option:: -d, --device @@ -164,6 +180,140 @@ subsystems. .. option:: -j, --threads Number of parallel buffers/threads for the validation test (1–64, default 8). + Bidirectional phases use ``2 * N`` logical positions in each enabled memory + space. + +.. option:: --buffer-size + + Size of each test buffer. Values may be bare bytes or use ``k``/``K`` or + ``m``/``M`` suffixes. The default and maximum are ``512M``. Values must be + 4 KiB-aligned. + +.. option:: --offset + + Distance between logical buffer positions. The default is ``512M``. Values + may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes, must be + 4 KiB-aligned, and must be at least ``--buffer-size`` so buffers do not + overlap. + +.. option:: --starting-offset + + Offset from each memory-space base for logical position 0. The default is + ``0``. Values may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes + and must be 4 KiB-aligned. + +Buffers are placed at ``memory_base + starting_offset + position * offset``. +Single-direction phases use positions ``0..N-1``. 
Bidirectional phases use +positions ``0..2N-1`` with reads on even positions and writes on odd positions. +The full range must remain inside the 64 x 512 MB DDR/HBM address space. If any +placement option is specified in default VRTD mode, ``validate`` uses raw VRTD +buffers so the exact addresses are honored; this requires raw memory access +permission. + +The largest phase maps up to ``4 * N * buffer-size`` of host buffers when both +HBM and DDR are enabled, or ``2 * N * buffer-size`` with ``--ddr-only`` or +``--hbm-only``; the command fails early if that exceeds currently available +host memory. + +.. option:: -R, --no-reset + + Skip the device reset step before running memory tests. + +.. option:: --mm-channel + + AXI-MM / NoC channel selection for each buffer's QDMA queue pair, in every + mode. ``spec`` is either a single value applied to all buffers, or a + comma-separated list giving one channel per logical buffer position + (exactly ``2 x --threads`` entries; there is no repeating/wrap, and any + other length is an error): + + * ``auto`` (the default) lets the driver stripe queues across both channels + by ``qid & 1``. + * ``0`` / ``1`` pin the queue to that AXI-MM channel (and hence NoC channel). + * e.g. with ``-j 1`` the list ``0,1`` puts buffer position 0 on channel 0 and + position 1 on channel 1. Bidirectional phases use positions ``0..2N-1``; + single-direction phases use the first ``N`` entries. + + This is independent of ``--channel-allocation`` (which controls the device + address): ``--mm-channel`` controls the host-side NoC ingress (NMU) per + queue. With ``--use-qdma-driver`` the selection maps to the stock driver's + per-queue MM-channel attribute. + +.. option:: --raw-transfer-test + + Use libslash raw QDMA transfers instead of VRTD buffers. This mode implies + ``--no-reset`` and requires the SLASH QDMA driver device to be present. + +.. 
option:: --use-qdma-driver + + Run the raw transfer test over the off-the-shelf Xilinx QDMA driver + (``/dev/qdma-MM-``) instead of SLASH. smi provisions the queues + itself: it raises the function's ``qmax`` via sysfs if needed, creates and + starts bidirectional AXI-MM queue pairs over generic netlink (the same + ``xnl_pf`` interface ``dma-ctl`` uses), then transfers over the per-queue + char devices. Queue pairs are spread round-robin across the function's MM + engine channels (``channel = qid % mm_channel_max``); the CPM5 QDMA on the + V80 exposes two, so the test exercises both. This mode implies + ``--no-reset`` and is mutually exclusive with ``--raw-transfer-test``. It + requires the stock ``qdma-pf`` driver to be bound to the board's PF (it + cannot be bound at the same time as the SLASH driver), and typically + requires root to raise ``qmax`` and open the queue devices. + +.. option:: --ddr-only + + Run only the DDR memory tests and skip the HBM phase. Mutually exclusive + with ``--hbm-only``. + +.. option:: --hbm-only + + Run only the HBM memory tests and skip the DDR phase. Mutually exclusive + with ``--ddr-only``. + +.. option:: --channel-allocation + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``) control + over how QDMA MM/NoC channels map onto device memory. On CPM5 the host-side + NoC ingress port (NMU) is chosen per queue by the SW-context + mm-channel/host_id (SLASH uses ``qid & 1``), while the memory-side NoC egress + endpoint (NSU / pseudo-channel) is chosen by the device address. Default + ``auto`` keeps the historical behaviour: channel ``qid & 1`` with linear + addressing, so both NMUs can converge on a single NSU and bandwidth caps at + one path. ``paired`` couples the two: even positions land in memory region 0 + on channel 0, odd positions in region 1 on channel 1 (one + ``--channel-region-stride`` apart), giving two independent NMU->NSU paths. 
+ This mirrors the off-the-shelf ``dma-perf`` ``offset_ch0``/``offset_ch1`` + knobs and is the placement that lets both NoC ports contribute bandwidth. + +.. option:: --channel-region-stride + + In ``--channel-allocation paired`` mode, the byte distance between the two + per-channel memory regions (the NSU / pseudo-channel stride). Default ``16G`` + (== half the per-memory address space, matching the dma-perf HBM + ``offset_ch1 - offset_ch0`` spacing). Must be a non-zero multiple of 4 KiB. + Accepts bare bytes or ``k``/``K``, ``m``/``M``, ``g``/``G`` suffixes. + +.. option:: --ring-size-index <0-15> + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). + Override the QDMA descriptor-ring size index used when creating SLASH raw + queue pairs or starting stock-driver queues. When omitted, each backend keeps + its existing default. Useful A/B values for 4 KiB descriptor throughput are + ``0``, ``11``, ``13``, and ``15``. + +.. option:: --bandwidth-iterations + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). Repeat + each whole-buffer transfer in every bandwidth phase ``N`` times and report + bandwidth over the sustained loop. The default is ``1``, which preserves the + historical one-shot measurement. + +.. option:: --bandwidth-duration + + Raw-transfer-only duration mode. When non-zero, each bandwidth phase repeats + whole-buffer transfers until the requested wall-clock duration has elapsed + and counts only completed transfers. This is useful for comparing SLASH's raw + path against long-running tools such as ``dma-perf``. A value of ``0`` uses + ``--bandwidth-iterations`` instead. debug ----- diff --git a/driver/Makefile b/driver/Makefile index 98a56815..ac28900e 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -42,8 +42,19 @@ else LIBQDMA_PATH := $(LIBQDMA_FALLBACK) endif +# SLASH carries a few local modifications to the pinned QDMA submodule's +# libqdma sources (see $(LIBQDMA_PATCH_DIR)/). 
The submodule itself stays +# pristine; the patches are applied to whichever libqdma tree is being built +# (the DKMS-local ./libqdma or the in-tree submodule) by the libqdma-patches +# target before the module is compiled. See that target for details. +LIBQDMA_PATCH_DIR := patches + SLASH_QDMA_OP_DEBUG ?= 0 +# Per-transfer timing instrumentation. Set to 1 to emit one dmesg line per +# DMA transfer breaking down the kernel phases. Default off (zero overhead). +SLASH_QDMA_TIMING ?= 0 + # Kcompat feature flags. Defaults are "n"; the all: recipe runs # driver/kcompat/probe.sh against $(KDIR) to detect the actual values # and passes them into the kbuild recursion. Each pair (modern API + @@ -51,6 +62,8 @@ SLASH_QDMA_OP_DEBUG ?= 0 # absent, the legacy form is the unconditional fallback in slash_compat.h. SLASH_HAVE_VM_FLAGS_SET ?= n SLASH_HAVE_MODULE_IMPORT_NS_TOKEN ?= n +SLASH_HAVE_URING_CMD ?= n +SLASH_HAVE_URING_SQE_CMD ?= n # Set GCOV=1 to instrument the module for kernel gcov coverage. # Not set by default — never enable this in production builds. @@ -72,6 +85,7 @@ ccflags-y += \ \ -DTANDEM_BOOT_SUPPORTED=1 \ -DSLASH_QDMA_OP_DEBUG=$(SLASH_QDMA_OP_DEBUG) \ + -DSLASH_QDMA_TIMING=$(SLASH_QDMA_TIMING) \ -DSLASH_VERSION_STR=\"$(SLASH_VERSION)\" ifeq ($(SLASH_HAVE_VM_FLAGS_SET),y) @@ -82,6 +96,25 @@ ifeq ($(SLASH_HAVE_MODULE_IMPORT_NS_TOKEN),y) ccflags-y += -DSLASH_HAVE_MODULE_IMPORT_NS_TOKEN endif +# Optional io_uring uring_cmd async transfer path. Probed by kcompat; absent on +# kernels without CONFIG_IO_URING or uring_cmd support (e.g. RHEL 9, Ubuntu +# 22.04 GA), where the synchronous transfer ioctl remains the only path. +ifeq ($(SLASH_HAVE_URING_CMD),y) +ccflags-y += -DSLASH_HAVE_URING_CMD +endif + +# Selects the io_uring SQE payload accessor: io_uring_sqe_cmd(cmd->sqe) when +# present (newer kernels + distro backports), else cmd->cmd. Only meaningful +# when SLASH_HAVE_URING_CMD is also set. 
+ifeq ($(SLASH_HAVE_URING_SQE_CMD),y) +ccflags-y += -DSLASH_HAVE_URING_SQE_CMD +endif + +# Force-include the compat header into every TU (including the pinned libqdma +# submodule sources we don't modify) so kernel-API shims such as from_timer() +# reach third-party code too. Safe on all kernels: the shims are guarded. +ccflags-y += -include $(src)/slash_compat.h + LIBQDMA_OBJS := \ $(LIBQDMA_PATH)/qdma_mbox.o \ @@ -120,18 +153,80 @@ $(MODULE)-objs += $(LIBQDMA_OBJS) $(QDMA_ACCESS_OBJS) KCOMPAT := "$(SHELL)" "$(PWD)/kcompat/probe.sh" -all: +all: libqdma-patches @flags="$$($(KCOMPAT) "$(KDIR)" | tr '\n' ' ')"; \ echo "slash: kcompat: $$flags"; \ $(MAKE) -C "$(KDIR)" M="$(PWD)" $$flags modules +# Apply SLASH's local libqdma patches ($(LIBQDMA_PATCH_DIR)/*.patch) to the +# libqdma source tree in use, in filename order, right before building. +# +# The pinned submodule is not edited directly by commits: patches live in-tree +# and are stamped onto the working copy here. Application is idempotent — each patch is first tested +# for being already applied (reverse dry-run) and skipped if so — so repeated +# `make` runs, incremental builds, and DKMS rebuilds are all safe. A patch that +# neither applies cleanly nor is already present aborts the build. +# +# $(PWD) is the driver dir for both `make` (in-tree) and DKMS (MAKE[0] runs +# `make -C driver ...`); ./libqdma is the DKMS-packaged copy, otherwise fall +# back to the in-tree submodule path. Uses patch(1) so it is independent of +# whether the libqdma tree lives inside a git checkout. +libqdma-patches: + @set -e; \ + patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \ + set -- "$$patch_dir"/*.patch; \ + if [ ! -e "$$1" ]; then exit 0; fi; \ + if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \ + else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \ + if [ ! 
-d "$$lq" ]; then \ + echo "slash: ERROR libqdma sources not found at $$lq" >&2; \ + echo "slash: run 'git submodule update --init --recursive' first" >&2; \ + exit 1; \ + fi; \ + command -v patch >/dev/null 2>&1 || { \ + echo "slash: ERROR patch(1) not found; it is required to apply libqdma patches" >&2; \ + exit 1; }; \ + for p in "$$@"; do \ + name="$$(basename "$$p")"; \ + if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: libqdma patch already applied, skipping: $$name"; \ + elif patch -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: applying libqdma patch: $$name"; \ + patch -p1 -d "$$lq" -f -s -i "$$p"; \ + else \ + echo "slash: ERROR libqdma patch does not apply cleanly: $$name" >&2; \ + echo "slash: (libqdma tree at $$lq is neither pristine nor already patched)" >&2; \ + exit 1; \ + fi; \ + done + +# Best-effort revert of the libqdma patches, restoring the submodule working +# copy to pristine. Useful when editing the patches themselves. Never fails the +# build: patches that are not currently applied are simply skipped. +unpatch-libqdma: + @set -e; \ + patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \ + set -- "$$patch_dir"/*.patch; \ + if [ ! 
-e "$$1" ]; then exit 0; fi; \ + if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \ + else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \ + [ -d "$$lq" ] || exit 0; \ + for p in $$(printf '%s\n' "$$@" | tac); do \ + name="$$(basename "$$p")"; \ + if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: reverting libqdma patch: $$name"; \ + patch -R -p1 -d "$$lq" -f -s -i "$$p"; \ + fi; \ + done + clean: - $(MAKE) -C "$(KDIR)" M="$(PWD)" clean + -$(MAKE) -C "$(KDIR)" M="$(PWD)" clean rm -rf "$(PWD)/kcompat/.scratch" + $(MAKE) unpatch-libqdma install: all sudo install -d -m 755 /lib/modules/$(shell uname -r)/extra sudo install -m 644 $(MODULE).ko /lib/modules/$(shell uname -r)/extra sudo depmod -a -.PHONY: all clean install +.PHONY: all clean install libqdma-patches unpatch-libqdma diff --git a/driver/README.md b/driver/README.md index 65cd911a..7576dafb 100644 --- a/driver/README.md +++ b/driver/README.md @@ -1,10 +1,58 @@ # SLASH kernel module +## Module parameters + +Exposed under `/sys/module/slash/parameters/` (all writable at runtime; see +`modinfo slash.ko`): + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `qdma_num_threads` | uint | 8 | Number of libqdma worker threads. | +| `qdma_debugfs_path` | charp | disabled | debugfs mount path for libqdma. | + +### A/B testing NoC channel bandwidth + +The AXI-MM / NoC channel is chosen per queue pair when it is added (the +`mm_channel` field of the qpair-add ioctl, `enum slash_qdma_mm_channel`): +`auto` stripes queues across both channels by `qid & 1`, while `0` / `1` pin a +queue to a single channel. Every queue creator carries this setting, so it can +be driven per buffer to check whether both PCIe NMUs (NoC channels) actually +contribute bandwidth. 
With `v80-smi validate`: + +```sh +# All queues on NoC channel 0 (NMU S00) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 0 + +# All queues on NoC channel 1 (NMU S01) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 1 + +# Split across both channels (qid & 1) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel auto + +# Explicit per-buffer split (even positions -> channel 0, odd -> channel 1) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 0,1 +``` + +Debug builds with `SLASH_QDMA_OP_DEBUG=1` log each queue's selected +`mm_channel` when it is added. If the split run is no faster than a single +forced channel, traffic is not being spread across both NMUs. The per-queue +setting affects every queue created through this driver (both the VRTD buffer +path and `--raw-transfer-test`); the off-the-shelf Xilinx QDMA driver path +(`--use-qdma-driver`) honors `--mm-channel` through its own channel attribute. + ## Testing The test suite requires a physical V80 to be present and the module to be loaded into a running kernel. +## Local libqdma patches + +SLASH carries small patches for the pinned `libqdma` submodule under +`driver/patches/`. The driver `Makefile` applies them before building, and +`make clean` attempts to revert them so the submodule working copy returns to +its pristine pinned state. DKMS packages include the same patch directory and +depend on `patch(1)`. + ### Prerequisites - A kernel built with `CONFIG_GCOV_KERNEL=y` (only needed for coverage runs). diff --git a/driver/kcompat/uring_cmd.c b/driver/kcompat/uring_cmd.c new file mode 100644 index 00000000..21e9ef93 --- /dev/null +++ b/driver/kcompat/uring_cmd.c @@ -0,0 +1,78 @@ +/** + * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with this program; if + * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/* + * Probe for the io_uring uring_cmd *infrastructure* in the exact shape + * slash_qdma.c uses, excluding the SQE payload accessor (that axis changed + * independently and is probed separately by uring_sqe_cmd.c): + * - struct file_operations has a .uring_cmd member, + * - struct io_uring_cmd exposes ->pdu, ->file, and ->cmd_op, + * - io_uring_cmd_complete_in_task() takes a (cmd, issue_flags) callback, + * - io_uring_cmd_done() takes (cmd, ret, res2, issue_flags). + * + * This requires CONFIG_IO_URING and a kernel >= 5.19 with the settled + * (>= 6.1) signatures; anywhere it fails to build, SLASH_HAVE_URING_CMD=n and + * the optional async transfer path is compiled out. The payload pointer is + * read via the SLASH_HAVE_URING_SQE_CMD-selected accessor (see slash_qdma.c): + * io_uring_sqe_cmd(cmd->sqe) on newer kernels, cmd->cmd on older ones. 
+ */ + +#include +#include +#include +#include +#if __has_include() +#include +#endif + +static void conftest_tw(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + io_uring_cmd_done(cmd, 0, 0, issue_flags); +} + +static int conftest_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + void *p = cmd->pdu; + struct file *f = cmd->file; + u32 op = cmd->cmd_op; + + (void)p; + (void)f; + (void)op; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + io_uring_cmd_complete_in_task(cmd, conftest_tw); + return -EIOCBQUEUED; +} + +static const struct file_operations conftest_fops = { + .owner = THIS_MODULE, + .uring_cmd = conftest_uring_cmd, +}; + +static int __init conftest_init(void) +{ + (void)conftest_fops; + return 0; +} + +static void __exit conftest_exit(void) +{ +} + +MODULE_LICENSE("GPL"); +module_init(conftest_init); +module_exit(conftest_exit); diff --git a/driver/kcompat/uring_sqe_cmd.c b/driver/kcompat/uring_sqe_cmd.c new file mode 100644 index 00000000..62020b30 --- /dev/null +++ b/driver/kcompat/uring_sqe_cmd.c @@ -0,0 +1,59 @@ +/** + * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with this program; if + * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/* + * Probe for the *newer* io_uring uring_cmd SQE payload accessor. 
+ * + * Upstream removed `struct io_uring_cmd::cmd` (a const void * pointing at the + * inline SQE command payload) and replaced it with `->sqe` plus the + * io_uring_sqe_cmd() accessor. This change was backported into distro kernels + * (e.g. Ubuntu 6.8), so a LINUX_VERSION_CODE check is unreliable — probe the + * accessor directly instead. + * + * - SLASH_HAVE_URING_SQE_CMD=y -> use io_uring_sqe_cmd(cmd->sqe) + * - SLASH_HAVE_URING_SQE_CMD=n -> fall back to cmd->cmd (older kernels) + * + * This probe only governs the payload accessor; the rest of the uring_cmd + * infrastructure is probed by uring_cmd.c (SLASH_HAVE_URING_CMD). + */ + +#include +#include +#include +#include +#if __has_include() +#include +#endif + +static int conftest_uring_sqe_cmd(struct io_uring_cmd *cmd) +{ + const void *payload = io_uring_sqe_cmd(cmd->sqe); + + (void)payload; + return 0; +} + +static int __init conftest_init(void) +{ + (void)conftest_uring_sqe_cmd; + return 0; +} + +static void __exit conftest_exit(void) +{ +} + +MODULE_LICENSE("GPL"); +module_init(conftest_init); +module_exit(conftest_exit); diff --git a/driver/libslash/README.md b/driver/libslash/README.md index 9e04813a..5dc4c07d 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -108,10 +108,23 @@ uint32_t qid = req.qid; slash_qdma_qpair_start(qdma, qid); -/* Get an fd for data transfer — read() = C2H, write() = H2C */ +/* Get an ioctl-only qpair fd for buffer transfers. */ int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); -write(fd, buf, len); /* H2C */ -read(fd, buf, len); /* C2H */ + +/* Create a kernel-owned DMA buffer (length must be a whole number of pages) + * and mmap it for CPU access via buf.addr. Current SLASH hardware reports + * SLASH_QDMA_TRANSFER_HINT_V80 in buf.transfer_hint. */ +struct slash_qdma_buffer buf; +slash_qdma_qpair_buffer_create(fd, len, &buf); +/* ... fill buf.addr from the CPU for an H2C transfer ... 
*/ + +/* H2C: host -> device at dev_addr */ +slash_qdma_qpair_transfer(fd, buf.fd, /*buf_offset=*/0, dev_addr, len, + SLASH_QDMA_XFER_H2C); +/* C2H: device -> host */ +slash_qdma_qpair_transfer(fd, buf.fd, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); + +slash_qdma_buffer_destroy(&buf); close(fd); slash_qdma_qpair_stop(qdma, qid); diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index 8d726544..6f097288 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -31,10 +31,18 @@ * 6. slash_qdma_qpair_del() — destroy * 7. slash_qdma_close() — close the device * - * The fd from qpair_get_fd() supports read() for C2H (card-to-host) - * and write() for H2C (host-to-card) DMA transfers. Positional I/O - * via lseek()/pread()/pwrite() is also supported. splice(), mmap(), - * and poll() are not available. + * The fd from qpair_get_fd() is ioctl-only for data movement: create kernel + * buffers with slash_qdma_buffer_create() (or slash_qdma_qpair_buffer_create() + * through a queue-pair fd), then move them with slash_qdma_qpair_transfer() / + * slash_qdma_qpair_transfer_batch(). read(), write(), and poll() are not + * available for SLASH transfers. + * + * Kernel buffers: + * For high-throughput transfers, the kernel allocates a DMA buffer once + * (pages + SGL + DMA mapping built at creation), returns a mappable fd, and + * userspace mmaps it for CPU access. Transfers reference the buffer by its + * fd instead of re-pinning per call. Closing the buffer fd (and unmapping) + * releases it. * * Error conventions: int-returning functions return -1 with errno set. * Pointer-returning functions return NULL with errno set. @@ -46,6 +54,7 @@ #include "uapi/slash_interface.h" #include +#include #ifdef __cplusplus extern "C" { @@ -144,13 +153,126 @@ int slash_qdma_qpair_del(struct slash_qdma *qdma, uint32_t qid); * @param flags Only O_CLOEXEC is accepted; the kernel returns -EINVAL for * any other bits. 
* - * The returned fd supports read() (C2H) and write() (H2C). Positional - * I/O via lseek()/pread()/pwrite() is also available. + * The returned fd supports transfer and buffer-registration ioctls. It does + * not support read/write data movement; use slash_qdma_qpair_transfer(). * * @return Non-negative fd on success, -1 on failure. */ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); +/** + * @brief Obtain a transfer fd bound to one or more queue pairs. + * + * Like slash_qdma_qpair_get_fd(), but the returned fd is a collection of up to + * SLASH_QDMA_FD_MAX_QPAIRS queue pairs. A transfer issued on the fd selects a + * bound queue pair by its index in @qids, so a single transfer can fan across + * both AXI-MM/NoC channels. Each bound queue pair keeps whatever per-qpair + * settings (mm_channel, ring sizes, directions) it was given at add time. + * + * @param qdma Open QDMA handle. + * @param qids Array of @qpair_count queue pair IDs (must be started). + * @param qpair_count Number of entries in @qids (1..SLASH_QDMA_FD_MAX_QPAIRS). + * @param flags Only O_CLOEXEC is accepted. + * + * @return Non-negative fd on success, -1 on failure (errno set). + */ +int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids, + uint32_t qpair_count, int flags); + +/** + * @brief A kernel-owned DMA buffer and its CPU mapping. + * + * Created by slash_qdma_buffer_create() / slash_qdma_qpair_buffer_create() and + * released by slash_qdma_buffer_destroy(). @addr is an mmap of the kernel + * buffer fd; write/read it from the CPU and move it with the transfer helpers, + * passing @fd as the sub-transfer's buf_fd. + */ +struct slash_qdma_buffer { + int fd; /**< Buffer fd (close via destroy). */ + void *addr; /**< CPU mapping of the buffer. */ + uint64_t length; /**< Buffer length in bytes. */ + uint32_t granule; /**< Bytes per DMA descriptor (page). */ + enum slash_qdma_transfer_hint transfer_hint; /**< Advisory channel policy. 
*/ +}; + +/** + * @brief Create a kernel-owned DMA buffer and mmap it. + * + * Allocates @length bytes of kernel memory (DMA-mapped once), returns a buffer + * fd, and mmaps it into @buf_out->addr for CPU access. The buffer is bound to + * @qdma's device; transfers must use a queue-pair fd of the same device. + * + * @param qdma Open QDMA handle. + * @param length Buffer length in bytes (non-zero multiple of the page size). + * @param buf_out [out] Receives the created buffer (fd, mapping, metadata). + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length, + struct slash_qdma_buffer *buf_out); + +/** + * @brief Create a kernel-owned DMA buffer through a queue-pair fd. + * + * Same semantics as slash_qdma_buffer_create(), but issues the create ioctl on + * @p qpair_fd. This is the preferred form for clients that received only qpair + * fds via SCM_RIGHTS (for example libvrtd clients). + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length, + struct slash_qdma_buffer *buf_out); + +/** + * @brief Release a buffer created with slash_qdma_buffer_create() or + * slash_qdma_qpair_buffer_create(). + * + * Unmaps @buf->addr and closes @buf->fd. Safe to call on a zeroed/partial + * buffer (fields are reset). + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf); + +/** + * @brief Perform a DMA transfer using a single buffer fd. + * + * Convenience wrapper around slash_qdma_qpair_transfer_batch() for a single + * sub-transfer on qpair_index 0. + * + * @param qpair_fd Queue-pair I/O fd from slash_qdma_qpair_get_fd(). + * @param buf_fd Buffer fd (from slash_qdma_buffer_create()). + * @param buf_offset Byte offset within the buffer. + * @param dev_addr Device-side (endpoint) address. + * @param length Number of bytes to transfer. 
+ * @param direction One of enum slash_qdma_transfer_dir (H2C or C2H). + * + * @return Number of bytes transferred (>= 0) on success, -1 on failure + * (errno set). + */ +ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction); + +/** + * @brief Perform a batch of buffer DMA sub-transfers in one call. + * + * Issues a single transfer ioctl carrying @count sub-transfers. The kernel + * runs sub-transfers that target distinct queue pairs concurrently, so one + * call can drive both NoC channels in parallel. Each sub-transfer names a + * bound queue pair by index (see slash_qdma_qpair_get_fd_multi()) and a buffer + * by its buf_fd. + * + * @param qpair_fd Transfer fd from slash_qdma_qpair_get_fd[_multi](). + * @param xfers Array of @count sub-transfer descriptors. + * @param count Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS). + * + * @return Total bytes transferred (>= 0) on success, -1 on failure (errno set). + */ +ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd, + const struct slash_qdma_subxfer *xfers, + uint32_t count); + #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index bbe6908d..1b1d85cb 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -153,6 +153,19 @@ struct slash_qdma_info { __u32 caps; /**< [out] Capability bitmask. */ }; +/** + * @brief AXI-MM / NoC channel selection for a queue pair. + * + * Selects which CPM5 AXI-MM channel a queue pair uses. libqdma mirrors the + * channel into the SW-context host_id, which selects the programmed Host + * Profile and hence the NoC channel. + */ +enum slash_qdma_mm_channel { + SLASH_QDMA_MM_CHANNEL_AUTO = 0, /**< Stripe across channels by (qid & 1). 
*/ + SLASH_QDMA_MM_CHANNEL_0 = 1, /**< Pin to AXI-MM/NoC channel 0. */ + SLASH_QDMA_MM_CHANNEL_1 = 2, /**< Pin to AXI-MM/NoC channel 1. */ +}; + /** * @brief Add (allocate) a new QDMA queue pair. * @@ -176,6 +189,7 @@ struct slash_qdma_qpair_add { /* Userspace to kernel */ __u32 mode; /**< [in] Queue operating mode. */ __u32 dir_mask; /**< [in] Direction bitmask — which directions to enable. */ + __u32 mm_channel; /**< [in] AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). */ __u32 h2c_ring_sz; /**< [in] Host-to-card descriptor ring size. */ __u32 c2h_ring_sz; /**< [in] Card-to-host descriptor ring size. */ @@ -208,24 +222,137 @@ struct slash_qdma_qpair_op { __u32 op; /**< [in] One of the SLASH_QDMA_QUEUE_OP_* constants. */ }; +/** + * @brief Maximum number of queue pairs a single transfer fd may own. + * + * A transfer fd is a collection of up to this many queue pairs (the intended + * use is one per AXI-MM/NoC channel). A single transfer ioctl issued on the + * fd may fan a buffer transfer across all of them, running up to this many + * hardware DMAs in parallel. Each bound qpair keeps whatever settings it was + * given at SLASH_QDMA_IOCTL_QPAIR_ADD time (mm_channel, ring sizes, etc.), so + * the two channels can be configured independently. + */ +#define SLASH_QDMA_FD_MAX_QPAIRS 2u + /** * @brief Obtain a file descriptor for queue I/O. * - * The returned fd can be used for read/write (or mmap) to transfer data - * through the queue pair. + * The returned fd is a collection of one or two queue pairs. It can be used + * for registered-buffer ioctls to transfer data through those queue pairs. * - * The fd is returned as the ioctl return value (same convention as - * the BAR fd ioctl). A single fd is returned per queue pair; - * read() on the fd performs C2H transfers and write() performs H2C - * transfers, using whichever directions were enabled in \@dir_mask - * when the queue pair was added. 
+ * The fd is returned as the ioctl return value (same convention as the BAR fd + * ioctl). Data movement is issued via SLASH_QDMA_QPAIR_IOCTL_TRANSFER, whose + * sub-transfers select a bound queue pair by index and a direction (which must + * have been enabled in \@dir_mask when that queue pair was added). + * + * Set \@qpair_count to the number of queue pairs to bind and list their IDs in + * \@qpair_ids; the array index becomes the qpair_index used by + * struct slash_qdma_subxfer. For backward compatibility \@qpair_count == 0 + * binds the single queue pair named by \@qid. */ struct slash_qdma_qpair_fd_request { __u32 size; /**< Struct size for ABI versioning. */ /* Userspace to kernel */ - __u32 qid; /**< [in] Queue pair ID. */ + __u32 qid; /**< [in] Legacy single queue pair ID; used only when + * @qpair_count == 0. */ __u32 flags; /**< [in] File descriptor flags. Only O_CLOEXEC is honoured. */ + __u32 qpair_count; /**< [in] Number of valid entries in @qpair_ids + * (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use @qid. */ + __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Queue pair IDs bound to + * this fd; the array index is the qpair_index. */ +}; + +/** + * @brief Transfer direction for a registered-buffer DMA transfer. + */ +enum slash_qdma_transfer_dir { + SLASH_QDMA_XFER_H2C = 1, /**< Host-to-Card (write to device). */ + SLASH_QDMA_XFER_C2H = 2, /**< Card-to-Host (read from device). */ +}; + +/** + * @brief Advisory transfer topology for a registered QDMA buffer. + * + * The kernel returns this hint when a buffer is registered so userspace can + * choose a suitable transfer strategy without hard-coding hardware-specific + * scheduling policy. The hint is advisory: transfers are still valid with any + * queue pair whose direction and ownership checks pass. + */ +enum slash_qdma_transfer_hint { + SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, /**< Prefer a single qpair (all traffic on one channel). 
*/ + SLASH_QDMA_TRANSFER_HINT_V80 = 2, /**< Apply the V80 placement-aware channel policy. */ +}; + +/** + * @brief Create a kernel-owned DMA buffer and return a mappable fd. + * + * The kernel allocates @length bytes of host memory as a set of 4 KiB base + * pages (not physically contiguous), builds the transfer scatter-gather list, + * and DMA-maps every page once. All of this expensive setup happens here, at + * creation time, so the steady-state transfer path only slices the prebuilt + * SGL, syncs the relevant pages, and submits. + * + * The new buffer is returned as an fd (via the ioctl return value, same + * convention as the BAR/queue-pair fd ioctls). Userspace maps it with + * mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0) to obtain + * a CPU pointer, and passes @buf_fd in struct slash_qdma_subxfer to move data. + * The pages stay alive as long as either the fd or any mapping exists, and the + * DMA mapping is torn down once both are gone and no transfer is in flight. + * + * The buffer is bound to the QDMA device of the fd it is created on (control + * fd or queue-pair fd); transfers must use a queue-pair fd of the same device. + * + * \@length must be a non-zero multiple of the host page size. The kernel + * returns the page @granule (bytes per descriptor) and a @transfer_hint; + * current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. + */ +struct slash_qdma_buf_create { + __u32 size; /**< Struct size for ABI versioning. */ + + /* Userspace to kernel */ + __u32 flags; /**< [in] File descriptor flags. Only O_CLOEXEC is honoured. */ + __u64 length; /**< [in] Buffer length in bytes (page multiple). */ + + /* Kernel to userspace */ + __u32 granule; /**< [out] Bytes per SGL descriptor (host page size). */ + __u32 transfer_hint; /**< [out] enum slash_qdma_transfer_hint. */ +}; + +/** + * @brief One per-queue-pair sub-transfer within a transfer batch. 
+ * + * Moves \@length bytes between the kernel buffer named by \@buf_fd at + * \@buf_offset and the device endpoint address \@dev_addr, on the queue pair + * selected by \@qpair_index (an index into the fd's bound qpairs). + * \@buf_offset and \@length must be aligned to the buffer's 4 KiB page granule, + * and \@buf_offset + \@length must not exceed the buffer length. \@direction + * must be one of enum slash_qdma_transfer_dir and must be enabled on the + * selected queue pair. + */ +struct slash_qdma_subxfer { + __u32 qpair_index; /**< [in] Index into the fd's bound qpairs. */ + __u32 direction; /**< [in] enum slash_qdma_transfer_dir (H2C or C2H). */ + __s32 buf_fd; /**< [in] Kernel buffer fd from SLASH_QDMA_IOCTL_BUF_CREATE. */ + __u32 pad0; /**< Padding for natural alignment. */ + __u64 buf_offset; /**< [in] Byte offset within the buffer. */ + __u64 dev_addr; /**< [in] Device-side (endpoint) address. */ + __u64 length; /**< [in] Number of bytes to transfer. */ +}; + +/** + * @brief Perform one or more buffer DMA sub-transfers in one call. + * + * Issued on a queue-pair I/O fd (from SLASH_QDMA_IOCTL_QPAIR_GET_FD). The + * kernel submits all \@count sub-transfers and waits for completion, running + * those that target distinct queue pairs concurrently (so a single syscall can + * drive both NoC channels in parallel). The total number of bytes transferred + * across all sub-transfers is returned as the ioctl return value. + */ +struct slash_qdma_transfer { + __u32 size; /**< Struct size for ABI versioning. */ + __u32 count; /**< [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS). */ + struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Sub-transfers. */ }; /** Query QDMA subsystem capabilities. */ @@ -240,4 +367,30 @@ struct slash_qdma_qpair_fd_request { /** Obtain an I/O file descriptor for a queue pair. 
*/ #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request) +/** + * Create a kernel-owned DMA buffer (allocate pages + build SGL + DMA-map once); + * returns a mappable buffer fd as the ioctl return value. May be issued on the + * control device or a queue-pair I/O fd. + */ +#define SLASH_QDMA_IOCTL_BUF_CREATE _IOWR('v', 0x54, struct slash_qdma_buf_create) + +/* 'v' 0x55 is reserved (previously SLASH_QDMA_IOCTL_BUF_UNREGISTER, removed: + * kernel buffers are released by closing their fd). */ + +/** + * Perform a buffer DMA transfer. Issued on a queue-pair I/O fd (not the + * control device); returns the number of bytes transferred. + */ +#define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) + +/** + * io_uring command opcode (SQE cmd_op) for an asynchronous buffer transfer + * batch, issued on a queue-pair I/O fd via IORING_OP_URING_CMD. The SQE inline + * command carries a single __u64: the userspace pointer to a struct + * slash_qdma_transfer. The completion CQE res holds the total bytes + * transferred (>= 0) or a negative errno. This path is optional and only + * available on kernels with io_uring uring_cmd support. + */ +#define SLASH_QDMA_URING_CMD_TRANSFER 0x56u + #endif diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index 68c38b6d..efe8c3d3 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -40,6 +40,179 @@ #include #include +#include +#include + +/* Bounce-copy chunk used by the @mock transfer fallback. */ +#define QDMA_XFER_BOUNCE_CHUNK (1u << 20) + +/* + * mmap a buffer fd (kernel buffer or @mock memfd) for CPU access. Always + * MAP_SHARED so writes are visible to the kernel/device and to pread/pwrite on + * the same fd. 
+ */ +static int qdma_buffer_mmap(struct slash_qdma_buffer *buf) +{ + void *addr = mmap(NULL, (size_t)buf->length, PROT_READ | PROT_WRITE, + MAP_SHARED, buf->fd, 0); + + if (addr == MAP_FAILED) { + return -1; + } + buf->addr = addr; + return 0; +} + +/* + * @mock / fallback buffer: a memfd sized to @length and mmapped shared. Used + * when the BUF_CREATE ioctl is unavailable (the memfd-backed @mock path). + */ +static int qdma_buffer_create_memfd(uint64_t length, + struct slash_qdma_buffer *buf_out) +{ + int fd; + int saved_errno; + + fd = memfd_create("slash_qdma_buf", MFD_CLOEXEC); + if (fd < 0) { + return -1; + } + if (ftruncate(fd, (off_t)length) != 0) { + saved_errno = errno; + (void)close(fd); + errno = saved_errno; + return -1; + } + + buf_out->fd = fd; + buf_out->length = length; + buf_out->granule = 4096; + buf_out->transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; + buf_out->addr = NULL; + + if (qdma_buffer_mmap(buf_out) != 0) { + saved_errno = errno; + (void)close(fd); + buf_out->fd = -1; + errno = saved_errno; + return -1; + } + + return 0; +} + +/* + * Create a kernel buffer via the BUF_CREATE ioctl on @ioctl_fd (control fd or + * queue-pair fd), then mmap it. Falls back to a memfd buffer when the ioctl is + * not implemented (ENOTTY: the @mock path). + */ +static int qdma_buffer_create_on_fd(int ioctl_fd, uint64_t length, + struct slash_qdma_buffer *buf_out) +{ + struct slash_qdma_buf_create req; + int fd; + int saved_errno; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.flags = O_CLOEXEC; + req.length = length; + + fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &req); + if (fd < 0) { + if (errno == ENOTTY) { + return qdma_buffer_create_memfd(length, buf_out); + } + return -1; + } + + buf_out->fd = fd; + buf_out->length = length; + buf_out->granule = req.granule ? 
req.granule : 4096; + buf_out->transfer_hint = (enum slash_qdma_transfer_hint)req.transfer_hint; + buf_out->addr = NULL; + + if (qdma_buffer_mmap(buf_out) != 0) { + saved_errno = errno; + (void)close(fd); + buf_out->fd = -1; + errno = saved_errno; + return -1; + } + + return 0; +} + +/* + * @mock transfer fallback: bounce a single sub-transfer between the host buffer + * fd and the queue-pair memfd that stands in for device memory. Only used when + * the transfer ioctl returns ENOTTY. + */ +static ssize_t qdma_fallback_subxfer(int qpair_fd, + const struct slash_qdma_subxfer *x) +{ + uint8_t *tmp; + uint64_t done = 0; + + if (x->buf_fd < 0 || + (x->direction != SLASH_QDMA_XFER_H2C && + x->direction != SLASH_QDMA_XFER_C2H)) { + errno = EINVAL; + return -1; + } + + /* + * For C2H, make sure the device memfd is large enough that reads of + * never-written regions return zeros instead of a short read. Only ever + * grow the file: shrinking would discard data a prior H2C wrote. + */ + if (x->direction == SLASH_QDMA_XFER_C2H) { + struct stat st; + off_t want = (off_t)(x->dev_addr + x->length); + + if (fstat(qpair_fd, &st) == 0 && st.st_size < want) { + (void)ftruncate(qpair_fd, want); + } + } + + tmp = (uint8_t *)malloc(QDMA_XFER_BOUNCE_CHUNK); + if (tmp == NULL) { + return -1; + } + + while (done < x->length) { + uint64_t remaining = x->length - done; + size_t chunk = remaining < QDMA_XFER_BOUNCE_CHUNK + ? 
? (size_t)remaining : QDMA_XFER_BOUNCE_CHUNK; + ssize_t r; + ssize_t w; + + if (x->direction == SLASH_QDMA_XFER_H2C) { + r = pread(x->buf_fd, tmp, chunk, (off_t)(x->buf_offset + done)); + w = (r <= 0) ? r : pwrite(qpair_fd, tmp, (size_t)r, + (off_t)(x->dev_addr + done)); + } else { + r = pread(qpair_fd, tmp, chunk, (off_t)(x->dev_addr + done)); + w = (r <= 0) ? r : pwrite(x->buf_fd, tmp, (size_t)r, + (off_t)(x->buf_offset + done)); + } + + if (r <= 0 || w != r) { + /* + * EOF (r == 0) and short writes leave errno untouched; report EIO so + * the caller never sees a stale value (this path is entered on ENOTTY). + */ + int saved_errno = (r < 0 || w < 0) ? errno : EIO; + free(tmp); + errno = saved_errno; + return -1; + } + done += (uint64_t)r; + } + + free(tmp); + return (ssize_t)done; +} struct slash_qdma *slash_qdma_open(const char *path) { @@ -146,6 +319,7 @@ int slash_qdma_qpair_add(struct slash_qdma *qdma, tmp.size = sizeof(tmp); tmp.mode = req->mode; tmp.dir_mask = req->dir_mask; + tmp.mm_channel = req->mm_channel; tmp.h2c_ring_sz = req->h2c_ring_sz; tmp.c2h_ring_sz = req->c2h_ring_sz; tmp.cmpt_ring_sz = req->cmpt_ring_sz; @@ -248,3 +422,161 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags) return fd; } +int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids, + uint32_t qpair_count, int flags) +{ + struct slash_qdma_qpair_fd_request req; + uint32_t i; + int fd; + + if (qdma == NULL || qids == NULL || + qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) { + errno = EINVAL; + return -1; + } + + if (qdma->priv) { + return slash_qdma_mock_qpair_get_fd_multi(qdma, qids, qpair_count, + flags); + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.flags = flags; + req.qid = qids[0]; + req.qpair_count = qpair_count; + for (i = 0; i < qpair_count; ++i) { + req.qpair_ids[i] = qids[i]; + } + + fd = ioctl(qdma->fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &req); + if (fd < 0) { + return -1; + } + + return fd; +} + +int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length, + struct slash_qdma_buffer *buf_out) +{ + if (qdma == NULL || buf_out == NULL || length == 0) { + 
errno = EINVAL; + return -1; + } + + /* @mock has no character device: back the buffer with a memfd directly. */ + if (qdma->priv) { + return qdma_buffer_create_memfd(length, buf_out); + } + + return qdma_buffer_create_on_fd(qdma->fd, length, buf_out); +} + +int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length, + struct slash_qdma_buffer *buf_out) +{ + if (qpair_fd < 0 || buf_out == NULL || length == 0) { + errno = EINVAL; + return -1; + } + + return qdma_buffer_create_on_fd(qpair_fd, length, buf_out); +} + +int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf) +{ + int ret = 0; + + if (buf == NULL) { + errno = EINVAL; + return -1; + } + + if (buf->addr != NULL && buf->addr != MAP_FAILED && buf->length != 0) { + if (munmap(buf->addr, (size_t)buf->length) != 0) { + ret = -1; + } + } + buf->addr = NULL; + + /* + * The create helpers reject a zero length, so fd 0 together with a zero + * length is a zero-initialised struct that owns no descriptor; closing it + * would close the caller's stdin. + */ + if (buf->fd > 0 || (buf->fd == 0 && buf->length != 0)) { + if (close(buf->fd) != 0) { + ret = -1; + } + buf->fd = -1; + } + + return ret; +} + +ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd, + const struct slash_qdma_subxfer *xfers, + uint32_t count) +{ + struct slash_qdma_transfer req; + uint32_t i; + int ret; + + if (qpair_fd < 0 || xfers == NULL || + count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) { + errno = EINVAL; + return -1; + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.count = count; + for (i = 0; i < count; ++i) { + if (xfers[i].direction != SLASH_QDMA_XFER_H2C && + xfers[i].direction != SLASH_QDMA_XFER_C2H) { + errno = EINVAL; + return -1; + } + req.xfers[i] = xfers[i]; + } + + ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); + if (ret < 0) { + if (errno == ENOTTY) { + /* @mock path: bounce each sub-transfer through the memfds. 
*/ + uint64_t total = 0; + + for (i = 0; i < count; ++i) { + ssize_t n = qdma_fallback_subxfer(qpair_fd, &xfers[i]); + + if (n < 0) { + return -1; + } + total += (uint64_t)n; + } + return (ssize_t)total; + } + return -1; + } + + return (ssize_t)ret; +} + +ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction) +{ + struct slash_qdma_subxfer xfer; + + memset(&xfer, 0, sizeof(xfer)); + xfer.qpair_index = 0; + xfer.direction = direction; + xfer.buf_fd = buf_fd; + xfer.buf_offset = buf_offset; + xfer.dev_addr = dev_addr; + xfer.length = length; + + return slash_qdma_qpair_transfer_batch(qpair_fd, &xfer, 1); +} + diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c index 92a24c6c..d72762bb 100644 --- a/driver/libslash/src/qdma_mock.c +++ b/driver/libslash/src/qdma_mock.c @@ -257,3 +257,44 @@ int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flag return new_fd; } + +int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma, + const uint32_t *qids, + uint32_t qpair_count, int flags) +{ + struct slash_qdma_mock *ctx; + uint32_t i; + int new_fd; + (void) flags; /* NOTE(review): flags ignored; dup() below does not copy FD_CLOEXEC, so O_CLOEXEC is not honoured (fcntl F_DUPFD_CLOEXEC would) */ + + if (qdma == NULL || qids == NULL || + qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) { + errno = EINVAL; + return -1; + } + + ctx = mock_ctx(qdma); + + for (i = 0; i < qpair_count; ++i) { + if (qids[i] >= QDMA_MOCK_MAX_QUEUES || + !ctx->queues[qids[i]].in_use || !ctx->queues[qids[i]].started) { + errno = EINVAL; + return -1; + } + } + + /* + * The mock backs the device address space with one memfd per queue pair. + * Both NoC channels address the same device memory, so a multi-qpair fd is + * emulated by a single backing store: dup the first queue pair's memfd and + * route every sub-transfer through it. This keeps round-trips consistent + * regardless of which channel a sub-transfer used. 
+ */ + new_fd = dup(ctx->queues[qids[0]].fd); + if (new_fd < 0) { + return -1; + } + + return new_fd; +} + diff --git a/driver/libslash/src/qdma_mock.h b/driver/libslash/src/qdma_mock.h index 36f3d596..cd7e54e6 100644 --- a/driver/libslash/src/qdma_mock.h +++ b/driver/libslash/src/qdma_mock.h @@ -25,6 +25,8 @@ #include +#include + struct slash_qdma *slash_qdma_mock_open(void); int slash_qdma_mock_close(struct slash_qdma *qdma); int slash_qdma_mock_info_read(struct slash_qdma *qdma, struct slash_qdma_info *info); @@ -33,5 +35,8 @@ int slash_qdma_mock_qpair_start(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_stop(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_del(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); +int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma, + const uint32_t *qids, + uint32_t qpair_count, int flags); #endif /* LIBSLASH_QDMA_MOCK_H */ diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index 5b024111..9519302e 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -100,6 +101,36 @@ TEST(QdmaNullTest, QpaiGetFd) { EXPECT_EQ(errno, EINVAL); } +TEST(QdmaNullTest, BufferCreate) { + struct slash_qdma_buffer buf{}; + errno = 0; + EXPECT_EQ(slash_qdma_buffer_create(nullptr, 4096, &buf), -1); + EXPECT_EQ(errno, EINVAL); + + struct slash_qdma fake{}; + fake.fd = -1; + errno = 0; + EXPECT_EQ(slash_qdma_buffer_create(&fake, 4096, nullptr), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_buffer_create(-1, 4096, &buf), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, BufferDestroy) { + errno = 0; + EXPECT_EQ(slash_qdma_buffer_destroy(nullptr), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, Transfer) { + errno = 0; + /* Invalid qpair fd is rejected. 
*/ + EXPECT_EQ(slash_qdma_qpair_transfer(-1, 4, 0, 0, 4096, SLASH_QDMA_XFER_H2C), -1); + EXPECT_EQ(errno, EINVAL); +} + // ─── Real device tests (requires /dev/slash_qdma_ctl0) ─────────────────────── class ParametrizedQdmaTest : public ::testing::TestWithParam { @@ -158,26 +189,207 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) { int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); ASSERT_GE(queue_fd, 0); - // Write a known pattern to DDR (H2C). - uint8_t src[XFER_SIZE]; + // Kernel-owned buffers created through the queue-pair fd. + struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &dst_buf), 0); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); for (size_t i = 0; i < XFER_SIZE; ++i) { src[i] = static_cast(i & 0xFF); } - ssize_t written = pwrite(queue_fd, src, XFER_SIZE, static_cast(DDR_BASE_ADDRESS)); + std::memset(dst, 0, XFER_SIZE); + + ssize_t written = slash_qdma_qpair_transfer( + queue_fd, src_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(written, static_cast(XFER_SIZE)); // Read back from DDR (C2H) and verify. 
- uint8_t dst[XFER_SIZE]{}; - ssize_t read_bytes = pread(queue_fd, dst, XFER_SIZE, static_cast(DDR_BASE_ADDRESS)); + ssize_t read_bytes = slash_qdma_qpair_transfer( + queue_fd, dst_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); + EXPECT_EQ(close(queue_fd), 0); EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); } +TEST_P(ParametrizedQdmaTest, BufferCreateTransfer) { + static constexpr size_t XFER_SIZE = 4096; + + struct slash_qdma_qpair_add req{}; + req.mode = 0; /* QDMA_Q_MODE_MM */ + req.dir_mask = 0x3; /* H2C | C2H */ + + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + uint32_t qid = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0); + + int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); + ASSERT_GE(queue_fd, 0); + + // Kernel-owned buffers created through the control handle. + struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &dst_buf), 0); + EXPECT_EQ(src_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80); + EXPECT_EQ(dst_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); + for (size_t i = 0; i < XFER_SIZE; ++i) { + src[i] = static_cast(i & 0xFF); + } + std::memset(dst, 0, XFER_SIZE); + + // H2C: push the source buffer to the device. + ssize_t written = slash_qdma_qpair_transfer(queue_fd, src_buf.fd, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(written, static_cast(XFER_SIZE)); + + // C2H: pull it back into the destination buffer and verify. 
+ ssize_t read_bytes = slash_qdma_qpair_transfer(queue_fd, dst_buf.fd, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_C2H); + EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); + + EXPECT_EQ(close(queue_fd), 0); + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); +} + +TEST_P(ParametrizedQdmaTest, MultiQpairBatchTransfer) { + // Two 4 KiB halves transferred concurrently across two queue pairs bound to + // a single fd, exercising the get-fd-multi + batch transfer API. + static constexpr size_t HALF = 4096; + static constexpr size_t XFER_SIZE = 2 * HALF; + + uint32_t qids[2] = {0, 0}; + for (int ch = 0; ch < 2; ++ch) { + struct slash_qdma_qpair_add req{}; + req.mode = 0; /* QDMA_Q_MODE_MM */ + req.dir_mask = 0x3; /* H2C | C2H */ + req.mm_channel = static_cast( + ch == 0 ? SLASH_QDMA_MM_CHANNEL_0 : SLASH_QDMA_MM_CHANNEL_1); + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + qids[ch] = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qids[ch]), 0); + } + + int fd = slash_qdma_qpair_get_fd_multi(qdma_, qids, 2, 0); + ASSERT_GE(fd, 0); + + struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &dst_buf), 0); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); + for (size_t i = 0; i < XFER_SIZE; ++i) { + src[i] = static_cast((i * 7 + 1) & 0xFF); + } + std::memset(dst, 0, XFER_SIZE); + + // H2C: lower half on qpair 0, upper half on qpair 1, in one ioctl. 
+ struct slash_qdma_subxfer h2c[2]{}; + h2c[0] = {0, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF}; + h2c[1] = {1, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF}; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, h2c, 2), + static_cast(XFER_SIZE)); + + // C2H: read both halves back across both channels in one ioctl. + struct slash_qdma_subxfer c2h[2]{}; + c2h[0] = {0, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF}; + c2h[1] = {1, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF}; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, c2h, 2), + static_cast(XFER_SIZE)); + + EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); + + EXPECT_EQ(close(fd), 0); + for (int ch = 0; ch < 2; ++ch) { + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qids[ch]), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qids[ch]), 0); + } +} + +TEST(QdmaNullTest, QpairGetFdMultiInvalid) { + uint32_t qids[2] = {0, 1}; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(nullptr, qids, 2, 0), -1); + EXPECT_EQ(errno, EINVAL); + + struct slash_qdma fake{}; + fake.fd = -1; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 0, 0), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 3, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, TransferBatchInvalid) { + struct slash_qdma_subxfer x{}; + x.direction = SLASH_QDMA_XFER_H2C; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(-1, &x, 1), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, nullptr, 1), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, &x, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST_P(ParametrizedQdmaTest, QueueFdReadWriteRejectedOnHardware) { + if (mock) { + GTEST_SKIP() << "mock qpair fds are 
memfds and still support read/write"; + } + + struct slash_qdma_qpair_add req{}; + req.mode = 0; + req.dir_mask = 0x3; + + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + uint32_t qid = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0); + + int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); + ASSERT_GE(queue_fd, 0); + + uint8_t byte = 0; + errno = 0; + EXPECT_EQ(write(queue_fd, &byte, sizeof(byte)), -1); + EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF); + + errno = 0; + EXPECT_EQ(read(queue_fd, &byte, sizeof(byte)), -1); + EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF); + + EXPECT_EQ(close(queue_fd), 0); + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); +} + TEST_P(ParametrizedQdmaTest, CloseSucceeds) { EXPECT_EQ(slash_qdma_close(qdma_), 0); qdma_ = nullptr; diff --git a/driver/patches/0003-libqdma-pr-fmt-guard.patch b/driver/patches/0003-libqdma-pr-fmt-guard.patch new file mode 100644 index 00000000..d253070a --- /dev/null +++ b/driver/patches/0003-libqdma-pr-fmt-guard.patch @@ -0,0 +1,43 @@ +SLASH local modification to the pinned QDMA submodule (libqdma). + +libqdma: make qdma_platform_env.h self-sufficient for pr_fmt + +SLASH force-includes driver/slash_compat.h into every TU (driver/Makefile) so +kernel-API shims such as from_timer() reach the pinned libqdma sources. That +header pulls in linux/printk.h early (via the kernel headers it includes) and then #undefs +pr_fmt, so each libqdma .c that sets its own + #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ +at the top compiles without a "pr_fmt redefined" warning. + +The qdma_access HAL files don't set their own pr_fmt -- they log via +qdma_log_* -> pr_*, which expand pr_fmt at the call site -- so after that +#undef they would reference an undefined pr_fmt (linux/printk.h is already +include-guarded by the time they include it, so its guarded default can no +longer re-arm).
Re-arm the kernel default here, guarded by #ifndef so the +libqdma sources that do set a custom pr_fmt before including this header keep +it. Behaviour for the HAL files is unchanged (the kernel default is "fmt"). + +Generated against qdma_drv @ e0168be (pinned submodule commit). +Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). +diff --git a/qdma_platform_env.h b/qdma_platform_env.h +index fa26c9a..c9e1082 100755 +--- a/qdma_platform_env.h ++++ b/qdma_platform_env.h +@@ -25,6 +25,17 @@ + #define QDMA_SNPRINTF_S(arg1, arg2, arg3, ...) \ + snprintf(arg1, arg3, ##__VA_ARGS__) + ++/* ++ * SLASH: re-arm the kernel-default pr_fmt for TUs that log via qdma_log_* -> ++ * pr_* but never set their own pr_fmt. SLASH force-includes a compat header ++ * that #undefs pr_fmt after linux/printk.h is already include-guarded, so the ++ * default can no longer re-arm on its own. Guarded with #ifndef so the libqdma ++ * sources that set a custom pr_fmt before including this header keep it. ++ */ ++#ifndef pr_fmt ++#define pr_fmt(fmt) fmt ++#endif ++ + #define qdma_log_info(x_, ...) pr_info(x_, ##__VA_ARGS__) + #define qdma_log_warning(x_, ...) pr_warn(x_, ##__VA_ARGS__) + #define qdma_log_error(x_, ...) pr_err(x_, ##__VA_ARGS__) diff --git a/driver/slash_compat.h b/driver/slash_compat.h index 5b3a50c2..e6719487 100644 --- a/driver/slash_compat.h +++ b/driver/slash_compat.h @@ -17,6 +17,7 @@ #include #include +#include /* * Compat shims selected by the kcompat probes in driver/kcompat/.
@@ -53,4 +54,43 @@ static inline void slash_vm_flags_set(struct vm_area_struct *vma, vm_flags_t fla #define SLASH_MODULE_IMPORT_NS(ns) MODULE_IMPORT_NS(#ns) #endif +/* + * from_timer() was renamed to timer_container_of() upstream in v6.16 + * (commit 41cb08555c41) and backported by RHEL/CentOS 9.8 (kernel + * 5.14.0-687; 9.7 / 5.14.0-611 and earlier still ship from_timer) into the + * 5.14 baseline, so a LINUX_VERSION_CODE / RHEL_RELEASE_CODE check is + * unreliable across the 9.x rebuilds. Both names are typeof()-based macros, + * so detect them directly and prefer the kernel's own API: + * 1. kernel still defines from_timer() -> use it as-is (no redefine) + * 2. kernel renamed it to timer_container_of() -> delegate to that + * 3. neither exists -> hand-roll the historical body + * linux/timer.h is included above so the guard sees whichever name the + * kernel defines, regardless of -include ordering. + */ +#ifndef from_timer +# ifdef timer_container_of +# define from_timer(var, callback_timer, timer_fieldname) \ + timer_container_of(var, callback_timer, timer_fieldname) +# else +# define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) +# endif +#endif + +/* + * The kernel headers included above (which pull in linux/printk.h) install + * the default `#define pr_fmt(fmt) fmt` under an #ifndef guard. Because this + * header is force-included (-include, see driver/Makefile) ahead of every TU, + * that default lands before each pinned libqdma source's own top-of-file + * #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + * turning it into a redefinition ("pr_fmt redefined" warning). Undefine it here + * so each TU starts from the clean "nobody has defined pr_fmt yet" state the + * idiom relies on; the file's own #define is then the first and only one.
+ * + * TUs that never set their own pr_fmt and log via the kernel default (the + * qdma_access HAL, whose qdma_log_* macros expand to pr_*) re-arm that default + * from qdma_platform_env.h; see driver/patches/0003-libqdma-pr-fmt-guard.patch. + */ +#undef pr_fmt + #endif /* SLASH_COMPAT_H */ diff --git a/driver/slash_config.h b/driver/slash_config.h index c06c5962..acebe253 100644 --- a/driver/slash_config.h +++ b/driver/slash_config.h @@ -23,9 +23,9 @@ * * The SLASH design exposes two PCI physical functions per card: * - * - **PF1** (device 0x50B5) — QDMA function. Hosts the Xilinx QDMA - * IP used for high-throughput DMA transfers between host memory and - * the FPGA fabric. + * - **PF1** (device 0x50B5, or 0x50BD on AVED/V80P designs) — QDMA + * function. Hosts the Xilinx QDMA IP used for high-throughput DMA + * transfers between host memory and the FPGA fabric. * * - **PF2** (device 0x50B6) — Control function. Exposes PCI BARs * that the host can mmap for register-level MMIO access to the @@ -52,6 +52,8 @@ #define SLASH_QDMA_PCI_VENDOR_ID 0x10EE /** PCI device ID for the V80 SLASH QDMA function. */ #define SLASH_QDMA_PCI_DEVICE_ID 0x50B5 +/** PCI device ID for the V80P/AVED QDMA function. */ +#define SLASH_AVED_QDMA_PCI_DEVICE_ID 0x50BD /** Physical function number for the QDMA DMA engine. */ #define SLASH_QDMA_PF 1 diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index 6c64272b..3b63148e 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -23,8 +23,9 @@ * to provide queue-pair-based DMA transfers between host memory and the * FPGA fabric. * - * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5), while the - * control device (slash_ctldev) binds to PF2 (device ID 0x50B6). + * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5, or 0x50BD on + * AVED/V80P designs), while the control device (slash_ctldev) binds to + * PF2 (device ID 0x50B6). 
* * Queue pair lifecycle: * add -> start -> I/O (via anon_inode fd) -> stop -> del @@ -50,10 +51,16 @@ #include #include +#include +#include +#include #include #include #include +#include #include +#include +#include #include #include #include @@ -65,6 +72,31 @@ #include #include +#if defined(SLASH_HAVE_URING_CMD) +#include +#if __has_include() +#include +#endif + +/** + * slash_qdma_uring_cmd_payload() - Pointer to a uring_cmd's inline SQE payload. + * @cmd: The io_uring command. + * + * Abstracts the kernel API change that removed struct io_uring_cmd::cmd in + * favour of ->sqe + io_uring_sqe_cmd(). The accessor is selected at build + * time by the kcompat probe (SLASH_HAVE_URING_SQE_CMD); both forms return the + * same inline command payload pointer. + */ +static inline const void *slash_qdma_uring_cmd_payload(struct io_uring_cmd *cmd) +{ +#if defined(SLASH_HAVE_URING_SQE_CMD) + return io_uring_sqe_cmd(cmd->sqe); +#else + return cmd->cmd; +#endif +} +#endif + /* * Direction bitmask constants. * @@ -100,6 +132,45 @@ offsetofend(struct slash_qdma_qpair_op, op) #define SLASH_QDMA_QPAIR_GET_FD_MIN_SIZE \ offsetofend(struct slash_qdma_qpair_fd_request, flags) +#define SLASH_QDMA_BUF_CREATE_MIN_SIZE \ + offsetofend(struct slash_qdma_buf_create, length) +#define SLASH_QDMA_TRANSFER_MIN_SIZE \ + offsetofend(struct slash_qdma_transfer, count) + +/* + * CPM5 Host Profile indirect-context programming. + * + * The Host Profile context tells the CPM5 QDMA how to route AXI4-MM + * traffic onto the Versal NoC. It is programmed via the same indirect + * context command interface libqdma uses for queue contexts, but with + * the host-profile selector (0xA). 
Register offsets and the command + * word layout mirror eqdma_cpm5_reg.h: + * + * IND_CTXT_DATA base 0x804 (8 x u32 context words) + * IND_CTXT_MASK base 0x824 (8 x u32 write masks) + * IND_CTXT_CMD 0x844 (busy[0], sel[4:1], op[6:5], qid[18:7]) + * + * We program two profiles so the per-queue SW-context host_id selects + * the NoC channel: Host ID 0 -> NoC Channel 0, Host ID 1 -> NoC Channel 1. + */ +#define SLASH_QDMA_HP_DATA_ADDR 0x804u +#define SLASH_QDMA_HP_MASK_ADDR 0x824u +#define SLASH_QDMA_HP_CMD_ADDR 0x844u +#define SLASH_QDMA_HP_CMD_BUSY BIT(0) +#define SLASH_QDMA_HP_NUM_WORDS 8 +#define SLASH_QDMA_HP_SEL 0xAu /* QDMA_CTXT_SELC_HOST_PROFILE */ +#define SLASH_QDMA_HP_OP_WR 0x1u /* indirect context WR opcode */ +#define SLASH_QDMA_HP_OP_RD 0x2u /* indirect context RD opcode */ +#define SLASH_QDMA_HP_SMID_BASE 0x100u /* bit 8 set; base AXI-MM master ID */ +#define SLASH_QDMA_HP_POLL_US 1000 /* busy-wait budget in microseconds */ + +/* + * The qpair fd data path operates on spans of 4 KiB base pages. Each + * scatter-gather entry is exactly one base page, so a whole transfer is + * submitted to libqdma as a single multi-descriptor request and libqdma + * refills the descriptor ring as needed -- the transfer size is not bounded + * by the ring depth. + */ /** * SLASH_QDMA_QTYPE_COUNT - Number of queue types tracked per queue pair. @@ -152,6 +223,18 @@ } while (0) #endif +/* + * Per-transfer timing instrumentation (compile-time flag). + * + * Retained for parity with the userspace SLASH_QDMA_TIMING knob. With the + * kernel-owned buffer model all the expensive setup (page allocation, SGL + * build, DMA mapping) happens once at SLASH_QDMA_IOCTL_BUF_CREATE time, so the + * steady-state transfer cost is dominated by the libqdma submit/completion. + */ +#ifndef SLASH_QDMA_TIMING +#define SLASH_QDMA_TIMING 0 +#endif + /* Forward declaration; full definition follows. 
*/ struct slash_qdma_dev; @@ -503,44 +586,57 @@ slash_qdma_qpair_remove(struct slash_qdma_dev *qdma_dev, u32 qid) /** * struct slash_qdma_qpair_file_ctx - Private data for an anon_inode qpair fd. - * @qdma_dev: Back-pointer to the owning QDMA device (ref held). - * @entry: The queue pair entry this fd operates on (ref held). - * @qid: Queue pair ID, cached for debug logging. + * @qdma_dev: Back-pointer to the owning QDMA device (ref held). + * @entries: The queue pair entries this fd operates on (one ref each). + * A transfer sub-transfer's qpair_index selects an entry here. + * @qids: Queue pair IDs, cached for debug logging. + * @n_qpairs: Number of valid entries in @entries / @qids + * (1..SLASH_QDMA_FD_MAX_QPAIRS). * * Allocated in slash_qdma_ioctl_qpair_get_fd_w() and freed in - * slash_qdma_qpair_release(). Both @qdma_dev and @entry have their - * reference counts incremented when the ctx is created, and decremented - * when the fd is closed. + * slash_qdma_qpair_release(). @qdma_dev and each entry have their reference + * counts incremented when the ctx is created, and decremented when the fd is + * closed. */ struct slash_qdma_qpair_file_ctx { struct slash_qdma_dev *qdma_dev; - struct slash_qdma_qpair_entry *entry; - u32 qid; + struct slash_qdma_qpair_entry *entries[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 qids[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 n_qpairs; }; /** - * struct slash_qdma_io_cb - I/O control block for a single DMA transfer. - * @buf: User-space buffer address (source for H2C, destination for C2H). - * @len: Transfer length in bytes. - * @pages_nr: Number of user pages pinned by get_user_pages_fast(). - * @sgl: Scatter-gather list of qdma_sw_sg entries, one per pinned page. - * Allocated as a single contiguous block together with @pages. - * @pages: Array of struct page pointers for the pinned user pages. - * Points into the same allocation as @sgl (immediately after it). - * @req: The libqdma request structure submitted to qdma_request_submit(). 
- * - * This is a stack-local structure (allocated in slash_qdma_qpair_read_write) - * that bundles all per-transfer state. The SGL and page array are heap- - * allocated in slash_qdma_map_user_buf_to_sgl() and freed in - * slash_qdma_iocb_release(). - */ -struct slash_qdma_io_cb { - void __user *buf; - size_t len; + * struct slash_qdma_buf - A kernel-owned, mmap-able DMA buffer. + * @ref: Reference count. The buffer fd holds one ref, each live VMA + * (mmap) holds one ref, and each in-flight transfer holds a + * temporary ref so a close cannot tear the buffer down under + * active DMA or while userspace still has it mapped. + * @qdma_dev: Device whose DMA mappings back this buffer (holds a device + * reference for the lifetime of the buffer object). + * @length: Buffer length in bytes (a multiple of @granule). + * @granule: Bytes per SGL entry / page (PAGE_SIZE). Uniform across all + * entries, so transfer slices are computed by simple division. + * @pages_nr: Number of base pages backing the buffer (length / granule). + * @pages: Array of @pages_nr kernel pages (alloc_page()), not physically + * contiguous. Used both for the CPU mmap and the DMA SGL. + * @sgl: Prebuilt scatter-gather list, one entry per page, each with its + * dma_addr filled in once at creation so transfers submit with + * req->dma_mapped = 1. + * @dma_mapped: True once @sgl entries have been DMA-mapped. + * + * All expensive setup (page allocation, SGL construction, DMA mapping) happens + * once at creation; the transfer fast path only slices @sgl, syncs the touched + * pages, and submits. 
+ */ +struct slash_qdma_buf { + struct kref ref; + struct slash_qdma_dev *qdma_dev; + u64 length; + u32 granule; unsigned int pages_nr; - struct qdma_sw_sg *sgl; struct page **pages; - struct qdma_request req; + struct qdma_sw_sg *sgl; + bool dma_mapped; }; /* ───────────────────────────────────────────────────────────────────── @@ -586,32 +682,37 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev, static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, struct slash_qdma_dev *qdma_dev, void __user *uarg); +static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc, + struct slash_qdma_dev *qdma_dev, + void __user *uarg); +static void slash_qdma_buf_release(struct kref *ref); +static void slash_qdma_buf_put(struct slash_qdma_buf *buf); +static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg); -static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos); -static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos); static int slash_qdma_qpair_release(struct inode *inode, struct file *file); static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#if defined(SLASH_HAVE_URING_CMD) +static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags); +#endif /** * slash_qdma_qpair_fops - File operations for per-qpair anon_inode fds. * - * read() performs a C2H (card-to-host) DMA transfer. - * write() performs an H2C (host-to-card) DMA transfer. - * llseek uses default_llseek so that pread/pwrite can set the - * device-side address via the file position. - * ioctl is a stub that returns -ENOTTY (no per-fd ioctls defined yet). - * release drops the refs on the qpair entry and device. + * ioctl performs buffer DMA transfers and buffer creation for clients that + * only hold a queue-pair fd. 
+ * uring_cmd (optional) is the asynchronous equivalent of the transfer ioctl, + * available only on kernels with io_uring uring_cmd support. + * release drops the refs on the bound qpair entries and device. */ static const struct file_operations slash_qdma_qpair_fops = { .owner = THIS_MODULE, - .read = slash_qdma_qpair_read, - .write = slash_qdma_qpair_write, .unlocked_ioctl = slash_qdma_qpair_ioctl, +#if defined(SLASH_HAVE_URING_CMD) + .uring_cmd = slash_qdma_qpair_uring_cmd, +#endif .release = slash_qdma_qpair_release, - .llseek = default_llseek, }; @@ -625,10 +726,12 @@ static void slash_qdma_ioctl_info(struct miscdevice *misc, struct slash_qdma_dev /** * slash_qdma_ids - PCI device ID table for the QDMA PF. * - * Matches only PF1 (device ID 0x50B5) on AMD/Xilinx V80 cards. + * Matches PF1 QDMA functions on AMD/Xilinx V80 cards, including the + * AVED/V80P device ID. */ static const struct pci_device_id slash_qdma_ids[] = { {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_QDMA_PCI_DEVICE_ID)}, + {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_AVED_QDMA_PCI_DEVICE_ID)}, {0,} }; MODULE_DEVICE_TABLE(pci, slash_qdma_ids); @@ -850,6 +953,287 @@ void slash_qdma_exit(void) SLASH_QDMA_OP_LOG("libqdma_exit done\n"); } +/* ───────────────────────────────────────────────────────────────────── + * CPM5 Host Profile context programming + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_hp_set_field() - Set a bit field in the host profile context. + * @words: Array of SLASH_QDMA_HP_NUM_WORDS u32s holding the 256-bit context + * (word i covers bits [32*i+31 : 32*i]). + * @hi: Most-significant bit index of the field (inclusive). + * @lo: Least-significant bit index of the field (inclusive). + * @val: Value to place in [hi:lo]; bits outside the field width are masked. + * + * Handles fields that straddle a 32-bit word boundary (e.g. the C2H + * AXI4-MM steering field at bits [97:94], which spans words 2 and 3). 
+ */ +static void slash_qdma_hp_set_field(u32 *words, unsigned int hi, + unsigned int lo, u32 val) +{ + unsigned int width = hi - lo + 1; + u32 fmask = (width >= 32) ? ~0u : ((1u << width) - 1u); + unsigned int word = lo >> 5; + unsigned int off = lo & 31; + u64 wmask = (u64)fmask << off; + u64 wval = (u64)(val & fmask) << off; + + words[word] = (words[word] & ~(u32)(wmask & 0xFFFFFFFFu)) | + (u32)(wval & 0xFFFFFFFFu); + + if ((off + width) > 32 && (word + 1) < SLASH_QDMA_HP_NUM_WORDS) + words[word + 1] = (words[word + 1] & ~(u32)(wmask >> 32)) | + (u32)(wval >> 32); +} + +/** + * slash_qdma_hp_wait_ready() - Poll the indirect-context BUSY bit. + * @device: QDMA device (provides the libqdma handle for register access). + * @val_out: If non-NULL, receives the last QDMA_IND_CTXT_CMD value read. + * + * Spins (up to SLASH_QDMA_HP_POLL_US microseconds) until the indirect + * context command BUSY bit clears. Logging is left to the caller so the + * write path can treat a timeout as fatal while the readback path can treat + * it as a warning. + * + * Return: 0 once not busy, -ETIMEDOUT on timeout, or a negative errno from + * the register read. + */ +static int slash_qdma_hp_wait_ready(struct slash_qdma_dev *device, u32 *val_out) +{ + unsigned int waited_us = 0; + u32 val = 0; + int err; + + do { + err = qdma_device_read_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, &val); + if (err) + return err; + if (!(val & SLASH_QDMA_HP_CMD_BUSY)) { + if (val_out) + *val_out = val; + return 0; + } + udelay(1); + } while (++waited_us < SLASH_QDMA_HP_POLL_US); + + if (val_out) + *val_out = val; + return -ETIMEDOUT; +} + +/** + * slash_qdma_hp_get_field() - Read a bit field from the host profile context. + * @words: Array of SLASH_QDMA_HP_NUM_WORDS u32s holding the 256-bit context + * (word i covers bits [32*i+31 : 32*i]). + * @hi: Most-significant bit index of the field (inclusive). + * @lo: Least-significant bit index of the field (inclusive). 
+ * + * Inverse of slash_qdma_hp_set_field(); handles fields that straddle a + * 32-bit word boundary (e.g. the C2H AXI4-MM steering field at bits + * [97:94], which spans words 2 and 3). + * + * Return: the value held in [hi:lo]. + */ +static u32 slash_qdma_hp_get_field(const u32 *words, unsigned int hi, + unsigned int lo) +{ + unsigned int width = hi - lo + 1; + u32 fmask = (width >= 32) ? ~0u : ((1u << width) - 1u); + unsigned int word = lo >> 5; + unsigned int off = lo & 31; + u64 two = (u64)words[word]; + + if ((word + 1) < SLASH_QDMA_HP_NUM_WORDS) + two |= (u64)words[word + 1] << 32; + + return (u32)((two >> off) & fmask); +} + +/** + * slash_qdma_read_host_profile() - Read one CPM5 Host Profile entry back. + * @device: QDMA device (provides the libqdma handle for register access). + * @host_id: Host Profile index to read. + * @out: Array of SLASH_QDMA_HP_NUM_WORDS u32s that receives the 256-bit + * context. + * + * Issues an indirect-context RD command for the host-profile selector, + * waits for the controller to complete it, and copies the IND_CTXT_DATA + * words back. Used to verify a preceding write. + * + * Return: 0 on success, negative errno on register-access error or + * -ETIMEDOUT if the BUSY bit never clears. + */ +static int slash_qdma_read_host_profile(struct slash_qdma_dev *device, + u32 host_id, u32 *out) +{ + u32 cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_RD << 5) | + (SLASH_QDMA_HP_SEL << 1); + int err; + int i; + + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, cmd); + if (err) + return err; + + err = slash_qdma_hp_wait_ready(device, NULL); + if (err) + return err; + + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_read_config_register(device->qdma_handle, + SLASH_QDMA_HP_DATA_ADDR + (i * sizeof(u32)), &out[i]); + if (err) + return err; + } + + return 0; +} + +/** + * slash_qdma_write_host_profile() - Program and verify one CPM5 Host Profile. 
+ * @device: QDMA device (provides the libqdma handle for register access). + * @host_id: Host Profile index to program (also the AXI4-MM steering value, + * i.e. the target NoC channel). + * + * Builds the 256-bit host profile context with the SMID and H2C/C2H + * AXI4-MM steering fields, writes it through the indirect-context + * registers via the libqdma-exported config register accessors, and + * polls the command BUSY bit until the controller completes the write. + * + * Once the write completes it reads the profile back and verifies the + * programmed fields (SMID and the two steering fields); a readback error + * or field mismatch is logged but is non-fatal (the profile is still + * considered applied). + * + * Only the SMID and the two steering fields are non-zero; the AXI + * prot/cache attributes are left at 0. + * + * Return: 0 on success, negative errno on register-access error or + * -ETIMEDOUT if the BUSY bit never clears. + */ +static int slash_qdma_write_host_profile(struct slash_qdma_dev *device, + u32 host_id) +{ + u32 data[SLASH_QDMA_HP_NUM_WORDS] = {0}; + u32 smid = SLASH_QDMA_HP_SMID_BASE + host_id; + u32 cmd; + u32 val = 0; + int err; + int i; + + /* SMID [201:192]; H2C steering [181:178]; C2H steering [97:94]. */ + slash_qdma_hp_set_field(data, 201, 192, smid); + slash_qdma_hp_set_field(data, 181, 178, host_id); + slash_qdma_hp_set_field(data, 97, 94, host_id); + + /* Context data words. */ + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_DATA_ADDR + (i * sizeof(u32)), data[i]); + if (err) + goto err_reg; + } + + /* Context masks: write every bit. */ + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_MASK_ADDR + (i * sizeof(u32)), 0xFFFFFFFFu); + if (err) + goto err_reg; + } + + /* Command: qid=host_id, op=WR, sel=HOST_PROFILE (0x34 for id 0, 0xB4 for id 1). 
*/ + cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_WR << 5) | (SLASH_QDMA_HP_SEL << 1); + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, cmd); + if (err) + goto err_reg; + + /* Wait for the controller to consume the command. */ + err = slash_qdma_hp_wait_ready(device, &val); + if (err == -ETIMEDOUT) { + dev_err(&device->pdev->dev, + "qdma: host profile %u programming timed out (cmd=0x%x)\n", + host_id, val); + return -ETIMEDOUT; + } + if (err) + goto err_reg; + + /* + * Read the profile back and verify the programmed fields. A readback + * error or field mismatch is non-fatal: the write itself completed, so + * the profile is still considered applied. + */ + { + u32 rb[SLASH_QDMA_HP_NUM_WORDS] = {0}; + int rerr = slash_qdma_read_host_profile(device, host_id, rb); + + if (rerr) { + dev_warn(&device->pdev->dev, + "slash: qdma: host profile %u applied (cmd=0x%02x) but readback failed: %d\n", + host_id, cmd, rerr); + } else { + u32 smid_rb = slash_qdma_hp_get_field(rb, 201, 192); + u32 h2c_rb = slash_qdma_hp_get_field(rb, 181, 178); + u32 c2h_rb = slash_qdma_hp_get_field(rb, 97, 94); + + if (smid_rb == smid && h2c_rb == host_id && c2h_rb == host_id) { + dev_info(&device->pdev->dev, + "slash: qdma: host profile %u applied and readback verified: H2C/C2H AXI-MM steering=%u (NoC channel %u), smid=0x%03x (cmd=0x%02x)\n", + host_id, host_id, host_id, smid, cmd); + } else { + dev_err(&device->pdev->dev, + "slash: qdma: host profile %u readback MISMATCH: smid exp=0x%03x got=0x%03x, h2c exp=%u got=%u, c2h exp=%u got=%u\n", + host_id, smid, smid_rb, host_id, h2c_rb, + host_id, c2h_rb); + } + } + } + return 0; + +err_reg: + dev_err(&device->pdev->dev, + "qdma: host profile %u register access failed: %d\n", + host_id, err); + return err; +} + +/** + * slash_qdma_program_host_profiles() - Program the CPM5 Host Profiles. + * @device: QDMA device. 
+ * + * Programs Host Profile 0 (steer to NoC Channel 0) and Host Profile 1 + * (steer to NoC Channel 1). Must run after qdma_device_open() (which + * clears all contexts) and before any queue context is programmed, per + * the CPM5 requirement that the host profile exist before AXI4-MM + * queues are set up. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_program_host_profiles(struct slash_qdma_dev *device) +{ + u32 host_id; + int err; + + dev_info(&device->pdev->dev, + "slash: qdma: programming CPM5 host profiles (host_id 0 -> NoC channel 0, host_id 1 -> NoC channel 1)\n"); + + for (host_id = 0; host_id <= 1; host_id++) { + err = slash_qdma_write_host_profile(device, host_id); + if (err) + return err; + } + + dev_info(&device->pdev->dev, + "slash: qdma: CPM5 host profiles programmed\n"); + + return 0; +} + /* ───────────────────────────────────────────────────────────────────── * PCI probe / remove * ───────────────────────────────────────────────────────────────────── */ @@ -912,6 +1296,20 @@ static int slash_qdma_probe(struct pci_dev *pdev, const struct pci_device_id *id device->qdma_handle); device->have_qdma_handle = true; + /* + * Program the CPM5 Host Profiles before exposing the miscdevice, so + * they exist before userspace can add any queue. Host ID 0 steers + * AXI4-MM traffic to NoC Channel 0 and Host ID 1 to NoC Channel 1; + * the per-queue SW-context host_id (mirrored from mm_channel = qid & 1) + * selects between them. + */ + err = slash_qdma_program_host_profiles(device); + if (err) { + dev_err(&pdev->dev, + "slash: qdma: could not program host profiles: %d", err); + goto err_free; + } + /* Register the management miscdevice so userspace can issue ioctls. 
*/ err = misc_register(&device->misc); if (err) { @@ -1236,13 +1634,15 @@ static void slash_qdma_conf_options(struct qdma_dev_conf *conf, struct pci_dev * static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned long arg) { struct slash_qdma_dev *qdma_dev = file->private_data; - struct miscdevice *misc = &qdma_dev->misc; + struct miscdevice *misc; void __user *uarg = (void __user *)arg; long ret = 0; if (!qdma_dev) return -ENODEV; + misc = &qdma_dev->misc; + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, "ioctl op=0x%x\n", op); /* Early rejection if the device is shutting down. */ @@ -1270,6 +1670,10 @@ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned lo ret = slash_qdma_ioctl_qpair_get_fd_w(misc, qdma_dev, uarg); break; + case SLASH_QDMA_IOCTL_BUF_CREATE: + ret = slash_qdma_ioctl_buf_create_w(misc, qdma_dev, uarg); + break; + default: ret = -ENOTTY; break; @@ -1295,18 +1699,18 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) struct miscdevice *misc = file->private_data; struct slash_qdma_dev *qdma_dev = container_of(misc, struct slash_qdma_dev, misc); - int ret = 0; mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { - ret = -ENODEV; - } else { - kref_get(&qdma_dev->ref); - file->private_data = qdma_dev; + mutex_unlock(&qdma_dev->lock); + return -ENODEV; } + kref_get(&qdma_dev->ref); mutex_unlock(&qdma_dev->lock); - return ret; + file->private_data = qdma_dev; + + return 0; } /** @@ -1323,8 +1727,12 @@ static int slash_qdma_fop_release(struct inode *inode, struct file *file) { struct slash_qdma_dev *qdma_dev = file->private_data; - if (qdma_dev) - kref_put(&qdma_dev->ref, slash_qdma_dev_release); + if (!qdma_dev) + return 0; + + kref_put(&qdma_dev->ref, slash_qdma_dev_release); + + file->private_data = NULL; return 0; } @@ -1481,6 +1889,12 @@ static int slash_qdma_ioctl_qpair_add_w(struct miscdevice *misc, if (req.h2c_ring_sz >= 16 || req.c2h_ring_sz >= 16 
|| req.cmpt_ring_sz >= 16) return -EINVAL; + /* Validate the per-queue AXI-MM channel selection. */ + if (req.mm_channel != SLASH_QDMA_MM_CHANNEL_AUTO && + req.mm_channel != SLASH_QDMA_MM_CHANNEL_0 && + req.mm_channel != SLASH_QDMA_MM_CHANNEL_1) + return -EINVAL; + mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { mutex_unlock(&qdma_dev->lock); @@ -1619,8 +2033,9 @@ static int slash_qdma_ioctl_qpair_add(struct miscdevice *misc, * (required for poll-mode operation per the reference driver). * - qconf.cmpl_stat_en = 1: enable completion status generation * (required for poll-mode operation per the reference driver). - * - qconf.aperture_size = 4096: page-granularity (4 KB) for descriptor - * addressing. Each descriptor addresses one page-sized chunk. + * - qconf.aperture_size = 0: disables libqdma keyhole mode so MM + * transfers advance linearly through endpoint memory. Non-zero + * values are keyhole apertures and wrap addresses within that window. * - qconf.desc_rng_sz_idx: CSR table index (0-15) selecting the * descriptor ring depth. Not a raw descriptor count — the actual * count is looked up from the global CSR ring-size table. @@ -1664,7 +2079,28 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, qconf.cmpl_status_pend_chk = 1; /* Check pending completions (poll-mode req) */ qconf.cmpl_stat_en = 1; /* Enable completion status generation */ - qconf.aperture_size = 4096; /* Page-granularity descriptor addressing */ + qconf.aperture_size = 0; /* Linear MM addressing; non-zero enables keyhole mode */ + /* + * CPM5 exposes two MM channels. The per-queue mm_channel selection + * (validated in slash_qdma_ioctl_qpair_add_w) chooses the channel: AUTO + * stripes across channels by (qid & 1); CHANNEL_0/CHANNEL_1 pin to a single + * channel. 
libqdma mirrors mm_channel into the SW-context host_id, so this + * also selects the programmed Host Profile: channel 0 -> Host Profile 0 + * (NoC Channel 0), channel 1 -> Host Profile 1 (NoC Channel 1). See + * slash_qdma_program_host_profiles(). + */ + switch (req->mm_channel) { + case SLASH_QDMA_MM_CHANNEL_0: + qconf.mm_channel = 0; + break; + case SLASH_QDMA_MM_CHANNEL_1: + qconf.mm_channel = 1; + break; + case SLASH_QDMA_MM_CHANNEL_AUTO: + default: + qconf.mm_channel = req->qid & 1; + break; + } /* --- Per-direction ring configuration --- */ switch (qtype) { @@ -1688,8 +2124,9 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, } SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_queue_add start: qid=%u type=%u mode=%u\n", - req->qid, qtype, req->mode); + "queue add qid=%u type=%u mode=%u mm_channel=%u (req=%u)\n", + req->qid, qtype, req->mode, qconf.mm_channel, + req->mm_channel); err = qdma_queue_add(qdma_dev->qdma_handle, &qconf, &qhndl, errbuf, sizeof(errbuf)); if (err) { @@ -1705,6 +2142,38 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, "qdma_queue_add done: qid=%u type=%u qhndl=%lu\n", req->qid, qtype, qhndl); + /* + * Reconfigure the queue immediately after adding it. + * + * qdma_queue_add() runs qdma_descq_config(..., reconfig=0), which on + * Versal hard IP does NOT mirror qconf.mm_channel into descq->channel -- + * only the reconfig=1 branch does. descq->channel feeds the SW-context + * mm_chn/host_id programmed when the queue is started; without this step + * it would stay 0 and collapse both queues onto NoC channel 0, defeating + * mm-channel selection. Calling qdma_queue_config() here (the queue is in + * Q_STATE_ENABLED, before start) replays the same qconf through the + * reconfig=1 path, setting descq->channel. This replaces the former + * 0002-libqdma-versal-channel.patch without modifying libqdma. 
+ */ + err = qdma_queue_config(qdma_dev->qdma_handle, qhndl, &qconf, + errbuf, sizeof(errbuf)); + if (err) { + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "qdma_queue_config failed: qid=%u type=%u err=%d (%s)\n", + req->qid, qtype, err, errbuf); + dev_err(&qdma_dev->pdev->dev, + "qdma: queue config failed (qid=%u, type=%u): %d (%s)\n", + req->qid, qtype, err, errbuf); + /* + * The queue was added but is not yet tracked in @entry, so the + * caller's rollback (keyed on its local added[] array) will not + * reach it. Remove it here to avoid leaking the libqdma queue. + */ + slash_qdma_queue_remove_safe(qdma_dev->qdma_handle, qhndl, + errbuf, sizeof(errbuf)); + return err; + } + /* Record the handle and mark this direction as active. */ entry->qhndl[qtype] = qhndl; entry->dir_mask |= dir_bit; @@ -1973,359 +2442,1052 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev, } /* ───────────────────────────────────────────────────────────────────── - * DMA I/O: user buffer mapping, SGL construction, and transfer + * Kernel DMA buffers: page allocation, SGL, DMA mapping, mmap + * + * A buffer owns a set of individually-allocated 4 KiB base pages (not + * physically contiguous). At creation time the pages are allocated, a + * one-descriptor-per-page SGL is built, and every page is DMA-mapped once; + * the steady-state transfer path then only slices the SGL, syncs the touched + * pages for the relevant DMA direction, and submits. The same pages are also + * exposed to userspace through the buffer fd's mmap, so the CPU and the DMA + * engine share one allocation, coherent only at the transfer boundaries. * ───────────────────────────────────────────────────────────────────── */ /** - * slash_qdma_iocb_release() - Free resources in an I/O control block. - * @iocb: The IOCB to clean up. + * slash_qdma_buf_dma_unmap() - Tear down the cached DMA mapping of a buffer. + * @buf: Buffer whose SGL entries were DMA-mapped. 
* - * Frees the combined SGL + page-pointer allocation and clears the - * pointers. Does not unpin pages — that must be done separately via - * slash_qdma_unmap_user_buf() before calling this. + * Unmaps every SGL entry that carries a non-zero dma_addr and clears it. + * Safe to call on a partially-mapped buffer (used on the create error path). */ -static inline void slash_qdma_iocb_release(struct slash_qdma_io_cb *iocb) +static void slash_qdma_buf_dma_unmap(struct slash_qdma_buf *buf) { - if (iocb->pages) - iocb->pages = NULL; + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; - kfree(iocb->sgl); - iocb->sgl = NULL; - iocb->buf = NULL; + if (!buf->sgl || !buf->dma_mapped) + return; + + for (i = 0; i < buf->pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->sgl[i]; + + if (sg->dma_addr) { + dma_unmap_page(dev, sg->dma_addr, sg->len, DMA_BIDIRECTIONAL); + sg->dma_addr = 0UL; + } + } + + buf->dma_mapped = false; } /** - * slash_qdma_unmap_user_buf() - Unpin user pages after a DMA transfer. - * @iocb: I/O control block with pinned pages. - * @write: Transfer direction from the device's perspective. If false - * (i.e., a C2H/read transfer), the pages were written to by the - * device and must be marked dirty so the VM knows the page - * contents have changed. + * slash_qdma_buf_dma_map() - DMA-map every SGL entry of a buffer. + * @buf: Buffer with a freshly built SGL. * - * Iterates over pinned pages, marks them dirty if this was a read (C2H) - * transfer (because the device wrote data into those user pages), and - * releases each page reference acquired by get_user_pages_fast(). + * Maps each page with DMA_BIDIRECTIONAL so the same cached mapping serves both + * H2C and C2H transfers. On any failure all previously mapped entries are + * unmapped before returning. + * + * Return: 0 on success, negative errno on failure. 
*/ -static void slash_qdma_unmap_user_buf(struct slash_qdma_io_cb *iocb, bool write) +static int slash_qdma_buf_dma_map(struct slash_qdma_buf *buf) { - int i; + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; + + for (i = 0; i < buf->pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->sgl[i]; + + sg->dma_addr = dma_map_page(dev, sg->pg, sg->offset, sg->len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, sg->dma_addr)) { + sg->dma_addr = 0UL; + pr_err("slash: qdma: buffer DMA map failed at entry %u/%u\n", + i, buf->pages_nr); + buf->dma_mapped = true; /* allow unmap of the entries done so far */ + slash_qdma_buf_dma_unmap(buf); + return -ENOMEM; + } + } - if (!iocb->pages || !iocb->pages_nr) - return; + buf->dma_mapped = true; + return 0; +} - for (i = 0; i < iocb->pages_nr; i++) { - if (iocb->pages[i]) { - /* - * For C2H (read) transfers (!write), the device wrote into - * these user pages, so mark them dirty to inform the VM. - */ - if (!write) - set_page_dirty(iocb->pages[i]); - put_page(iocb->pages[i]); - } else { - break; +/** + * slash_qdma_buf_free_pages() - Free a buffer's pages and SGL. + * @buf: Buffer to tear down. + * + * Releases each allocated page (put_page() so pages still mapped into a VMA + * stay alive until the last mapping is torn down) and frees the SGL/page + * arrays. The DMA mapping must already have been removed. + */ +static void slash_qdma_buf_free_pages(struct slash_qdma_buf *buf) +{ + unsigned int i; + + if (buf->pages) { + for (i = 0; i < buf->pages_nr; i++) { + if (buf->pages[i]) + put_page(buf->pages[i]); } } - if (i != iocb->pages_nr) - pr_err("slash: qdma: sgl pages %d/%u.\n", i, iocb->pages_nr); - - iocb->pages_nr = 0; + kvfree(buf->pages); + buf->pages = NULL; + kvfree(buf->sgl); + buf->sgl = NULL; + buf->pages_nr = 0; } /** - * slash_qdma_map_user_buf_to_sgl() - Pin user pages and build a scatter-gather list. - * @iocb: I/O control block. @iocb->buf and @iocb->len must be set - * before calling. 
On success, @iocb->sgl, @iocb->pages, and - * @iocb->pages_nr are populated. - * @write: Transfer direction (true = H2C write, false = C2H read). + * slash_qdma_buf_alloc() - Allocate pages, build the SGL, and DMA-map. + * @buf: Buffer with @length and @qdma_dev set; @granule defaults to PAGE_SIZE. * - * Steps: - * 1. Compute the number of pages spanned by the user buffer (accounting - * for the offset within the first page). - * 2. Allocate a single contiguous block for the SGL entries and the - * page pointer array (avoids two allocations). - * 3. Pin user pages via get_user_pages_fast() with write=1 (even for - * H2C, because libqdma may write status back). - * 4. Build the qdma_sw_sg linked list: one entry per page, with the - * first entry's offset reflecting the sub-page position of the - * user buffer, and the last entry's length truncated to the - * remaining byte count. - * 5. Flush the data cache for each page to ensure coherency between - * the CPU cache and the DMA engine's view of memory. + * Allocates @length / PAGE_SIZE individual base pages (not contiguous), builds + * a one-page-per-entry SGL, and DMA-maps every page. All of this is the + * amortised, do-it-once setup cost paid by SLASH_QDMA_IOCTL_BUF_CREATE. * - * Return: 0 on success, negative errno on failure (pages are unpinned - * and the SGL is freed on error). + * Return: 0 on success, negative errno on failure (partial state cleaned up). 
*/ -static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, - bool write) +static int slash_qdma_buf_alloc(struct slash_qdma_buf *buf) { - unsigned long len = iocb->len; - char *buf = (char *)iocb->buf; - struct qdma_sw_sg *sg; - unsigned int pg_off = offset_in_page(buf); - unsigned int pages_nr = (len + pg_off + PAGE_SIZE - 1) >> PAGE_SHIFT; - int i; + size_t entries = buf->length / PAGE_SIZE; + unsigned int i; int rv; - if (len == 0) - pages_nr = 1; - if (pages_nr == 0) + if (buf->length == 0 || (buf->length % PAGE_SIZE) != 0 || + entries == 0 || entries > UINT_MAX) return -EINVAL; - iocb->pages_nr = 0; + buf->granule = PAGE_SIZE; + buf->pages_nr = (unsigned int)entries; - /* - * Single allocation for both the SGL array and the page pointer - * array. The page pointers are placed immediately after the SGL - * entries in memory. - */ - sg = kmalloc(pages_nr * (sizeof(struct qdma_sw_sg) + - sizeof(struct page *)), GFP_KERNEL); - if (!sg) { - pr_err("slash: qdma: sgl allocation failed for %u pages\n", - pages_nr); + buf->pages = kvcalloc(entries, sizeof(*buf->pages), GFP_KERNEL); + if (!buf->pages) return -ENOMEM; - } - memset(sg, 0, pages_nr * (sizeof(struct qdma_sw_sg) + - sizeof(struct page *))); - iocb->sgl = sg; - - /* Page pointer array lives right after the SGL entries. */ - iocb->pages = (struct page **)(sg + pages_nr); - /* - * Pin the user pages into physical memory. The write=1 flag tells - * the kernel these pages may be written to (needed for C2H, but we - * always request write permission for simplicity). 
- */ - rv = get_user_pages_fast((unsigned long)buf, pages_nr, - 1 /* write */, iocb->pages); - if (rv < 0) { - pr_err("slash: qdma: unable to pin down %u user pages, %d\n", - pages_nr, rv); - goto err_out; - } - if (rv != pages_nr) { - pr_err("slash: qdma: unable to pin down all %u user pages, %d\n", - pages_nr, rv); - iocb->pages_nr = rv; - rv = -EFAULT; - goto err_out; + buf->sgl = kvcalloc(entries, sizeof(*buf->sgl), GFP_KERNEL); + if (!buf->sgl) { + kvfree(buf->pages); + buf->pages = NULL; + return -ENOMEM; } - /* - * Build the scatter-gather list. Each entry describes one page's - * worth of data. The first page may have a non-zero offset, and - * the last page may have fewer than PAGE_SIZE bytes. - */ - sg = iocb->sgl; - for (i = 0; i < pages_nr; i++, sg++) { - unsigned int offset = offset_in_page(buf); - unsigned int nbytes = min_t(unsigned int, - PAGE_SIZE - offset, len); - struct page *pg = iocb->pages[i]; + for (i = 0; i < entries; i++) { + struct page *pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + struct qdma_sw_sg *sg = &buf->sgl[i]; - /* Ensure CPU cache is flushed so the DMA engine sees fresh data. */ - flush_dcache_page(pg); + if (!pg) { + rv = -ENOMEM; + goto err_free; + } - sg->next = sg + 1; + buf->pages[i] = pg; + sg->next = (i + 1 < entries) ? &buf->sgl[i + 1] : NULL; sg->pg = pg; - sg->offset = offset; - sg->len = nbytes; + sg->offset = 0; + sg->len = PAGE_SIZE; sg->dma_addr = 0UL; - - buf += nbytes; - len -= nbytes; } - /* Terminate the linked list. */ - iocb->sgl[pages_nr - 1].next = NULL; - iocb->pages_nr = pages_nr; - return 0; + rv = slash_qdma_buf_dma_map(buf); + if (rv < 0) + goto err_free; -err_out: - slash_qdma_unmap_user_buf(iocb, write); - slash_qdma_iocb_release(iocb); + return 0; +err_free: + slash_qdma_buf_free_pages(buf); return rv; } /** - * slash_qdma_qpair_read_write() - Perform a DMA transfer via a qpair fd. - * @file: The anon_inode file for this queue pair. 
- * @buf: User-space buffer (source for write/H2C, destination for read/C2H). - * @count: Number of bytes to transfer. - * @ppos: File position — used as the device-side (endpoint) address. - * Updated on success to reflect the bytes transferred, enabling - * sequential positional I/O. - * @write: true for H2C (host-to-card write), false for C2H (card-to-host read). - * - * Transfer flow: - * 1. Validate context and check that the required direction (H2C or C2H) - * is enabled on this queue pair. - * 2. Pin user pages and build a scatter-gather list. - * 3. Populate a qdma_request: - * - ep_addr = *ppos: the device-side address (FPGA memory offset). - * - h2c_eot = 1: signals end-of-transfer to the FPGA, allowing it to - * process the complete data packet. - * - timeout_ms = 10000 (10 seconds): if the transfer doesn't complete - * in this time, qdma_request_submit returns an error. - * - fp_done = NULL: synchronous mode — the call blocks until completion. - * If fp_done were set, libqdma would call it asynchronously. - * - dma_mapped = 0: libqdma handles the DMA mapping internally. - * 4. Submit to libqdma via qdma_request_submit(). - * 5. On success, advance *ppos by the number of bytes transferred. - * 6. Unpin pages and free the SGL. - * - * Return: Number of bytes transferred (>= 0) on success, negative errno - * on failure. - */ -static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, - size_t count, loff_t *ppos, - bool write) + * slash_qdma_buf_sync_for_device() - Hand a transfer slice to the device. + * @buf: Buffer being transferred. + * @start_entry: First page index of the slice. + * @n_entries: Number of pages in the slice. + * @dir: DMA direction (DMA_TO_DEVICE for H2C, DMA_FROM_DEVICE for C2H). + * + * Synchronises CPU-written data out to the device (and/or invalidates CPU + * caches) for exactly the pages a sub-transfer touches. On cache-coherent + * hosts these are no-ops; on others they bound coherency to the transfer. 
+ */ +static void slash_qdma_buf_sync_for_device(struct slash_qdma_buf *buf, + u64 start_entry, u64 n_entries, + enum dma_data_direction dir) { - struct slash_qdma_qpair_file_ctx *ctx = file->private_data; - struct slash_qdma_dev *qdma_dev; - struct slash_qdma_qpair_entry *entry; - struct slash_qdma_io_cb iocb; - struct qdma_request *req; - unsigned long qhndl; - ssize_t res; - int rv; + struct device *dev = &buf->qdma_dev->pdev->dev; + u64 i; - if (!ctx) - return -EINVAL; - - qdma_dev = ctx->qdma_dev; - entry = ctx->entry; - - if (!qdma_dev || !entry) - return -ENODEV; - - /* Check device liveness and resolve the queue handle for the direction. */ - mutex_lock(&qdma_dev->lock); - if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } + for (i = 0; i < n_entries; i++) { + struct qdma_sw_sg *sg = &buf->sgl[start_entry + i]; - if (write) { - /* H2C: writing data from host to card */ - if (!(entry->dir_mask & SLASH_QDMA_DIR_H2C) || - !slash_qdma_qhndl_is_valid(entry->qhndl[Q_H2C])) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } - qhndl = entry->qhndl[Q_H2C]; - } else { - /* C2H: reading data from card to host */ - if (!(entry->dir_mask & SLASH_QDMA_DIR_C2H) || - !slash_qdma_qhndl_is_valid(entry->qhndl[Q_C2H])) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } - qhndl = entry->qhndl[Q_C2H]; + dma_sync_single_for_device(dev, sg->dma_addr, sg->len, dir); } - mutex_unlock(&qdma_dev->lock); - - /* Pin user pages and build the scatter-gather list. */ - memset(&iocb, 0, sizeof(iocb)); - iocb.buf = buf; - iocb.len = count; - rv = slash_qdma_map_user_buf_to_sgl(&iocb, write); - if (rv < 0) - return rv; - - /* Populate the libqdma request structure. */ - req = &iocb.req; - req->sgcnt = iocb.pages_nr; /* Number of SGL entries */ - req->sgl = iocb.sgl; /* Scatter-gather list */ - req->write = write ? 
1 : 0; /* Direction flag for libqdma */ - req->dma_mapped = 0; /* Let libqdma handle DMA mapping */ - req->udd_len = 0; /* No user-defined data */ - req->ep_addr = (u64)*ppos; /* Device-side (endpoint) address */ - req->count = count; /* Total byte count */ - req->timeout_ms = 10 * 1000; /* 10-second timeout */ - req->fp_done = NULL; /* Synchronous: block until complete */ - req->h2c_eot = 1; /* End-of-transfer marker for FPGA */ - - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_request_submit start: qid=%u qhndl=%lu write=%d count=%zu ep_addr=0x%llx\n", - ctx->qid, qhndl, req->write, req->count, - (unsigned long long)req->ep_addr); - res = qdma_request_submit(qdma_dev->qdma_handle, qhndl, req); - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_request_submit done: qid=%u qhndl=%lu res=%zd\n", - ctx->qid, qhndl, res); +} - /* Advance the file position by the number of bytes transferred. */ - if (res > 0) - *ppos += res; +/** + * slash_qdma_buf_sync_for_cpu() - Reclaim a transfer slice for the CPU. + * @buf: Buffer being transferred. + * @start_entry: First page index of the slice. + * @n_entries: Number of pages in the slice. + * @dir: DMA direction (DMA_FROM_DEVICE for a completed C2H read). + * + * Makes device-written data visible to the CPU for exactly the pages a C2H + * sub-transfer touched. Called after the transfer completes. + */ +static void slash_qdma_buf_sync_for_cpu(struct slash_qdma_buf *buf, + u64 start_entry, u64 n_entries, + enum dma_data_direction dir) +{ + struct device *dev = &buf->qdma_dev->pdev->dev; + u64 i; - /* Unpin pages (marking dirty for C2H reads) and free the SGL. */ - slash_qdma_unmap_user_buf(&iocb, write); - slash_qdma_iocb_release(&iocb); + for (i = 0; i < n_entries; i++) { + struct qdma_sw_sg *sg = &buf->sgl[start_entry + i]; - return res; + dma_sync_single_for_cpu(dev, sg->dma_addr, sg->len, dir); + } } /** - * slash_qdma_qpair_read() - Read (C2H) file operation for a qpair fd. 
- * @file: Anon_inode file for the queue pair. - * @buf: User-space destination buffer. - * @count: Number of bytes to read. - * @ppos: Device-side address to read from. - * - * Thin wrapper that delegates to slash_qdma_qpair_read_write() with - * write=false (C2H direction). + * slash_qdma_buf_release() - kref release callback for a buffer. + * @ref: kref embedded in the slash_qdma_buf being freed. * - * Return: Bytes transferred or negative errno. + * Runs when the last reference drops (fd ref, every live VMA ref, and any + * in-flight transfer ref). Tears down the DMA mapping, frees the pages and + * SGL, drops the device reference, and frees the struct. */ -static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static void slash_qdma_buf_release(struct kref *ref) { - return slash_qdma_qpair_read_write(file, buf, count, ppos, false); + struct slash_qdma_buf *buf = + container_of(ref, struct slash_qdma_buf, ref); + struct slash_qdma_dev *qdma_dev = buf->qdma_dev; + + slash_qdma_buf_dma_unmap(buf); + slash_qdma_buf_free_pages(buf); + if (qdma_dev) + kref_put(&qdma_dev->ref, slash_qdma_dev_release); + kfree(buf); } -/** - * slash_qdma_qpair_write() - Write (H2C) file operation for a qpair fd. - * @file: Anon_inode file for the queue pair. - * @buf: User-space source buffer. - * @count: Number of bytes to write. - * @ppos: Device-side address to write to. - * - * Thin wrapper that delegates to slash_qdma_qpair_read_write() with - * write=true (H2C direction). - * - * Return: Bytes transferred or negative errno. 
- */ -static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static inline void slash_qdma_buf_get(struct slash_qdma_buf *buf) { - return slash_qdma_qpair_read_write(file, (char __user *)buf, - count, ppos, true); + kref_get(&buf->ref); } -/** +static void slash_qdma_buf_put(struct slash_qdma_buf *buf) +{ + kref_put(&buf->ref, slash_qdma_buf_release); +} + +/* ───────────────────────────────────────────────────────────────────── + * Buffer fd: mmap support and lifetime + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_buf_vm_open() - VMA open callback (fork / VMA split). + * @vma: The VMA gaining an independent reference. + * + * Each live VMA holds one buffer reference so the pages (and DMA mapping) + * outlive the buffer fd if userspace keeps the mapping after close(). + */ +static void slash_qdma_buf_vm_open(struct vm_area_struct *vma) +{ + struct slash_qdma_buf *buf = vma->vm_private_data; + + if (buf) + slash_qdma_buf_get(buf); +} + +/** + * slash_qdma_buf_vm_close() - VMA close callback (munmap / exit). + * @vma: The VMA being torn down. + */ +static void slash_qdma_buf_vm_close(struct vm_area_struct *vma) +{ + struct slash_qdma_buf *buf = vma->vm_private_data; + + if (buf) + slash_qdma_buf_put(buf); +} + +static const struct vm_operations_struct slash_qdma_buf_vm_ops = { + .open = slash_qdma_buf_vm_open, + .close = slash_qdma_buf_vm_close, +}; + +/** + * slash_qdma_buf_mmap() - mmap a kernel buffer's pages into userspace. + * @file: The buffer fd. + * @vma: The mapping request. + * + * Maps the whole buffer (offset 0, full length) into the calling process. + * The pages are ordinary kernel pages, so vm_map_pages_zero() inserts them + * directly; each VMA takes a buffer reference (initial one here, duplicated by + * the .open callback) so the pages stay valid for the life of the mapping. + * + * Return: 0 on success, negative errno on failure. 
+ */ +static int slash_qdma_buf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct slash_qdma_buf *buf = file->private_data; + unsigned long span = vma->vm_end - vma->vm_start; + int rv; + + if (!buf) + return -ENODEV; + + /* Only a full, offset-0 mapping of the buffer is supported. */ + if (vma->vm_pgoff != 0) + return -EINVAL; + if (span != (unsigned long)buf->length) + return -EINVAL; + + /* + * Normal page mapping (no VM_PFNMAP): keep it from being expanded beyond + * the buffer and excluded from core dumps. + */ + slash_vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); + + rv = vm_map_pages_zero(vma, buf->pages, buf->pages_nr); + if (rv) + return rv; + + vma->vm_ops = &slash_qdma_buf_vm_ops; + vma->vm_private_data = buf; + slash_qdma_buf_get(buf); /* dropped by vm_close when this VMA goes away */ + + return 0; +} + +/** + * slash_qdma_buf_fop_release() - Release callback for a buffer fd. + * @inode: Unused (anon inode). + * @file: The buffer fd being closed. + * + * Drops the fd's buffer reference. Pages survive until any remaining VMA + * references are dropped too. + * + * Return: Always 0. + */ +static int slash_qdma_buf_fop_release(struct inode *inode, struct file *file) +{ + struct slash_qdma_buf *buf = file->private_data; + + (void)inode; + + if (buf) { + slash_qdma_buf_put(buf); + file->private_data = NULL; + } + + return 0; +} + +/** + * slash_qdma_buf_fops - File operations for buffer fds. + * + * mmap maps the buffer's pages for CPU access. + * release drops the fd's reference on the buffer. + */ +static const struct file_operations slash_qdma_buf_fops = { + .owner = THIS_MODULE, + .mmap = slash_qdma_buf_mmap, + .release = slash_qdma_buf_fop_release, +}; + +/** + * slash_qdma_buf_from_file() - Resolve a buffer fd to its buffer object. + * @file: A file obtained from fget() on a candidate buffer fd. + * + * Return: The buffer if @file is a SLASH buffer fd, else NULL. 
+ */ +static struct slash_qdma_buf *slash_qdma_buf_from_file(struct file *file) +{ + if (!file || file->f_op != &slash_qdma_buf_fops) + return NULL; + return file->private_data; +} + +/* ───────────────────────────────────────────────────────────────────── + * IOCTL: buffer create + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_ioctl_buf_create_w() - Allocate a kernel buffer and return its fd. + * @misc: Miscdevice handle (for logging). + * @qdma_dev: QDMA device the buffer is bound to (for DMA mapping). + * @uarg: User pointer to a struct slash_qdma_buf_create. + * + * Allocates the buffer's pages, builds the SGL, and DMA-maps everything once, + * then wraps it in an anon_inode fd whose mmap exposes the pages for CPU + * access. The fd is returned as the ioctl return value (same convention as + * the BAR/queue-pair fd ioctls). Closing the fd (and unmapping any VMA) + * releases the buffer. + * + * Return: The new buffer fd (>= 0) on success, negative errno on failure. + */ +static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc, + struct slash_qdma_dev *qdma_dev, + void __user *uarg) +{ + struct slash_qdma_buf_create req; + struct slash_qdma_buf *buf; + struct file *file; + __u32 user_size = 0; + size_t copy_size; + int fd; + int rv; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_BUF_CREATE_MIN_SIZE) { + dev_warn(misc->this_device, + "qdma: BUF_CREATE size too small (%u)\n", user_size); + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + if (req.flags & ~O_CLOEXEC) + return -EINVAL; + + if (req.length == 0 || (req.length % PAGE_SIZE) != 0) + return -EINVAL; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + kref_init(&buf->ref); + /* The buffer holds a device reference for its whole lifetime. 
*/ + kref_get(&qdma_dev->ref); + buf->qdma_dev = qdma_dev; + buf->length = req.length; + + rv = slash_qdma_buf_alloc(buf); + if (rv < 0) { + kref_put(&qdma_dev->ref, slash_qdma_dev_release); + kfree(buf); + return rv; + } + + file = anon_inode_getfile("slash_qdma_buf", &slash_qdma_buf_fops, buf, + O_RDWR | (req.flags & O_CLOEXEC)); + if (IS_ERR(file)) { + rv = PTR_ERR(file); + slash_qdma_buf_put(buf); /* drops the only ref: frees buf + dev ref */ + return rv; + } + + fd = get_unused_fd_flags(req.flags & O_CLOEXEC); + if (fd < 0) { + fput(file); /* triggers buf release */ + return fd; + } + + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "buf create: fd=%d len=%llu granule=%u pages=%u\n", + fd, (unsigned long long)req.length, + buf->granule, buf->pages_nr); + + /* Fill the output fields before installing the fd. */ + req.size = sizeof(req); + req.granule = buf->granule; + req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; + copy_size = min_t(size_t, user_size, sizeof(req)); + if (copy_to_user(uarg, &req, copy_size)) { + put_unused_fd(fd); + fput(file); + return -EFAULT; + } + if (user_size > sizeof(req)) { + if (clear_user((void __user *)((unsigned long)uarg + sizeof(req)), + user_size - sizeof(req))) { + put_unused_fd(fd); + fput(file); + return -EFAULT; + } + } + + fd_install(fd, file); + + return fd; +} + +/** + * struct slash_qdma_xfer_req - Runtime state for one sub-transfer submission. + * @qreq: libqdma request (built by slash_qdma_xfer_prep()). + * @done: Completion signalled by @qreq.fp_done for async submissions. + * @buf: Kernel buffer the transfer references (one ref held). + * @qhndl: Resolved libqdma queue handle for the direction/qpair. + * @start_entry: First page index of the buffer slice being transferred. + * @n_entries: Number of pages in the slice (for the DMA sync). + * @dma_dir: DMA direction for the streaming sync calls. + * @is_c2h: True for a C2H (device-to-host) sub-transfer, so the slice + * is synced back for the CPU after completion. 
+ * @bytes_done: Bytes transferred, filled on completion. + * @err: Negative errno if the sub-transfer failed, else 0. + * @async_inflight: True once queued asynchronously and awaiting fp_done. + * + * Allocated as an array (one per sub-transfer) for the duration of a transfer + * batch. @qreq must outlive the in-flight async request, so the array stays + * alive until every async completion has fired. + */ +struct slash_qdma_xfer_req { + struct qdma_request qreq; + struct completion done; + struct slash_qdma_buf *buf; + unsigned long qhndl; + u64 start_entry; + u64 n_entries; + enum dma_data_direction dma_dir; + bool is_c2h; + unsigned int bytes_done; + int err; + bool async_inflight; +}; + +/** + * slash_qdma_xfer_done() - libqdma fp_done callback for async sub-transfers. + * @qreq: The completed request (embedded in a slash_qdma_xfer_req). + * @bytes_done: Bytes transferred. + * @err: Negative errno on failure, else 0. + * + * Records the result and wakes the submitter waiting on @done. Runs in + * libqdma worker-thread context. + * + * Return: Always 0 (libqdma may free/re-task the request). + */ +static int slash_qdma_xfer_done(struct qdma_request *qreq, + unsigned int bytes_done, int err) +{ + struct slash_qdma_xfer_req *xr = + container_of(qreq, struct slash_qdma_xfer_req, qreq); + + xr->bytes_done = bytes_done; + xr->err = err; + complete(&xr->done); + return 0; +} + +/** + * slash_qdma_xfer_prep() - Validate one sub-transfer and build its request. + * @qdma_dev: QDMA device. + * @entry: Queue pair entry selected by the sub-transfer's qpair_index. + * @desc: User-supplied sub-transfer descriptor. + * @xr: [out] Receives a built (but not yet submitted) request, and a + * reference on the kernel buffer it targets. + * + * Shared submit core used by both the synchronous transfer ioctl and the + * optional io_uring uring_cmd path. 
Resolves the buffer fd named by the + * descriptor and refs the buffer, validates the slice against the buffer's + * page granule and length, resolves the queue handle for the requested + * direction, syncs the slice for the device, and fills the cached, + * pre-DMA-mapped SGL slice into @xr->qreq (dma_mapped = 1, fp_done = NULL). + * No pages are allocated or DMA-mapped here; that was amortised at creation. + * + * On success the caller owns the buffer ref in @xr->buf and must release it + * with slash_qdma_buf_put() once the request is no longer in flight. + * + * Return: 0 on success, negative errno on failure (no ref held on failure). + */ +static int slash_qdma_xfer_prep(struct slash_qdma_dev *qdma_dev, + struct slash_qdma_qpair_entry *entry, + const struct slash_qdma_subxfer *desc, + struct slash_qdma_xfer_req *xr) +{ + struct slash_qdma_buf *buf; + struct file *file; + unsigned long qhndl; + bool write; + u32 dir_bit; + enum queue_type_t qtype; + enum dma_data_direction dma_dir; + u64 start_entry, n_entries; + + switch (desc->direction) { + case SLASH_QDMA_XFER_H2C: + write = true; + dir_bit = SLASH_QDMA_DIR_H2C; + qtype = Q_H2C; + dma_dir = DMA_TO_DEVICE; + break; + case SLASH_QDMA_XFER_C2H: + write = false; + dir_bit = SLASH_QDMA_DIR_C2H; + qtype = Q_C2H; + dma_dir = DMA_FROM_DEVICE; + break; + default: + return -EINVAL; + } + + /* libqdma's request count is a 32-bit byte count. */ + if (desc->length == 0 || desc->length > UINT_MAX) + return -EINVAL; + + /* Resolve the buffer fd and take a ref that outlives the fd. */ + file = fget(desc->buf_fd); + if (!file) + return -EBADF; + buf = slash_qdma_buf_from_file(file); + if (!buf) { + fput(file); + return -EINVAL; + } + /* DMA mappings are device-specific: the buffer must belong to this device. */ + if (buf->qdma_dev != qdma_dev) { + fput(file); + return -EINVAL; + } + slash_qdma_buf_get(buf); + fput(file); + + /* Validate the requested slice against the buffer's page granule. 
*/ + if (buf->granule == 0 || + (desc->buf_offset % buf->granule) != 0 || + (desc->length % buf->granule) != 0) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + if (desc->buf_offset > buf->length || + desc->length > buf->length - desc->buf_offset) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + + start_entry = desc->buf_offset / buf->granule; + n_entries = desc->length / buf->granule; + if (start_entry + n_entries > buf->pages_nr) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + + /* Check device liveness and resolve the queue handle for the direction. */ + mutex_lock(&qdma_dev->lock); + if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { + mutex_unlock(&qdma_dev->lock); + slash_qdma_buf_put(buf); + return -ENODEV; + } + if (!(entry->dir_mask & dir_bit) || + !slash_qdma_qhndl_is_valid(entry->qhndl[qtype])) { + mutex_unlock(&qdma_dev->lock); + slash_qdma_buf_put(buf); + return -ENODEV; + } + qhndl = entry->qhndl[qtype]; + mutex_unlock(&qdma_dev->lock); + + /* + * Hand the touched pages to the device. The mapping is persistent + * (dma_mapped = 1); only this slice is synced, so coherency cost scales + * with the transfer, not the whole buffer. + */ + slash_qdma_buf_sync_for_device(buf, start_entry, n_entries, dma_dir); + + /* + * Build the request from the cached SGL slice. dma_mapped = 1 tells + * libqdma the SGL is already DMA-mapped (dma_addr filled at creation), + * so it skips the per-request map/unmap entirely. + */ + memset(&xr->qreq, 0, sizeof(xr->qreq)); + xr->qreq.sgcnt = (unsigned int)n_entries; + xr->qreq.sgl = &buf->sgl[start_entry]; + xr->qreq.write = write ? 
1 : 0; + xr->qreq.dma_mapped = 1; + xr->qreq.udd_len = 0; + xr->qreq.ep_addr = (u64)desc->dev_addr; + xr->qreq.count = (unsigned int)desc->length; + xr->qreq.timeout_ms = 10 * 1000; + xr->qreq.fp_done = NULL; + xr->qreq.h2c_eot = 1; + + xr->buf = buf; + xr->qhndl = qhndl; + xr->start_entry = start_entry; + xr->n_entries = n_entries; + xr->dma_dir = dma_dir; + xr->is_c2h = !write; + xr->bytes_done = 0; + xr->err = 0; + xr->async_inflight = false; + return 0; +} + +/** + * slash_qdma_xfer_finish() - Post-completion DMA sync + buffer ref drop. + * @xr: A prepared (and now completed) sub-transfer request. + * + * For a C2H sub-transfer that moved data, makes the device-written pages + * visible to the CPU before releasing the buffer reference taken in prep. + */ +static void slash_qdma_xfer_finish(struct slash_qdma_xfer_req *xr) +{ + if (xr->is_c2h && xr->bytes_done) + slash_qdma_buf_sync_for_cpu(xr->buf, xr->start_entry, xr->n_entries, + xr->dma_dir); + slash_qdma_buf_put(xr->buf); +} + +/** + * slash_qdma_qpair_transfer() - Buffer DMA transfer batch on a queue-pair fd. + * @file: Anon_inode file for the queue-pair collection. + * @uarg: User pointer to a struct slash_qdma_transfer (1..N sub-transfers). + * + * Validates and prepares every sub-transfer, then submits them so those that + * target distinct queue pairs run concurrently: all but the last are submitted + * asynchronously (fp_done set), the last is submitted synchronously (blocking), + * and the async ones are then waited on. A single sub-transfer therefore takes + * the plain blocking path with no async overhead. + * + * Return: total number of bytes transferred (>= 0) on success; the first + * sub-transfer error (negative errno) on failure. 
+ */ +static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) +{ + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + struct slash_qdma_dev *qdma_dev; + struct slash_qdma_transfer req; + struct slash_qdma_xfer_req *xrs; + __u32 user_size = 0; + u32 count, i, last; + u64 total = 0; + int first_err = 0; + ssize_t res; + + if (!ctx) + return -EINVAL; + + qdma_dev = ctx->qdma_dev; + + if (!qdma_dev || ctx->n_qpairs == 0) + return -ENODEV; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_TRANSFER_MIN_SIZE) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + count = req.count; + if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; + + xrs = kcalloc(count, sizeof(*xrs), GFP_KERNEL); + if (!xrs) + return -ENOMEM; + + /* Validate and prepare every sub-transfer (each takes a buffer ref). */ + for (i = 0; i < count; i++) { + const struct slash_qdma_subxfer *d = &req.xfers[i]; + int rv; + + if (d->qpair_index >= ctx->n_qpairs) + rv = -EINVAL; + else + rv = slash_qdma_xfer_prep(qdma_dev, + ctx->entries[d->qpair_index], d, + &xrs[i]); + if (rv) { + while (i-- > 0) + slash_qdma_buf_put(xrs[i].buf); + kfree(xrs); + return rv; + } + + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "transfer[%u]: qid=%u buf_fd=%d off=%llu dev=0x%llx len=%llu dir=%s\n", + i, ctx->qids[d->qpair_index], d->buf_fd, + (unsigned long long)d->buf_offset, + (unsigned long long)d->dev_addr, + (unsigned long long)d->length, + d->direction == SLASH_QDMA_XFER_H2C ? "H2C" : "C2H"); + } + + last = count - 1; + + /* + * Submit all but the last asynchronously so the sub-transfers run on their + * (distinct) queue pairs in parallel; libqdma calls fp_done on completion. 
+ */ + for (i = 0; i < last; i++) { + init_completion(&xrs[i].done); + xrs[i].qreq.fp_done = slash_qdma_xfer_done; + res = qdma_request_submit(qdma_dev->qdma_handle, xrs[i].qhndl, + &xrs[i].qreq); + if (res < 0) + xrs[i].err = (int)res; /* not queued: fp_done will not fire */ + else + xrs[i].async_inflight = true; + } + + /* Submit the last sub-transfer synchronously (blocks until complete). */ + res = qdma_request_submit(qdma_dev->qdma_handle, xrs[last].qhndl, + &xrs[last].qreq); + if (res < 0) + xrs[last].err = (int)res; + else + xrs[last].bytes_done = (unsigned int)res; + + /* Wait for the async sub-transfers, then aggregate (first error wins). */ + for (i = 0; i < last; i++) { + if (xrs[i].async_inflight) + wait_for_completion(&xrs[i].done); + } + + for (i = 0; i < count; i++) { + if (xrs[i].err && !first_err) + first_err = xrs[i].err; + total += xrs[i].bytes_done; + slash_qdma_xfer_finish(&xrs[i]); + } + + kfree(xrs); + + if (first_err) + return (long)first_err; + + return (long)total; +} + +#if defined(SLASH_HAVE_URING_CMD) +/** + * struct slash_qdma_uring_cmd_ctx - Async state for one uring_cmd transfer. + * @cmd: The io_uring command being served. + * @xrs: Per-sub-transfer requests (buffer refs held until completion). + * @count: Number of sub-transfers. + * @outstanding: Sub-transfers not yet completed; the one that drops it to 0 + * schedules the completion task-work. + * @total_bytes: Aggregate bytes transferred. + * @first_err: First negative errno seen, or 0. + * + * Heap-allocated for the lifetime of an asynchronous transfer; a pointer to it + * is stashed in cmd->pdu so the completion task-work can recover it. + */ +struct slash_qdma_uring_cmd_ctx { + struct io_uring_cmd *cmd; + struct slash_qdma_xfer_req xrs[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 count; + atomic_t outstanding; + atomic_long_t total_bytes; + atomic_t first_err; +}; + +/** + * slash_qdma_uring_cmd_complete() - Task-work that finishes a uring_cmd. + * @cmd: The io_uring command. 
+ * @issue_flags: io_uring issue flags for io_uring_cmd_done(). + * + * Runs in task context once all sub-transfers have completed: drops the + * buffer refs, completes the CQE with the total bytes (or first error), and + * frees the command context. + */ +static void slash_qdma_uring_cmd_complete(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct slash_qdma_uring_cmd_ctx *uc; + int err; + long ret; + u32 i; + + memcpy(&uc, cmd->pdu, sizeof(uc)); + err = atomic_read(&uc->first_err); + ret = err ? err : atomic_long_read(&uc->total_bytes); + + for (i = 0; i < uc->count; i++) + slash_qdma_xfer_finish(&uc->xrs[i]); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + kfree(uc); +} + +/** + * slash_qdma_uring_xfer_done() - fp_done for async uring_cmd sub-transfers. + * @qreq: Completed request (embedded in a slash_qdma_xfer_req). + * @bytes_done: Bytes transferred. + * @err: Negative errno on failure, else 0. + * + * Accumulates the result and, when the last sub-transfer of the command + * completes, schedules the completion task-work. Runs in libqdma worker + * context. + * + * Return: Always 0. + */ +static int slash_qdma_uring_xfer_done(struct qdma_request *qreq, + unsigned int bytes_done, int err) +{ + struct slash_qdma_xfer_req *xr = + container_of(qreq, struct slash_qdma_xfer_req, qreq); + struct slash_qdma_uring_cmd_ctx *uc = + (struct slash_qdma_uring_cmd_ctx *)qreq->uld_data; + + xr->bytes_done = bytes_done; + xr->err = err; + if (bytes_done) + atomic_long_add(bytes_done, &uc->total_bytes); + if (err) + atomic_cmpxchg(&uc->first_err, 0, err); + + if (atomic_dec_and_test(&uc->outstanding)) + io_uring_cmd_complete_in_task(uc->cmd, + slash_qdma_uring_cmd_complete); + return 0; +} + +/** + * slash_qdma_qpair_uring_cmd() - Asynchronous transfer batch via io_uring. + * @cmd: The io_uring command; its inline SQE data is a single __u64 + * userspace pointer to a struct slash_qdma_transfer. + * @issue_flags: io_uring issue flags. 
+ * + * The optional async sibling of SLASH_QDMA_QPAIR_IOCTL_TRANSFER: it prepares + * every sub-transfer, submits them all asynchronously (so they run on their + * distinct queue pairs concurrently), and completes the CQE from task-work + * once they all finish. Many such commands can be in flight at once, which is + * the intended multi-buffer optimization. + * + * Return: -EIOCBQUEUED once submission is under way (completion arrives via + * the CQE); a negative errno if the command is rejected before any + * sub-transfer is queued; -EAGAIN to defer a non-blocking issue. + */ +static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct file *file = cmd->file; + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + struct slash_qdma_dev *qdma_dev; + struct slash_qdma_uring_cmd_ctx *uc; + struct slash_qdma_transfer req; + u64 uptr = 0; + u32 count, i; + ssize_t res; + + if (cmd->cmd_op != SLASH_QDMA_URING_CMD_TRANSFER) + return -EOPNOTSUPP; + + if (!ctx) + return -EINVAL; + + qdma_dev = ctx->qdma_dev; + if (!qdma_dev || ctx->n_qpairs == 0) + return -ENODEV; + + /* + * Copying the descriptor from userspace may fault and sleep, so defer a + * non-blocking issue to a blocking io_uring context. + */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + /* The SQE inline command carries the user pointer to the descriptor. */ + memcpy(&uptr, slash_qdma_uring_cmd_payload(cmd), sizeof(uptr)); + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, u64_to_user_ptr(uptr), sizeof(req))) + return -EFAULT; + + count = req.count; + if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) + return -ENOMEM; + + uc->cmd = cmd; + uc->count = count; + atomic_set(&uc->outstanding, count); + atomic_long_set(&uc->total_bytes, 0); + atomic_set(&uc->first_err, 0); + + /* Validate and prepare every sub-transfer before queueing any of them. 
*/ + for (i = 0; i < count; i++) { + const struct slash_qdma_subxfer *d = &req.xfers[i]; + int rv; + + if (d->qpair_index >= ctx->n_qpairs) + rv = -EINVAL; + else + rv = slash_qdma_xfer_prep(qdma_dev, + ctx->entries[d->qpair_index], d, + &uc->xrs[i]); + if (rv) { + while (i-- > 0) + slash_qdma_buf_put(uc->xrs[i].buf); + kfree(uc); + return rv; + } + uc->xrs[i].qreq.uld_data = (unsigned long)uc; + uc->xrs[i].qreq.fp_done = slash_qdma_uring_xfer_done; + } + + /* Stash the context for the completion task-work. */ + memcpy(cmd->pdu, &uc, sizeof(uc)); + + /* + * Submit all sub-transfers asynchronously. Completion (success or the + * inline submit-failure path below) is funnelled through the outstanding + * counter so the CQE is posted exactly once from task-work. + */ + for (i = 0; i < count; i++) { + res = qdma_request_submit(qdma_dev->qdma_handle, uc->xrs[i].qhndl, + &uc->xrs[i].qreq); + if (res < 0) { + /* Not queued: fp_done will not fire, account for it here. */ + uc->xrs[i].err = (int)res; + atomic_cmpxchg(&uc->first_err, 0, (int)res); + if (atomic_dec_and_test(&uc->outstanding)) + io_uring_cmd_complete_in_task(uc->cmd, + slash_qdma_uring_cmd_complete); + } + } + + return -EIOCBQUEUED; +} +#endif /* SLASH_HAVE_URING_CMD */ + +/** * slash_qdma_qpair_ioctl() - Ioctl handler for per-qpair anon_inode fds. * @file: Anon_inode file. * @cmd: Ioctl command number. * @arg: User-space argument. * - * Currently a stub — no per-fd ioctls are defined. Returns -ENOTTY - * for all commands. + * Supports SLASH_QDMA_IOCTL_BUF_CREATE (allocate a kernel buffer for clients + * that hold only a queue-pair fd) and SLASH_QDMA_QPAIR_IOCTL_TRANSFER (buffer + * DMA transfer). * - * Return: -ENOTTY (no valid ioctl). + * Return: bytes transferred (>= 0) for TRANSFER, a new fd for BUF_CREATE, or + * -ENOTTY for any other command. 
*/ static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - (void)file; - (void)cmd; - (void)arg; + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; - return -ENOTTY; + if (!ctx || !ctx->qdma_dev) + return -ENODEV; + + switch (cmd) { + case SLASH_QDMA_IOCTL_BUF_CREATE: + return slash_qdma_ioctl_buf_create_w(&ctx->qdma_dev->misc, + ctx->qdma_dev, + (void __user *)arg); + case SLASH_QDMA_QPAIR_IOCTL_TRANSFER: + return slash_qdma_qpair_transfer(file, (void __user *)arg); + default: + return -ENOTTY; + } } /** @@ -2346,12 +3508,15 @@ static long slash_qdma_qpair_ioctl(struct file *file, static int slash_qdma_qpair_release(struct inode *inode, struct file *file) { struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + u32 i; (void)inode; if (ctx) { - if (ctx->entry) - slash_qdma_qpair_put(ctx->entry); + for (i = 0; i < ctx->n_qpairs; i++) { + if (ctx->entries[i]) + slash_qdma_qpair_put(ctx->entries[i]); + } if (ctx->qdma_dev) kref_put(&ctx->qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); @@ -2366,24 +3531,26 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file) * ───────────────────────────────────────────────────────────────────── */ /** - * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for a queue pair. + * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for queue I/O. * @misc: Miscdevice handle (unused). * @qdma_dev: QDMA device. * @uarg: User-space pointer to a slash_qdma_qpair_fd_request struct. * - * Creates an anonymous inode file descriptor that userspace can use - * for read() (C2H) and write() (H2C) DMA transfers on the specified - * queue pair. The fd holds references to both the qpair entry and the - * device, preventing either from being freed while the fd is open. + * Creates an anonymous inode file descriptor that userspace can use for + * buffer transfer ioctls. 
The fd is a collection of one or two queue pairs + * (see slash_qdma_qpair_fd_request): @qpair_count == 0 binds the single qpair + * named by @qid (back-compat), otherwise @qpair_count IDs from @qpair_ids are + * bound, their array index becoming the transfer qpair_index. * - * The only supported flag is O_CLOEXEC (close-on-exec). + * The fd holds references to each bound qpair entry and the device, preventing + * either from being freed while the fd is open. Each bound qpair keeps the + * per-qpair configuration (mm_channel, ring sizes, directions) it was given at + * add time, so the channels can differ. * - * The file is created with FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE - * enabled, allowing pread/pwrite and lseek to set the device-side - * address for DMA transfers. + * The only supported flag is O_CLOEXEC (close-on-exec). * - * Error handling: on any failure after resources are acquired, all - * refs and allocations are cleaned up before returning. + * Error handling: on any failure after resources are acquired, all refs and + * allocations are cleaned up before returning. * * Return: The new fd (>= 0) on success, negative errno on failure. */ @@ -2393,8 +3560,10 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, { struct slash_qdma_qpair_fd_request req; __u32 user_size = 0; + __u32 ids[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 n_qpairs; + u32 i; size_t copy_size; - struct slash_qdma_qpair_entry *entry; struct slash_qdma_qpair_file_ctx *ctx; struct file *file; int fd; @@ -2418,55 +3587,75 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, if (req.flags & ~O_CLOEXEC) return -EINVAL; - /* Look up the qpair entry and take refs while holding the lock. */ + /* + * Resolve the requested qpair-id set. qpair_count == 0 is the legacy + * single-qpair form using @qid; otherwise bind @qpair_count ids. 
+ */ + if (req.qpair_count == 0) { + n_qpairs = 1; + ids[0] = req.qid; + } else { + if (req.qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; + n_qpairs = req.qpair_count; + for (i = 0; i < n_qpairs; i++) + ids[i] = req.qpair_ids[i]; + } + + /* Allocate the per-fd context. */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* Look up each qpair entry and take refs while holding the lock. */ mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { mutex_unlock(&qdma_dev->lock); + kfree(ctx); return -ENODEV; } - entry = slash_qdma_qpair_lookup(qdma_dev, req.qid); - if (!entry || !entry->dir_mask) { - mutex_unlock(&qdma_dev->lock); - return -ENOENT; + for (i = 0; i < n_qpairs; i++) { + struct slash_qdma_qpair_entry *entry = + slash_qdma_qpair_lookup(qdma_dev, ids[i]); + + if (!entry || !entry->dir_mask) { + /* Drop refs taken so far for the earlier entries. */ + while (i-- > 0) + slash_qdma_qpair_put(ctx->entries[i]); + mutex_unlock(&qdma_dev->lock); + kfree(ctx); + return -ENOENT; + } + + /* + * Take a ref on the entry. These refs are held by the file context + * and released when the fd is closed, ensuring the entries cannot be + * freed prematurely. + */ + slash_qdma_qpair_get(entry); + ctx->entries[i] = entry; + ctx->qids[i] = ids[i]; } + ctx->n_qpairs = n_qpairs; - /* - * Take a ref on the entry and the device. These refs are held by - * the file context and released when the fd is closed, ensuring - * neither the entry nor the device can be freed prematurely. - */ - slash_qdma_qpair_get(entry); kref_get(&qdma_dev->ref); mutex_unlock(&qdma_dev->lock); - /* Allocate the per-fd context. */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - slash_qdma_qpair_put(entry); - kref_put(&qdma_dev->ref, slash_qdma_dev_release); - return -ENOMEM; - } - ctx->qdma_dev = qdma_dev; - ctx->entry = entry; - ctx->qid = req.qid; /* Create the anonymous inode file with read/write access. 
*/ file = anon_inode_getfile("slash_qdma_qpair", &slash_qdma_qpair_fops, ctx, O_RDWR | (req.flags & O_CLOEXEC)); if (IS_ERR(file)) { err = PTR_ERR(file); - slash_qdma_qpair_put(entry); + for (i = 0; i < ctx->n_qpairs; i++) + slash_qdma_qpair_put(ctx->entries[i]); kref_put(&qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); return err; } - /* Enable seek and positional read/write for device-address control. */ - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - - /* Allocate a file descriptor number. */ fd = get_unused_fd_flags(req.flags & O_CLOEXEC); if (fd < 0) { diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index 07784dc0..904b3e81 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -2,9 +2,10 @@ /* * QDMA control device (/dev/slash_qdma_ctl) ABI tests. * - * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO and the per-qpair - * anon-inode fd (read/write/lseek/pread/pwrite, multi-fd, wrong-direction, - * mmap-unsupported, HBM/DDR region round trips). See + * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO, the kernel-owned buffer fd + * (BUF_CREATE + mmap), and the per-qpair anon-inode transfer fd + * (TRANSFER ioctl, multi-fd, wrong-direction, read/write/lseek/mmap + * unsupported, HBM/DDR region round trips). See * docs/reference/kernel-abi/index.rst for the spec. */ @@ -35,6 +36,61 @@ static void fill_pattern(uint8_t *buf, size_t len) buf[i] = (uint8_t)(i & 0xff); } +/* + * Create a kernel-owned DMA buffer via BUF_CREATE on @ioctl_fd (control fd or + * queue-pair fd). Returns the new buffer fd (>= 0), or -errno on failure. 
+ */ +static int qdma_buf_create(int ioctl_fd, uint64_t length, uint32_t *granule, + uint32_t *transfer_hint) +{ + struct slash_qdma_buf_create req; + int fd; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.flags = O_CLOEXEC; + req.length = length; + + fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &req); + if (fd < 0) + return -errno; + + if (granule) + *granule = req.granule; + if (transfer_hint) + *transfer_hint = req.transfer_hint; + return fd; +} + +/* mmap a buffer fd for CPU access; returns the mapping or MAP_FAILED. */ +static void *qdma_buf_map(int buf_fd, uint64_t length) +{ + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0); +} + +/* + * Issue a single-sub-transfer buffer transfer on a qpair fd (qpair_index 0); + * returns the ioctl result (bytes transferred or -1 with errno set). + */ +static long qdma_buf_transfer(int io_fd, int buf_fd, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction) +{ + struct slash_qdma_transfer req; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.count = 1; + req.xfers[0].qpair_index = 0; + req.xfers[0].direction = direction; + req.xfers[0].buf_fd = buf_fd; + req.xfers[0].buf_offset = buf_offset; + req.xfers[0].dev_addr = dev_addr; + req.xfers[0].length = length; + + return ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); +} + /* ---------- fixture ---------- */ FIXTURE(qdma) @@ -126,30 +182,118 @@ TEST_F(qdma, qpair_lifecycle) TEST_F(qdma, write_read_verify) { - uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - ssize_t ret; + int write_fd, read_fd; + uint8_t *write_buf, *read_buf; + long ret; bring_up_qpair(_metadata, self, 0x3); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = 
qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)dma_addr); + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)dma_addr); + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); - free(write_buf); - free(read_buf); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); +} + +/* ---------- buffer fd behaviour ---------- */ + +TEST_F(qdma, buf_create_zero_length_returns_einval) +{ + EXPECT_EQ(-EINVAL, qdma_buf_create(self->ctl_fd, 0, NULL, NULL)); +} + +TEST_F(qdma, buf_create_unaligned_length_returns_einval) +{ + /* Length must be a multiple of the page size. */ + EXPECT_EQ(-EINVAL, + qdma_buf_create(self->ctl_fd, TRANSFER_SIZE + 1, NULL, NULL)); +} + +TEST_F(qdma, buf_create_reports_granule_and_hint) +{ + uint32_t granule = 0; + uint32_t hint = 0; + int buf_fd; + + buf_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, &granule, &hint); + ASSERT_GE(buf_fd, 0); + EXPECT_EQ(4096u, granule); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, hint); + close(buf_fd); +} + +TEST_F(qdma, buf_create_via_qpair_fd) +{ + int buf_fd; + uint8_t *map; + long ret; + uint64_t dma_addr = get_dma_addr(); + + bring_up_qpair(_metadata, self, 0x3); + + /* Buffers can be created through the queue-pair fd too (SCM_RIGHTS use). 
*/ + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + + map = qdma_buf_map(buf_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, map); + fill_pattern(map, TRANSFER_SIZE); + + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr, TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + ASSERT_EQ(TRANSFER_SIZE, ret); + + munmap(map, TRANSFER_SIZE); + close(buf_fd); +} + +TEST_F(qdma, buf_fd_mapping_outlives_fd_close) +{ + int buf_fd; + uint8_t *map; + uint64_t dma_addr = get_dma_addr(); + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + map = qdma_buf_map(buf_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, map); + + /* Closing the fd must not invalidate an existing mapping. */ + close(buf_fd); + + fill_pattern(map, TRANSFER_SIZE); + /* The mapping is still valid; the bytes are readable. */ + EXPECT_EQ(0u, map[0]); + (void)dma_addr; + (void)ret; + + munmap(map, TRANSFER_SIZE); } /* ---------- error paths ---------- */ @@ -271,62 +415,56 @@ TEST_F(qdma, qpair_get_fd_unknown_qid) TEST_F(qdma, io_read_on_h2c_only_returns_enodev) { - uint8_t *buf; - ssize_t ret; + int buf_fd; + long ret; bring_up_qpair(_metadata, self, 0x1); /* H2C only */ - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ret = pread(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); - free(buf); + close(buf_fd); } TEST_F(qdma, io_write_on_c2h_only_returns_enodev) { - uint8_t *buf; - ssize_t ret; + int buf_fd; + long ret; bring_up_qpair(_metadata, self, 0x2); /* C2H only */ - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + 
ASSERT_GE(buf_fd, 0); - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); - free(buf); + close(buf_fd); } -/* - * TODO: spec at docs/reference/kernel-abi/index.rst:417 documents zero-length - * transfers as returning -EINVAL, but the kernel's map_user_buf_to_sgl path - * (slash_qdma.c:2033-2034) explicitly patches around the len==0 case - * (`if (len == 0) pages_nr = 1;`), making the -EINVAL branch unreachable. - * The observed behaviour is ret == 0. Desired behaviour is under - * investigation — keep this test as-is so the discrepancy is visible. - */ TEST_F(qdma, io_zero_length_returns_einval) { - SKIP(return, "Test is disabled since the desired behavior is under investigation"); - uint8_t *buf; - ssize_t ret; + int buf_fd; + long ret; bring_up_qpair(_metadata, self, 0x3); - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ret = pwrite(self->io_fd, buf, 0, (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, + 0, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); - free(buf); + close(buf_fd); } TEST_F(qdma, io_mmap_unsupported) @@ -335,17 +473,17 @@ TEST_F(qdma, io_mmap_unsupported) bring_up_qpair(_metadata, self, 0x3); + /* The transfer (queue-pair) fd is not mappable — only buffer fds are. */ p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, self->io_fd, 0); EXPECT_EQ(MAP_FAILED, p); if (p != MAP_FAILED) munmap(p, 4096); } -TEST_F(qdma, io_ioctl_returns_enotty) +TEST_F(qdma, io_junk_ioctl_returns_enotty) { - /* The per-qpair anon_inode fd defines no ioctls; the handler - * returns -ENOTTY for any cmd. Exercising this path keeps the stub - * formally covered. 
*/ + /* The per-qpair fd defines only BUF_CREATE / TRANSFER; any other cmd + * returns -ENOTTY. */ unsigned int junk = _IO('v', 0xFE); bring_up_qpair(_metadata, self, 0x3); @@ -354,138 +492,99 @@ TEST_F(qdma, io_ioctl_returns_enotty) EXPECT_EQ(ENOTTY, errno); } -TEST_F(qdma, io_lseek_set_cur_end) +TEST_F(qdma, io_lseek_unsupported) { off_t pos; bring_up_qpair(_metadata, self, 0x3); pos = lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET); - EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos); - - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos); - - pos = lseek(self->io_fd, 4096, SEEK_CUR); - EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + 4096), pos); - - /* - * SEEK_END semantics are driver-defined for this anon-inode; the - * contract is "doesn't error", not any specific value. - */ - pos = lseek(self->io_fd, 0, SEEK_END); - EXPECT_NE((off_t)-1, pos); + EXPECT_EQ((off_t)-1, pos); + EXPECT_EQ(ESPIPE, errno); } -TEST_F(qdma, io_write_advances_file_position) +TEST_F(qdma, io_read_write_unsupported) { - uint8_t *buf; - off_t pos; - ssize_t ret; + uint8_t buf[TRANSFER_SIZE]; + long ret; bring_up_qpair(_metadata, self, 0x3); - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - fill_pattern(buf, TRANSFER_SIZE); - - ASSERT_EQ((off_t)SLASH_TEST_HBM_BASE, - lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET)); - ret = write(self->io_fd, buf, TRANSFER_SIZE); - ASSERT_EQ(TRANSFER_SIZE, ret); - - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + TRANSFER_SIZE), pos); - - free(buf); -} - -TEST_F(qdma, io_pwrite_does_not_advance_file_position) -{ - uint8_t *buf; - off_t pos; - ssize_t ret; - - bring_up_qpair(_metadata, self, 0x3); - - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - fill_pattern(buf, TRANSFER_SIZE); - - ASSERT_EQ((off_t)0, lseek(self->io_fd, 0, SEEK_SET)); - - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); - ASSERT_EQ(TRANSFER_SIZE, 
ret); - - /* p* variants must not advance the file position. */ - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)0, pos); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); - free(buf); + ret = read(self->io_fd, buf, TRANSFER_SIZE); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); } TEST_F(qdma, io_multiple_fds_same_qpair) { + int write_fd, read_fd, io_fd_b; uint8_t *write_buf, *read_buf; - int io_fd_b; - ssize_t ret; + long ret; bring_up_qpair(_metadata, self, 0x3); io_fd_b = slash_qpair_get_fd(self->ctl_fd, self->qid, O_CLOEXEC); ASSERT_GE(io_fd_b, 0); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = pread(io_fd_b, read_buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(io_fd_b, read_fd, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); close(io_fd_b); - free(write_buf); - free(read_buf); } TEST_F(qdma, io_fd_outlives_qpair_del) { - uint8_t *buf; - ssize_t ret; + int buf_fd; + long ret; bring_up_qpair(_metadata, self, 0x3); + buf_fd = 
qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + /* DEL the qpair while io_fd is still open. */ ASSERT_EQ(0, slash_qpair_op(self->ctl_fd, self->qid, SLASH_QDMA_QUEUE_OP_DEL)); self->qpair_added = 0; self->qpair_started = 0; - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - /* - * fd is still valid but the qpair's HW queues are gone. The spec - * (index.rst:613-616) does not name a specific errno, so we only - * assert the call fails — not which errno it returns. + * fd is still valid but the qpair's HW queues are gone. The spec does + * not name a specific errno, so we only assert the call fails. */ - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); - free(buf); + close(buf_fd); /* close(io_fd) happens in fixture teardown — must not crash. */ } @@ -494,31 +593,41 @@ TEST_F(qdma, io_fd_outlives_qpair_del) static void region_round_trip(struct __test_metadata *_metadata, FIXTURE_DATA(qdma) * self, uint64_t base) { + int write_fd, read_fd; uint8_t *write_buf, *read_buf; - ssize_t ret; + long ret; bring_up_qpair(_metadata, self, 0x3); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)base); + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, base, + 
TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret) - TH_LOG("pwrite to 0x%llx failed: %s", + TH_LOG("H2C transfer to 0x%llx failed: %s", (unsigned long long)base, strerror(errno)); - ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)base); + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, base, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); - free(write_buf); - free(read_buf); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); } TEST_F(qdma, transfer_hbm) @@ -707,4 +816,146 @@ TEST_F(qdma, qpair_get_fd_oversized_struct_zeros_tail) free(buf); } +TEST_F(qdma, reject_partial_4k_transfer) +{ + int buf_fd; + uint64_t dma_addr = get_dma_addr(); + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + + /* A sub-page length is not a multiple of the buffer granule. 
*/ + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr, + TRANSFER_SIZE / 2, SLASH_QDMA_XFER_H2C); + ASSERT_EQ(-1, ret); + ASSERT_EQ(EINVAL, errno); + + close(buf_fd); +} + +TEST_F(qdma, multipage_4k_write_read_verify) +{ + const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages, one request */ + int write_fd, read_fd; + uint8_t *write_buf, *read_buf; + uint64_t dma_addr = get_dma_addr(); + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + write_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, xfer_size); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, xfer_size); + ASSERT_NE(MAP_FAILED, read_buf); + + fill_pattern(write_buf, xfer_size); + memset(read_buf, 0, xfer_size); + + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr, xfer_size, + SLASH_QDMA_XFER_H2C); + ASSERT_EQ((ssize_t)xfer_size, ret); + + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr, xfer_size, + SLASH_QDMA_XFER_C2H); + ASSERT_EQ((ssize_t)xfer_size, ret); + + EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); + + munmap(write_buf, xfer_size); + munmap(read_buf, xfer_size); + close(write_fd); + close(read_fd); +} + +/* ---------- transfer error paths ---------- */ + +TEST_F(qdma, transfer_size_below_input_min_returns_einval) +{ + struct slash_qdma_transfer req; + + bring_up_qpair(_metadata, self, 0x3); + + memset(&req, 0, sizeof(req)); + req.size = sizeof(__u32); /* below the trailing input field */ + EXPECT_EQ(-1, ioctl(self->io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req)); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(qdma, transfer_invalid_buf_fd_returns_einval) +{ + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + /* The control fd is a valid fd but not a buffer fd. 
*/ + ret = qdma_buf_transfer(self->io_fd, self->ctl_fd, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(qdma, transfer_bad_fd_returns_ebadf) +{ + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + ret = qdma_buf_transfer(self->io_fd, -1, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EBADF, errno); +} + +TEST_F(qdma, transfer_wrong_direction_returns_enodev) +{ + int buf_fd; + uint32_t transfer_hint = 0; + long ret; + + bring_up_qpair(_metadata, self, 0x1); /* H2C only */ + + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, &transfer_hint); + ASSERT_GE(buf_fd, 0); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint); + + /* C2H is not enabled on this qpair. */ + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_C2H); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ENODEV, errno); + + close(buf_fd); +} + +TEST_F(qdma, transfer_out_of_range_returns_einval) +{ + int buf_fd; + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + + /* Slice extends past the buffer length. 
*/ + ret = qdma_buf_transfer(self->io_fd, buf_fd, TRANSFER_SIZE, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + close(buf_fd); +} + TEST_HARNESS_MAIN diff --git a/packaging/debian/control b/packaging/debian/control index ce02f2dd..09534d96 100644 --- a/packaging/debian/control +++ b/packaging/debian/control @@ -48,7 +48,7 @@ Description: SLASH/VRT System for simulation and emulation Package: slash-dkms Architecture: all -Depends: dkms, gcc, make, ${misc:Depends} +Depends: dkms, gcc, make, patch, ${misc:Depends} Provides: slash-kernel-module Description: SLASH kernel module (DKMS) @@ -89,7 +89,7 @@ Description: VRT Runtime (development files) Package: v80-smi Architecture: any -Depends: libvrt (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends} +Depends: libvrt (= ${binary:Version}), libslash (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends} Description: V80 System Management Interface Package: slashkit diff --git a/packaging/debian/slash-dkms.install b/packaging/debian/slash-dkms.install index d0496a33..c377f69f 100644 --- a/packaging/debian/slash-dkms.install +++ b/packaging/debian/slash-dkms.install @@ -22,6 +22,7 @@ driver/*.c usr/src/slash-@VERSION@/driver/ driver/*.h usr/src/slash-@VERSION@/driver/ driver/Makefile usr/src/slash-@VERSION@/driver/ driver/kcompat usr/src/slash-@VERSION@/driver/ +driver/patches usr/src/slash-@VERSION@/driver/ driver/libslash/include/slash/uapi usr/src/slash-@VERSION@/driver/libslash/include/slash/ submodules/qdma_drv/QDMA/linux-kernel/driver/libqdma/ usr/src/slash-@VERSION@/driver/ diff --git a/packaging/rpm/slash.spec b/packaging/rpm/slash.spec index a18ccd59..3568a859 100644 --- a/packaging/rpm/slash.spec +++ b/packaging/rpm/slash.spec @@ -90,7 +90,7 @@ SLASH/VRT System for simulation and emulation (development files) %package -n slash-dkms Summary: SLASH kernel module (DKMS) -Requires: dkms, gcc, make +Requires: dkms, gcc, make, patch BuildArch: 
noarch %description -n slash-dkms @@ -157,6 +157,7 @@ VRT Runtime (development files) %package -n v80-smi Summary: V80 System Management Interface Requires: libvrt = %{version}-%{release} +Requires: libslash = %{version}-%{release} %description -n v80-smi V80 System Management Interface @@ -211,6 +212,8 @@ install -m 0644 driver/Makefile %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_versi cp -a driver/kcompat %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/ +cp -a driver/patches %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/ + cp -a driver/libslash/include/slash/uapi \ %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/libslash/include/slash/ diff --git a/scripts/package-ami.sh b/scripts/package-ami.sh index f12cc6f6..66ff2833 100755 --- a/scripts/package-ami.sh +++ b/scripts/package-ami.sh @@ -33,14 +33,23 @@ ARTIFACTS_DIR="${ARTIFACTS_DIR:-$(pwd)/ami}" AMI_BUILD_DIR="$(pwd)/ami-build" AVED_DIR="$(pwd)/submodules/AVED" AMI_DIR="${AVED_DIR}/sw/AMI" -PKG_PY="${AMI_DIR}/scripts/package_data/pkg.py" -GEN_PKG_PY="${AMI_DIR}/scripts/gen_package.py" +AMI_SRC_DIR="${AMI_BUILD_DIR}/src/AMI" +AMI_OUTPUT_DIR="${AMI_BUILD_DIR}/pkg" +PKG_PY="${AMI_SRC_DIR}/scripts/package_data/pkg.py" +GEN_PKG_PY="${AMI_SRC_DIR}/scripts/gen_package.py" rm -rf "${AMI_BUILD_DIR}" mkdir -p "${ARTIFACTS_DIR}" +mkdir -p "$(dirname "${AMI_SRC_DIR}")" +cp -a "${AMI_DIR}" "${AMI_SRC_DIR}" -# Restore submodule files and clean up build directory on exit -trap 'git -C "${AVED_DIR}" checkout -- sw/AMI/scripts/package_data/pkg.py sw/AMI/scripts/gen_package.py; rm -rf "${AMI_BUILD_DIR}"' EXIT +# Clean up build directory on exit. Packaging patches a disposable AMI copy so +# this also works from source trees copied without usable submodule gitdirs. +trap 'rm -rf "${AMI_BUILD_DIR}"' EXIT + +# Avoid stale generated headers from copied build trees. gen_package.py will +# otherwise prefer api/build/ami_version.h over the checked-in version header. 
+rm -f "${AMI_SRC_DIR}/api/build/ami_version.h" # Patch in Rocky Linux support (RHEL-compatible, RPM-based) sed -i "/^DIST_ID_RHEL /a DIST_ID_ROCKY = 'rocky'" "${PKG_PY}" @@ -48,13 +57,17 @@ sed -i "/^ DIST_ID_RHEL,$/a\\ DIST_ID_ROCKY," "${PKG_PY}" sed -i "s/DIST_RPM = \[DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL\]/DIST_RPM = [DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${PKG_PY}" sed -i "s/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL\]/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${GEN_PKG_PY}" -cd "${AMI_DIR}" +cd "${AMI_SRC_DIR}" # --no_driver skips a pre-flight driver compilation check (build+clean) only; # it does NOT affect which files are included in the package. # We skip it here so the packaging can run in environments (eg. containers) # that may not have linux-headers available to compile the driver. -python3 scripts/gen_package.py --no_driver -o "${AMI_BUILD_DIR}" +# +# --no_gen_version skips AVED's git-based version regeneration. This wrapper is +# often run from copied worktrees where the submodule .git file points back to a +# non-existent source checkout, causing an empty hash and an invalid RPM Release. 
+python3 scripts/gen_package.py --no_driver --no_gen_version -o "${AMI_OUTPUT_DIR}" # Copy only the package files to the artifacts directory -cp "${AMI_BUILD_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \ -cp "${AMI_BUILD_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true +cp "${AMI_OUTPUT_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \ +cp "${AMI_OUTPUT_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true diff --git a/scripts/test-fresh-install.sh b/scripts/test-fresh-install.sh index 247d6b2b..cd20e363 100755 --- a/scripts/test-fresh-install.sh +++ b/scripts/test-fresh-install.sh @@ -197,7 +197,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then if [[ ${#INSTALLED[@]} -gt 0 ]]; then echo "Removing: ${INSTALLED[*]}" - dnf remove -y "${INSTALLED[@]}" + dnf remove -y --setopt='*.skip_if_unavailable=True' "${INSTALLED[@]}" else echo "No SLASH packages currently installed." fi @@ -226,7 +226,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then # Exclude source, debuginfo, and debugsource RPMs mapfile -t RPMS < <(find "${ARTIFACTS_DIR}" -maxdepth 1 -name '*.rpm' \ ! -name '*.src.rpm' ! -name '*-debuginfo-*' ! 
-name '*-debugsource-*') - dnf install -y "${RPMS[@]}" + dnf install -y --setopt='*.skip_if_unavailable=True' "${RPMS[@]}" fi # ========================================================================= diff --git a/smi/CMakeLists.txt b/smi/CMakeLists.txt index 58cf9771..46ae8c16 100644 --- a/smi/CMakeLists.txt +++ b/smi/CMakeLists.txt @@ -40,6 +40,7 @@ project( ) option(SMI_INCLUDE_VRT "Include vrtd as subdirectory instead of building from system" OFF) +option(SMI_ENABLE_QDMA_DRIVER_BACKEND "Build validate --use-qdma-driver backend" ON) include(GNUInstallDirs) @@ -55,6 +56,10 @@ if(NOT TARGET vrt::vrt) "Build and install vrt first (cmake --install), then configure smi again.") endif() +if(NOT TARGET slash::slash) + find_package(slash REQUIRED CONFIG) +endif() + find_package(CLI11 CONFIG REQUIRED) configure_file( diff --git a/smi/README.md b/smi/README.md index d528ed92..a7fef4fe 100644 --- a/smi/README.md +++ b/smi/README.md @@ -178,43 +178,119 @@ programmed with the static SLASH design. ### validate -Reset a board, then test HBM and DDR memory for data integrity and -bandwidth. +Optionally reset a board, then test HBM and DDR memory for data integrity and +bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer +path for data movement. ``` -v80-smi validate -d [-j ] +v80-smi validate -d <bdf> [-j <threads>] [-R] [--mm-channel <spec>] [--buffer-size <size>] [--offset <size>] [--starting-offset <size>] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation <mode>] [--channel-region-stride <size>] [--ring-size-index <0-15>] [--bandwidth-iterations <count>] [--bandwidth-duration <seconds>] ``` | Flag | Description | |-------------------|------------------------------------------------------| | `-d,--device` | Board address (required), e.g. `03:00` or `0000:03:00` | -| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8) | - -Each buffer is 64 MB. The integrity test writes a pattern, syncs to -device, clears host memory, syncs back, and verifies.
The bandwidth -test runs parallel H2C writes and C2H reads. +| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 * threads` logical positions in each enabled memory space. | +| `-R,--no-reset` | Skip the device reset step before running memory tests | +| `--mm-channel` | AXI-MM/NoC channel per buffer queue: `auto` (default; driver stripes by `qid&1`), `0`, `1`, or a comma-separated list with exactly one entry per buffer position (`2 x --threads` entries, e.g. `-j 1` -> `0,1`). The list is never repeated to fill missing positions; a list of the wrong length is an error. Independent of `--channel-allocation`; also honored by `--use-qdma-driver`. | +| `--buffer-size` | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) | +| `--offset` | Distance between logical buffer positions (default `512M`) | +| `--starting-offset` | Offset from each memory-space base for logical position 0 (default `0`) | +| `--raw-transfer-test` | Use libslash raw QDMA transfers instead of VRTD buffers; implies `--no-reset` | +| `--use-qdma-driver` | Run the raw transfer test over the off-the-shelf Xilinx QDMA driver instead of SLASH; implies `--no-reset`; mutually exclusive with `--raw-transfer-test` | +| `--ddr-only` | Run only DDR memory tests (skip HBM); mutually exclusive with `--hbm-only` | +| `--hbm-only` | Run only HBM memory tests (skip DDR); mutually exclusive with `--ddr-only` | +| `--channel-allocation` | Raw-transfer-only placement: `auto` (default; mm-channel `qid&1`, linear addressing) or `paired` (couple mm-channel to a distinct memory region/NSU: even positions -> region 0/channel 0, odd -> region 1/channel 1). `paired` mirrors dma-perf `offset_ch0`/`offset_ch1` so both NoC NMUs drive independent memory endpoints. | +| `--channel-region-stride` | In `--channel-allocation paired`, byte distance between the two per-channel regions (NSU stride). Default `16G` (half the per-memory space); accepts `k`/`K`/`m`/`M`/`g`/`G`.
| +| `--ring-size-index` | Raw-transfer-only descriptor-ring size index, `0`-`15`. Overrides the backend default when creating SLASH raw qpairs or starting stock-driver queues. | +| `--bandwidth-iterations` | Raw-transfer-only sustained bandwidth mode: repeat each whole-buffer transfer this many times in each bandwidth phase (default `1`). | +| `--bandwidth-duration` | Raw-transfer-only duration mode: repeat whole-buffer transfers until this many seconds have elapsed; `0` disables duration mode and uses `--bandwidth-iterations`. | + +Each buffer defaults to 512 MB (one HBM/DDR allocator region). The integrity test +writes a pattern, syncs to device, clears host memory, syncs back, and +verifies. Each bandwidth +phase reports single-direction C2H reads, single-direction H2C writes, +and simultaneous bidirectional throughput (read, write, and total). After +the per-memory phases, a final parallel phase drives HBM and DDR together +using `2 x <threads>` buffers for single-direction tests and `4 x <threads>` +threads for bidirectional tests; it is skipped when `--ddr-only` or +`--hbm-only` is given. With `--raw-transfer-test`, the command bypasses +VRTD for transfers and opens the board's SLASH QDMA device directly, so +the SLASH QDMA driver node must be present. + +Buffers are placed at `memory_base + starting-offset + position * offset`. +The position sequence is `0..N-1` for single-direction phases and `0..2N-1` +for bidirectional phases (reads on even positions, writes on odd positions). +`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned, +`--offset` must be at least +`--buffer-size`, and the highest buffer must fit within the 64 x 512 MB DDR/HBM +address space. If any placement option is +specified in default VRTD mode, `validate` uses raw VRTD buffers so the exact +addresses are honored; this requires raw memory access permission.
+ +The largest phase maps up to `4 x <threads> x <buffer-size>` of host buffers +when HBM and DDR are both enabled, or `2 x <threads> x <buffer-size>` with +`--ddr-only` or `--hbm-only`; `validate` fails early if that footprint exceeds +currently available host memory. + +Raw transfer modes can repeat the bandwidth phases without changing buffer +placement. `--bandwidth-iterations` repeats each whole-buffer +transfer a fixed number of times, while `--bandwidth-duration` runs each +bandwidth phase for a wall-clock duration and counts completed whole-buffer +transfers. Integrity checks remain one-shot. +`--ring-size-index` can override the QDMA descriptor-ring size index for these +raw modes; useful A/B values for 4 KiB descriptor throughput are `0`, `11`, +`13`, and `15`. + +With `--use-qdma-driver`, the command runs the same raw test over the +off-the-shelf Xilinx QDMA driver (`submodules/qdma_drv`) instead of SLASH. +smi provisions the queues itself: it raises the function's `qmax` via sysfs +if needed, creates and starts bidirectional AXI-MM queue pairs over generic +netlink (the same `xnl_pf` interface `dma-ctl` uses), then transfers over the +per-queue char devices `/dev/qdma<bdf>-MM-<qid>`. This requires the stock +`qdma-pf` driver to be bound to the board's PF (it cannot be bound at the same +time as the SLASH driver), and typically needs root to raise `qmax` and open +the queue devices. The device memory addresses tested (HBM/DDR) are the same +AXI addresses used by the SLASH path. + +Requirements depend on the selected mode: the default path needs VRTD and root +for reset unless `--no-reset` is used; `--raw-transfer-test` needs the SLASH +QDMA driver node; `--use-qdma-driver` needs a build with +`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON` and the stock QDMA driver bound to the +board. ```console $ v80-smi validate -d 03:00 Resetting device 0000:03:00... Testing HBM data integrity (8 regions)... - HBM0: OK - HBM1: OK - ... -Testing HBM bandwidth (8 threads)... + 8/8 OK +Testing HBM read bandwidth (8 threads)...
+ Read: 9547.22 MB/s +Testing HBM write bandwidth (8 threads)... Write: 9832.10 MB/s - Read: 9547.22 MB/s +Testing HBM bidirectional bandwidth (16 threads)... + Read: 9210.15 MB/s + Write: 9475.81 MB/s + Total: 18685.96 MB/s Testing DDR data integrity (8 buffers)... - DDR0: OK - DDR1: OK - ... -Testing DDR bandwidth (8 threads)... + 8/8 OK +Testing DDR read bandwidth (8 threads)... + Read: 4980.33 MB/s +Testing DDR write bandwidth (8 threads)... Write: 5120.45 MB/s - Read: 4980.33 MB/s +Testing DDR bidirectional bandwidth (16 threads)... + Read: 4860.12 MB/s + Write: 5012.34 MB/s + Total: 9872.46 MB/s +Testing HBM+DDR read bandwidth (16 threads)... + Read: 11890.55 MB/s +Testing HBM+DDR write bandwidth (16 threads)... + Write: 12450.78 MB/s +Testing HBM+DDR bidirectional bandwidth (32 threads)... + Read: 11340.12 MB/s + Write: 12020.34 MB/s + Total: 23360.46 MB/s ``` -Requires root access and a running VRTD daemon. - ### debug bar-poke Perform low-level BAR reads or writes for troubleshooting. @@ -364,6 +440,8 @@ since v80-smi always operates at board granularity. 
|------------|--------------------------------------------------| | libvrt | VRT runtime library (device, kernel, vrtbin APIs) | | vrtd | Runtime daemon (sensors, reset, validate, query) | +| libslash | Raw SLASH QDMA backend for `validate --raw-transfer-test` | +| qdma_nl.h | Optional stock QDMA-driver backend (`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON`) | ## Project layout @@ -376,6 +454,8 @@ smi/ program.cpp/hpp Device programming reset.cpp/hpp Hardware reset via VRTD validate.cpp/hpp Memory integrity and bandwidth testing + raw_transfer.hpp Shared raw QDMA host mapping and transfer helpers + qdma_driver_backend.cpp/hpp Optional stock QDMA-driver validate backend debug/bar_poke.cpp/hpp BAR read/write debug command debug/mem_poke.cpp/hpp Raw device memory read/write command debug/clockwiz.cpp/hpp Clock read/set debug command diff --git a/smi/src/CMakeLists.txt b/smi/src/CMakeLists.txt index 30e509aa..ad0c721a 100644 --- a/smi/src/CMakeLists.txt +++ b/smi/src/CMakeLists.txt @@ -32,6 +32,21 @@ add_executable( smi.cpp ) +if(SMI_ENABLE_QDMA_DRIVER_BACKEND) + target_sources(v80-smi PRIVATE qdma_driver_backend.cpp) + + # Off-the-shelf Xilinx QDMA driver netlink UAPI header (qdma_nl.h), used by + # the --use-qdma-driver validate backend. + set(QDMA_DRV_APPS_INCLUDE + "${CMAKE_CURRENT_SOURCE_DIR}/../../submodules/qdma_drv/QDMA/linux-kernel/apps/include") + if(NOT EXISTS "${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h") + message(FATAL_ERROR + "Missing ${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h. 
" + "Initialize submodules (git submodule update --init submodules/qdma_drv), " + "or configure with -DSMI_ENABLE_QDMA_DRIVER_BACKEND=OFF.") + endif() +endif() + target_compile_features(v80-smi PRIVATE cxx_std_20) target_include_directories( @@ -43,11 +58,17 @@ target_include_directories( ${CMAKE_CURRENT_BINARY_DIR}/../generated # For version.hpp ) +if(SMI_ENABLE_QDMA_DRIVER_BACKEND) + target_include_directories(v80-smi PRIVATE ${QDMA_DRV_APPS_INCLUDE}) + target_compile_definitions(v80-smi PRIVATE SMI_ENABLE_QDMA_DRIVER_BACKEND=1) +endif() + target_link_libraries( v80-smi PRIVATE vrt::vrt + slash::slash CLI11::CLI11 ) diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp new file mode 100644 index 00000000..da47d6aa --- /dev/null +++ b/smi/src/qdma_driver_backend.cpp @@ -0,0 +1,550 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/// @file qdma_driver_backend.cpp +/// @brief Implementation of the off-the-shelf QDMA-driver raw-transfer backend. + +#include "qdma_driver_backend.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// qdma_nl.h defines unused file-scope static lookup arrays (xnl_attr_str / +// xnl_op_str); silence the resulting -Wunused warnings without touching the +// vendored upstream header. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-const-variable" +extern "C" { +#include +} +#pragma GCC diagnostic pop + +#include "bdf.hpp" + +namespace smi::qdma_driver { + +namespace { + +/// Generous receive buffer: the device list dump grows with the number of +/// queues/functions, so keep this comfortably larger than XNL_RESP_BUFLEN_MAX. +constexpr size_t RESP_BUF_LEN = 256 * 1024; + +[[noreturn]] void throwSystemError(const std::string& message) { + throw std::runtime_error(message + ": " + std::strerror(errno)); +} + +} // namespace + +/// Minimal generic-netlink client for the QDMA driver's "xnl_pf" family. +/// +/// This is a focused port of the netlink plumbing in the upstream `dma-ctl` +/// utility (QDMA/linux-kernel/apps/dma-utils/dmactl.c): resolve the family id, +/// send a command carrying a handful of u32 attributes, and parse the reply's +/// attributes / generic message text. 
+class XnlClient { +public: + XnlClient() { + fd_ = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd_ < 0) { + throwSystemError("Failed to open QDMA netlink socket"); + } + + struct sockaddr_nl addr{}; + addr.nl_family = AF_NETLINK; + if (bind(fd_, reinterpret_cast(&addr), sizeof(addr)) < 0) { + const int err = errno; + close(fd_); + fd_ = -1; + errno = err; + throwSystemError("Failed to bind QDMA netlink socket"); + } + + // Don't block forever if the driver isn't present / doesn't answer. + struct timeval tv{}; + tv.tv_sec = 5; + tv.tv_usec = 0; + (void)setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + + family_ = resolveFamily(XNL_NAME_PF); + } + + ~XnlClient() { + if (fd_ >= 0) { + close(fd_); + } + } + + XnlClient(const XnlClient&) = delete; + XnlClient& operator=(const XnlClient&) = delete; + + /// Parsed netlink response: scalar attributes plus any generic message text. + struct Response { + std::array attrs{}; + std::array present{}; + std::string genmsg; + }; + + /// Send command @p op for device index @p devIndex with the given u32 + /// attributes (DEV_IDX and a response-buffer-length hint are added + /// automatically) and return the parsed response. + Response sendCmd(uint8_t op, uint32_t devIndex, + const std::vector>& attrs) { + std::vector buf(RESP_BUF_LEN, 0); + auto* n = reinterpret_cast(buf.data()); + + n->nlmsg_type = family_; + n->nlmsg_flags = NLM_F_REQUEST; + n->nlmsg_pid = getpid(); + n->nlmsg_seq = seq_++; + n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + + auto* g = reinterpret_cast(NLMSG_DATA(n)); + g->cmd = op; + g->version = XNL_VERSION; + + addIntAttr(n, XNL_ATTR_DEV_IDX, devIndex); + for (const auto& [type, val] : attrs) { + addIntAttr(n, type, val); + } + // Tell the kernel how large a response we can accept. 
+ addIntAttr(n, XNL_ATTR_RSP_BUF_LEN, static_cast(buf.size())); + + sendMsg(n); + return recvMsg(buf); + } + +private: + static uint16_t alignedAttrLen(uint16_t payload) { + return static_cast(NLA_HDRLEN + payload); + } + + static void addIntAttr(struct nlmsghdr* n, uint16_t type, uint32_t value) { + auto* attr = reinterpret_cast(reinterpret_cast(n) + n->nlmsg_len); + attr->nla_type = type; + attr->nla_len = alignedAttrLen(sizeof(uint32_t)); + std::memcpy(reinterpret_cast(attr) + NLA_HDRLEN, &value, sizeof(value)); + n->nlmsg_len += NLMSG_ALIGN(attr->nla_len); + } + + static void addStrAttr(struct nlmsghdr* n, uint16_t type, const char* s) { + auto* attr = reinterpret_cast(reinterpret_cast(n) + n->nlmsg_len); + const size_t len = std::strlen(s) + 1; + attr->nla_type = type; + attr->nla_len = alignedAttrLen(static_cast(len)); + std::memcpy(reinterpret_cast(attr) + NLA_HDRLEN, s, len); + n->nlmsg_len += NLMSG_ALIGN(attr->nla_len); + } + + void sendMsg(struct nlmsghdr* n) { + struct sockaddr_nl addr{}; + addr.nl_family = AF_NETLINK; + ssize_t rv = sendto(fd_, n, n->nlmsg_len, 0, + reinterpret_cast(&addr), sizeof(addr)); + if (rv < 0 || static_cast(rv) != n->nlmsg_len) { + throwSystemError("QDMA netlink send failed"); + } + } + + Response recvMsg(std::vector& buf) { + std::memset(buf.data(), 0, buf.size()); + ssize_t rv = recv(fd_, buf.data(), buf.size(), 0); + if (rv < 0) { + throwSystemError("QDMA netlink receive failed"); + } + + auto* n = reinterpret_cast(buf.data()); + if (n->nlmsg_type == NLMSG_ERROR) { + int err = 0; + if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + auto* nlerr = reinterpret_cast(NLMSG_DATA(n)); + err = nlerr->error; + } + throw std::runtime_error("QDMA netlink returned an error response (" + + std::to_string(err) + ")"); + } + + Response resp; + auto* p = reinterpret_cast(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN); + int maxlen = static_cast(n->nlmsg_len) - static_cast(NLMSG_LENGTH(GENL_HDRLEN)); + while (maxlen > 0) { + auto* na = 
reinterpret_cast(p); + if (na->nla_len < NLA_HDRLEN) { + break; + } + const int len = NLA_ALIGN(na->nla_len); + const char* payload = reinterpret_cast(na) + NLA_HDRLEN; + + if (na->nla_type == XNL_ATTR_GENMSG) { + resp.genmsg.assign(payload); + } else if (na->nla_type < XNL_ATTR_MAX) { + uint32_t v = 0; + std::memcpy(&v, payload, sizeof(v)); + resp.attrs[na->nla_type] = v; + resp.present[na->nla_type] = true; + } + + p += len; + maxlen -= len; + } + return resp; + } + + uint16_t resolveFamily(const char* name) { + std::vector buf(RESP_BUF_LEN, 0); + auto* n = reinterpret_cast(buf.data()); + + n->nlmsg_type = GENL_ID_CTRL; + n->nlmsg_flags = NLM_F_REQUEST; + n->nlmsg_pid = getpid(); + n->nlmsg_seq = seq_++; + n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + + auto* g = reinterpret_cast(NLMSG_DATA(n)); + g->cmd = CTRL_CMD_GETFAMILY; + g->version = XNL_VERSION; + + addStrAttr(n, CTRL_ATTR_FAMILY_NAME, name); + sendMsg(n); + + std::memset(buf.data(), 0, buf.size()); + ssize_t rv = recv(fd_, buf.data(), buf.size(), 0); + if (rv < 0) { + throwSystemError(std::string("Failed to resolve QDMA netlink family '") + name + + "' (is the upstream qdma driver loaded?)"); + } + if (n->nlmsg_type == NLMSG_ERROR) { + int err = 0; + if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + auto* nlerr = reinterpret_cast(NLMSG_DATA(n)); + err = nlerr->error; + } + throw std::runtime_error(std::string("QDMA netlink family '") + name + + "' not found (netlink error " + std::to_string(err) + + "; is the upstream qdma driver loaded?)"); + } + + auto* p = reinterpret_cast(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN); + int maxlen = static_cast(n->nlmsg_len) - static_cast(NLMSG_LENGTH(GENL_HDRLEN)); + while (maxlen > 0) { + auto* na = reinterpret_cast(p); + if (na->nla_len < NLA_HDRLEN) { + break; + } + if (na->nla_type == CTRL_ATTR_FAMILY_ID) { + uint16_t id = 0; + std::memcpy(&id, reinterpret_cast(na) + NLA_HDRLEN, sizeof(id)); + return id; + } + const int len = NLA_ALIGN(na->nla_len); + p += 
len; + maxlen -= len; + } + throw std::runtime_error(std::string("QDMA netlink family '") + name + + "' id not present in response"); + } + + int fd_ = -1; + uint16_t family_ = 0; + uint32_t seq_ = 0; +}; + +namespace { + +/// Queue flags for a bidirectional AXI-MM queue pair. +constexpr uint32_t QFLAG_MM_BI = XNL_F_QMODE_MM | XNL_F_QDIR_BOTH; + +/// Queue flags for `q start`. In addition to mode/direction, this must enable +/// the descriptor-ring writeback/completion-status reporting and fetch credit, +/// exactly as `dma-ctl q start` does by default (see +/// QDMA/linux-kernel/apps/dma-ctl/cmd_parse.c). Without the writeback bits the +/// poll-mode driver never observes MM completion and every transfer times out. +constexpr uint32_t QFLAG_MM_BI_START = + QFLAG_MM_BI | + XNL_F_CMPL_STATUS_EN | XNL_F_CMPL_STATUS_ACC_EN | + XNL_F_CMPL_STATUS_PEND_CHK | XNL_F_CMPL_STATUS_DESC_EN | + XNL_F_FETCH_CREDIT; + +/// Default descriptor-ring size index for `q start`, matching `dma-ctl`'s +/// default ("ring size set to 2048"). +constexpr uint32_t QRNGSZ_IDX_DEFAULT = 9; + +} // namespace + +QdmaDriverDevice::QdmaDriverDevice(const std::string& boardBdf, + std::optional ringSizeIndex) + : nl_(std::make_unique()), + ringSizeIndex_(ringSizeIndex.value_or(QRNGSZ_IDX_DEFAULT)) { + const ParsedBdf board = parseBdf(boardBdf); + + // Enumerate the driver's devices and find the QDMA function on this board. + // Each PF line looks like: "qdma61001\t0000:61:00.1\tmax QP: 512, 0~511". + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {}); + if (resp.genmsg.empty()) { + throw std::runtime_error( + "Upstream QDMA driver reported no devices (dev list empty). 
" + "Ensure the stock qdma driver is bound to the board."); + } + + bool found = false; + std::istringstream lines(resp.genmsg); + std::string line; + while (std::getline(lines, line)) { + std::istringstream tokens(line); + std::string name; + std::string bdfStr; + if (!(tokens >> name >> bdfStr)) { + continue; + } + if (name.rfind("qdma", 0) != 0 || name.rfind("qdmavf", 0) == 0) { + continue; // not a PF entry + } + + ParsedBdf entry; + try { + entry = parseBdf(bdfStr); + } catch (const std::exception&) { + continue; + } + if (entry.base() != board.base()) { + continue; + } + + index_ = static_cast(std::stoul(name.substr(4), nullptr, 16)); + functionBdf_ = bdfStr; + + const auto pos = line.find("max QP:"); + if (pos != std::string::npos) { + qmax_ = static_cast(std::strtoul(line.c_str() + pos + 7, nullptr, 10)); + } + found = true; + if (entry.function.value_or(0) == 1) { + break; // Prefer the QDMA PF used by SLASH/V80. + } + } + + if (!found) { + throw std::runtime_error( + "No upstream QDMA function found for board " + board.base() + + " (is the stock qdma driver bound to this board's PF?)"); + } + + // Ask the driver how many MM (memory-mapped) DMA engine channels this + // function exposes so we can spread queues across them. CPM5 (V80) + // reports 2; older/soft IPs report 1. Best-effort: if the query fails or + // the attribute is absent, fall back to a single channel (channel 0). 
+ try { + XnlClient::Response info = nl_->sendCmd(XNL_CMD_DEV_INFO, index_, {}); + if (info.present[XNL_ATTR_DEV_MM_CHANNEL_MAX] && + info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX] > 0) { + mmChannelMax_ = info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX]; + } + } catch (const std::exception&) { + mmChannelMax_ = 1; + } +} + +QdmaDriverDevice::~QdmaDriverDevice() = default; + +void QdmaDriverDevice::refreshQmax() { + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {}); + std::istringstream lines(resp.genmsg); + std::string line; + + while (std::getline(lines, line)) { + std::istringstream tokens(line); + std::string name; + std::string bdfStr; + if (!(tokens >> name >> bdfStr) || bdfStr != functionBdf_) { + continue; + } + + const auto pos = line.find("max QP:"); + if (pos == std::string::npos) { + throw std::runtime_error("QDMA device list entry for " + functionBdf_ + + " does not report max QP"); + } + + qmax_ = static_cast(std::strtoul(line.c_str() + pos + 7, nullptr, 10)); + return; + } + + throw std::runtime_error("QDMA function " + functionBdf_ + + " disappeared from driver device list after qmax update"); +} + +void QdmaDriverDevice::ensureQmax(unsigned needed) { + if (qmax_ >= needed) { + return; + } + + const std::string path = "/sys/bus/pci/devices/" + functionBdf_ + "/qdma/qmax"; + std::ofstream qmaxFile(path); + if (!qmaxFile.is_open()) { + throw std::runtime_error( + "Need at least " + std::to_string(needed) + " queues but qmax is " + + std::to_string(qmax_) + " and cannot open " + path + + " to raise it (run as root, or set qmax manually with dma-ctl)"); + } + qmaxFile << needed << std::endl; + qmaxFile.close(); + if (qmaxFile.fail()) { + throw std::runtime_error( + "Failed to write qmax=" + std::to_string(needed) + " to " + path + + " (queues may be active; stop them or reload the driver)"); + } + refreshQmax(); + if (qmax_ < needed) { + throw std::runtime_error( + "QDMA qmax update requested " + std::to_string(needed) + + " queues, but 
driver reports only " + std::to_string(qmax_)); + } +} + +void QdmaDriverDevice::queueAdd(uint32_t qid) { + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_ADD, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) { + throw std::runtime_error("QDMA q add failed for qid " + std::to_string(qid) + ": " + + (resp.genmsg.empty() ? "netlink error" : resp.genmsg)); + } +} + +void QdmaDriverDevice::queueStart(uint32_t qid, uint32_t channel) { + // The caller chooses the MM engine channel for this queue pair. It has to + // be carried on `q start`: the driver only reads XNL_ATTR_MM_CHANNEL in its + // start handler (via qdma_queue_config) and defaults the queue to channel 0 + // whenever the attribute is absent. mmChannelMax_ is always >= 1, so the + // modulo keeps an out-of-range request inside the device's channel count. + channel %= mmChannelMax_; + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_START, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI_START}, + {XNL_ATTR_QRNGSZ_IDX, ringSizeIndex_}, {XNL_ATTR_MM_CHANNEL, channel}}); + if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) { + throw std::runtime_error("QDMA q start failed for qid " + std::to_string(qid) + ": " + + (resp.genmsg.empty() ? "netlink error" : resp.genmsg)); + } +} + +void QdmaDriverDevice::queueStop(uint32_t qid) noexcept { + try { + (void)nl_->sendCmd(XNL_CMD_Q_STOP, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + } catch (...) { + // Best-effort teardown. + } +} + +void QdmaDriverDevice::queueDel(uint32_t qid) noexcept { + try { + (void)nl_->sendCmd(XNL_CMD_Q_DEL, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + } catch (...) { + // Best-effort teardown. 
+ } +} + +std::string QdmaDriverDevice::charDevPath(uint32_t qid) const { + char name[64]; + std::snprintf(name, sizeof(name), "/dev/qdma%05x-MM-%u", index_, qid); + return std::string(name); +} + +QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, + uint64_t physAddr, uint64_t size, + int mmChannel) + : device_(&device), qid_(qid), physAddr_(physAddr) { + try { + mapping_ = raw::createHostMapping(size, physAddr); + + // mmChannel < 0 means auto: spread the queue across channels by qid. + const uint32_t channel = (mmChannel < 0) + ? qid_ + : static_cast(mmChannel); + + device_->queueAdd(qid_); + queueAdded_ = true; + device_->queueStart(qid_, channel); + queueStarted_ = true; + + const std::string path = device_->charDevPath(qid_); + fd_ = open(path.c_str(), O_RDWR | O_CLOEXEC); + if (fd_ < 0) { + throwSystemError("Failed to open QDMA char device " + path); + } + } catch (...) { + cleanup(); + throw; + } +} + +QdmaDriverBuffer::~QdmaDriverBuffer() { + cleanup(); +} + +void QdmaDriverBuffer::moveFrom(QdmaDriverBuffer& other) noexcept { + device_ = other.device_; + qid_ = other.qid_; + queueAdded_ = other.queueAdded_; + queueStarted_ = other.queueStarted_; + fd_ = other.fd_; + physAddr_ = other.physAddr_; + mapping_ = other.mapping_; + + other.device_ = nullptr; + other.qid_ = 0; + other.queueAdded_ = false; + other.queueStarted_ = false; + other.fd_ = -1; + other.physAddr_ = 0; + other.mapping_ = raw::HostMapping{}; +} + +void QdmaDriverBuffer::cleanup() noexcept { + if (fd_ >= 0) { + (void)close(fd_); + fd_ = -1; + } + if (device_ != nullptr && queueStarted_) { + device_->queueStop(qid_); + queueStarted_ = false; + } + if (device_ != nullptr && queueAdded_) { + device_->queueDel(qid_); + queueAdded_ = false; + } + raw::destroyHostMapping(mapping_); +} + +} // namespace smi::qdma_driver diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp new file mode 100644 index 00000000..e0d94fa8 --- /dev/null +++ 
b/smi/src/qdma_driver_backend.hpp
@@ -0,0 +1,160 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SMI_QDMA_DRIVER_BACKEND_HPP
+#define SMI_QDMA_DRIVER_BACKEND_HPP
+
+/// @file qdma_driver_backend.hpp
+/// @brief Raw-transfer backend for the off-the-shelf Xilinx QDMA driver.
+///
+/// This backend mirrors the surface of validate.cpp's SLASH RawTransferBuffer
+/// (data()/getSize()/syncToDevice()/syncFromDevice()) so the templated
+/// integrity and bandwidth tests work unchanged, but it drives the upstream
+/// QDMA driver (submodules/qdma_drv) instead of SLASH/libslash:
+///
+///  - Queue lifecycle (add/start/stop/del) is performed over generic netlink
+///    (family "xnl_pf"), exactly as the `dma-ctl` utility does.
+///  - The function's `qmax` is provisioned via sysfs if it is too small.
+///  - Data movement uses the per-queue char device /dev/qdma<idx>-MM-<qid>
+///    with the device address carried as the file offset.
+///
+/// Unlike SLASH there is no control device or custom ioctl ABI; the stock
+/// driver must be bound to the function for any of this to work.
+
+#include
+#include
+#include
+#include
+
+#include "raw_transfer.hpp"
+
+namespace smi::qdma_driver {
+
+/// Opaque generic-netlink client used to talk to the QDMA driver.
+class XnlClient;
+
+/// Represents a single PCIe function managed by the upstream QDMA driver.
+///
+/// Resolves the driver's device index from the board BDF, ensures enough
+/// queues are provisioned (qmax), and provides queue lifecycle operations.
+class QdmaDriverDevice {
+public:
+    /// @param boardBdf Board-level BDF "DDDD:BB:DD" (function is resolved by
+    ///                 enumerating the driver's device list).
+    explicit QdmaDriverDevice(const std::string& boardBdf,
+                              std::optional ringSizeIndex = std::nullopt);
+    ~QdmaDriverDevice();
+
+    QdmaDriverDevice(const QdmaDriverDevice&) = delete;
+    QdmaDriverDevice& operator=(const QdmaDriverDevice&) = delete;
+
+    /// Ensure the function has at least @p needed queues provisioned, writing
+    /// the sysfs `qmax` entry (which re-initializes the queue set) if required.
+    void ensureQmax(unsigned needed);
+
+    /// Add a bidirectional AXI-MM queue pair at relative index @p qid.
+    ///
+    /// This only adds the pair; call queueStart() afterwards to start it.
+    /// The MM engine channel is chosen at queueStart() too (the channel only
+    /// takes effect on `q start`; the driver ignores it on `q add`).
+    void queueAdd(uint32_t qid);
+    /// Start queue @p qid pinned to MM engine @p channel (0-based; reduced
+    /// modulo the device's channel count, so out-of-range values wrap).
+    void queueStart(uint32_t qid, uint32_t channel);
+
+    /// Stop + delete a queue pair. Best-effort; never throws (safe in dtors).
+    void queueStop(uint32_t qid) noexcept;
+    void queueDel(uint32_t qid) noexcept;
+
+    /// Char-device path for queue @p qid, e.g. "/dev/qdma61001-MM-0".
+    std::string charDevPath(uint32_t qid) const;
+
+    /// Resolved 0000:BB:DD.F PCI address of the QDMA function.
+    const std::string& functionBdf() const { return functionBdf_; }
+
+    /// Number of MM (memory-mapped) DMA engine channels the function exposes.
+    /// CPM5 (V80) reports 2; older/soft IPs report 1. Always >= 1.
+    unsigned mmChannelMax() const { return mmChannelMax_; }
+
+private:
+    void refreshQmax();
+
+    std::unique_ptr nl_;
+    unsigned index_ = 0;          ///< Driver device index (hex suffix of the driver's qdma<idx> name).
+    std::string functionBdf_;     ///< Full BDF including function.
+    unsigned qmax_ = 0;           ///< Currently provisioned queue count.
+    unsigned mmChannelMax_ = 1;   ///< Number of MM engine channels (>= 1).
+    uint32_t ringSizeIndex_ = 0;  ///< QRNGSZ_IDX used when starting queues.
+};
+
+/// One host buffer bound to a freshly-created upstream QDMA queue pair.
+///
+/// Satisfies the buffer concept used by validate.cpp's testDataIntegrity() /
+/// testBandwidth() templates.
+class QdmaDriverBuffer {
+public:
+    /// @param mmChannel Concrete MM channel to pin to, or any negative value
+    ///                  (e.g. -1) to spread queues by qid % channel-count.
+    QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size,
+                     int mmChannel);
+
+    QdmaDriverBuffer(const QdmaDriverBuffer&) = delete;
+    QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete;
+
+    QdmaDriverBuffer(QdmaDriverBuffer&& other) noexcept { moveFrom(other); }
+    QdmaDriverBuffer& operator=(QdmaDriverBuffer&& other) noexcept {
+        if (this != &other) {
+            cleanup();
+            moveFrom(other);
+        }
+        return *this;
+    }
+
+    ~QdmaDriverBuffer();
+
+    void* data() { return mapping_.data; }
+    uint64_t getSize() const { return mapping_.size; }
+
+    void syncToDevice(uint64_t offset, uint64_t size) {
+        raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step);
+        raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/true);
+    }
+
+    void syncFromDevice(uint64_t offset, uint64_t size) {
+        raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step);
+        raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/false);
+    }
+
+private:
+    void moveFrom(QdmaDriverBuffer& other) noexcept;
+    void cleanup() noexcept;
+
+    QdmaDriverDevice* device_ = nullptr;
+    uint32_t qid_ = 0;
+    bool queueAdded_ = false;
+    bool queueStarted_ = false;
+    int fd_ = -1;
+    uint64_t physAddr_ = 0;
+    raw::HostMapping mapping_{};
+};
+
+} // namespace smi::qdma_driver
+
+#endif // SMI_QDMA_DRIVER_BACKEND_HPP
diff --git a/smi/src/raw_transfer.hpp b/smi/src/raw_transfer.hpp
new file mode 100644
index 00000000..41988d31
--- /dev/null
+++ b/smi/src/raw_transfer.hpp
@@ -0,0 +1,242 @@
+/**
+ * The MIT License (MIT)
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software
+ * and associated documentation files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy, modify, merge, publish, distribute,
+ * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies or
+ * substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+ * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SMI_RAW_TRANSFER_HPP
+#define SMI_RAW_TRANSFER_HPP
+
+/// @file raw_transfer.hpp
+/// @brief Backend-agnostic helpers for the raw QDMA memory-mapped transfer
+///        tests used by `smi validate`.
+///
+/// The SLASH backend (libslash queue-pair fds) and the off-the-shelf Xilinx
+/// QDMA-driver backend (/dev/qdma<idx>-MM-<qid> char devices) share the exact
+/// same host-side buffer setup and pread/pwrite transfer loop -- only the way
+/// the file descriptor and device address get provisioned differs. Those
+/// shared pieces live here so both backends behave (and time) identically.
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/// Per-transfer timing instrumentation.
+///
+/// When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with
+/// -DSLASH_QDMA_TIMING=1), the raw-transfer path logs the wall-clock cost of
+/// each pwrite/pread syscall plus the aggregate per-transfer time and
+/// effective bandwidth. This is the userspace counterpart to the kernel's
+/// SLASH_QDMA_TIMING breakdown.
+#ifndef SLASH_QDMA_TIMING
+#define SLASH_QDMA_TIMING 0
+#endif
+
+namespace smi::raw {
+
+/// Host transfer sizes mirror libvrtd's QDMA staging policy.
+static constexpr uint64_t BASE_TRANSFER_STEP_SIZE = 4ULL * 1024ULL;
+
+[[noreturn]] inline void throwSystemError(const std::string& message) {
+    throw std::runtime_error(message + ": " + std::strerror(errno));
+}
+
+/// A host staging buffer plus the DMA granule it is backed by.
+///
+/// `step` is always BASE_TRANSFER_STEP_SIZE (4 KiB base pages). It is used
+/// only for range/alignment validation: the whole range is transferred in a
+/// single syscall and the kernel builds one DMA descriptor per page.
+struct HostMapping {
+    void* data = nullptr;
+    uint64_t size = 0;
+    uint64_t step = 0;
+};
+
+/// Create a host staging buffer of 4 KiB base pages for raw transfers. @p
+/// physAddr is the device address this buffer backs; it is currently unused
+/// (explicitly discarded below) and does not appear in any error message.
+inline HostMapping createHostMapping(uint64_t size, uint64_t physAddr) {
+    HostMapping mapping;
+    mapping.size = size;
+
+    // Map regular base pages. MAP_POPULATE is deliberately omitted: it would
+    // pre-fault the whole buffer during mmap(), i.e. before the MADV_NOHUGEPAGE
+    // below can take effect. On hosts with transparent hugepages set to
+    // "always", those early faults hand back 2 MiB THP compound pages, and
+    // MADV_NOHUGEPAGE does not split pages that are already faulted in. The
+    // driver's strict 4 KiB base-page path (slash_qdma_map_user_base_pages_to_sgl)
+    // then rejects every transfer with -EINVAL ("4 KiB transfer is not backed by
+    // a base page").
+    (void)physAddr;
+    mapping.data = mmap(nullptr,
+                        size,
+                        PROT_READ | PROT_WRITE,
+                        MAP_PRIVATE | MAP_ANONYMOUS,
+                        -1,
+                        0);
+    if (mapping.data == MAP_FAILED) {
+        throwSystemError("Failed to mmap raw transfer host buffer");
+    }
+
+    // Disable THP for this region *before* any page is faulted in, so that
+    // every fault below allocates a genuine 4 KiB base page.
+    if (madvise(mapping.data, size, MADV_NOHUGEPAGE) != 0) {
+        const int savedErrno = errno;
+        (void)munmap(mapping.data, size);
+        mapping.data = nullptr;
+        errno = savedErrno;
+        throwSystemError("Failed to disable transparent hugepages for raw transfer host buffer");
+    }
+
+    // Pre-fault the buffer as base pages, replacing the MAP_POPULATE dropped
+    // above. Touching one byte per page now that VM_NOHUGEPAGE is set forces
+    // the kernel to back each page with a 4 KiB base page up front (and keeps
+    // the page-fault cost out of the timed transfer loop).
+    {
+        volatile uint8_t* touch = static_cast(mapping.data);
+        for (uint64_t off = 0; off < size; off += BASE_TRANSFER_STEP_SIZE) {
+            touch[off] = 0;
+        }
+    }
+
+    mapping.step = BASE_TRANSFER_STEP_SIZE;
+    return mapping;
+}
+
+/// Unmap a mapping from createHostMapping(); only `data` is reset (size/step are kept).
+inline void destroyHostMapping(HostMapping& mapping) noexcept {
+    if (mapping.data != nullptr && mapping.data != MAP_FAILED) {
+        (void)munmap(mapping.data, mapping.size);
+        mapping.data = nullptr;
+    }
+}
+
+/// Validate that a [offset, offset+size) request is aligned and in range for a
+/// buffer of @p bufSize bytes backing device address @p physAddr, given the
+/// mapping's @p step.
+inline void validateSyncRange(uint64_t offset, uint64_t size, uint64_t bufSize, + uint64_t physAddr, uint64_t step) { + if (step == 0 || size == 0) { + throw std::invalid_argument("Invalid raw transfer size"); + } + if ((offset % step) != 0 || (size % step) != 0 || + (bufSize % step) != 0 || (physAddr % step) != 0) { + throw std::invalid_argument("Raw transfer range is not aligned to the host mapping step"); + } + if (offset > bufSize || size > bufSize - offset) { + throw std::out_of_range("Raw transfer range exceeds buffer size"); + } + // Both granules transfer the whole range in a single pread/pwrite, so the + // size must fit in ssize_t regardless of step. + if (size > static_cast(std::numeric_limits::max())) { + throw std::invalid_argument("Raw transfer size exceeds syscall limit"); + } +} + +/// Perform a raw memory-mapped QDMA transfer over @p fd using pread/pwrite, +/// with the device (endpoint) address encoded as the file offset. +/// +/// @param fd Per-queue char device / queue-pair fd. +/// @param data Host staging buffer base. +/// @param physAddr Device-side base address for this buffer. +/// @param offset Byte offset within the buffer (and added to physAddr). +/// @param size Number of bytes to transfer. +/// @param step Mapping step size (see HostMapping::step). +/// @param toDevice true for H2C (pwrite), false for C2H (pread). +inline void rawTransfer(int fd, void* data, uint64_t physAddr, uint64_t offset, + uint64_t size, [[maybe_unused]] uint64_t step, + bool toDevice) { + // Issue the whole range in a single syscall regardless of page granule. + // The kernel pins every page in the range and builds one descriptor per + // page, submitting a single multi-descriptor libqdma request (libqdma + // refills the descriptor ring as needed). This keeps syscall/submit + // overhead independent of the page size -- the 4 KiB path no longer costs + // one syscall (and one single-descriptor DMA) per page. 
+ const uint64_t syscallSize = size; + const uint64_t endOffset = offset + size; +#if SLASH_QDMA_TIMING + const auto xferStart = std::chrono::steady_clock::now(); +#endif + + for (uint64_t currOffset = offset; currOffset < endOffset; currOffset += syscallSize) { + uint64_t transferred = 0; + while (transferred < syscallSize) { + const auto* src = static_cast(data) + currOffset + transferred; + auto* dst = static_cast(data) + currOffset + transferred; + const size_t remaining = static_cast(syscallSize - transferred); + const off_t devOffset = static_cast(physAddr + currOffset + transferred); + +#if SLASH_QDMA_TIMING + const auto callStart = std::chrono::steady_clock::now(); +#endif + ssize_t ret = toDevice + ? pwrite(fd, src, remaining, devOffset) + : pread(fd, dst, remaining, devOffset); + + if (ret < 0 && errno == EINTR) { + continue; + } + if (ret <= 0) { + throwSystemError(toDevice ? "Raw QDMA write failed" : "Raw QDMA read failed"); + } +#if SLASH_QDMA_TIMING + const auto callNs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - callStart) + .count(); + std::fprintf(stderr, + "validate: timing %s dev=0x%llx bytes=%zu ret=%zd syscall=%lld ns\n", + toDevice ? "H2C" : "C2H", + static_cast(devOffset), remaining, ret, + static_cast(callNs)); +#endif + transferred += static_cast(ret); + } + } + +#if SLASH_QDMA_TIMING + const auto totalNs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - xferStart) + .count(); + const double mb = static_cast(size) / (1024.0 * 1024.0); + const double sec = static_cast(totalNs) / 1e9; + std::fprintf(stderr, + "validate: timing %s xfer dev=0x%llx size=%llu step=%llu total=%lld ns (%.1f MB/s)\n", + toDevice ? "H2C" : "C2H", + static_cast(physAddr + offset), + static_cast(size), + static_cast(step), static_cast(totalNs), + sec > 0.0 ? 
mb / sec : 0.0); +#endif +} + +} // namespace smi::raw + +#endif // SMI_RAW_TRANSFER_HPP diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index 063fdb8e..e9da5e3d 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -27,6 +27,9 @@ /// reset, validate, debug). #include +#include +#include +#include #include #include @@ -108,11 +111,90 @@ static int smiMain(int argc, char **argv) { // -- validate (memory integrity + bandwidth) -- auto* validateCommand = app.add_subcommand("validate", "Validate board memory (integrity + bandwidth)"); Validate::Options validateOptions; + auto addValidateSizeOption = [&](const char* name, uint64_t* target, const char* description) { + return validateCommand->add_option_function( + name, + [target, name, &validateOptions](const std::string& value) { + try { + *target = Validate::parseByteSizeOption(value); + validateOptions.placementExplicit = true; + } catch (const std::exception& e) { + throw CLI::ValidationError(name, e.what()); + } + }, + description); + }; validateCommand->add_option("-d,--device", validateOptions.bdf, "Board address (e.g. 03:00 or 0000:03:00)")->required(); validateCommand->add_option("-j,--threads", validateOptions.threads, "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u)); validateCommand->add_flag("-R,--no-reset", validateOptions.noReset, "Skip the device reset step before running memory tests"); + validateCommand->add_option_function("--mm-channel", + [&validateOptions](const std::string& value) { + try { + validateOptions.mmChannels = Validate::parseMmChannelSpec(value); + } catch (const std::exception& e) { + throw CLI::ValidationError("--mm-channel", e.what()); + } + }, + "AXI-MM/NoC channel per buffer: auto|0|1 applied to all buffers, or a " + "comma-separated list with exactly one entry per buffer position " + "(2 x --threads entries, e.g. -j 1 -> '0,1'); no repeating. " + "auto stripes across channels by qid&1. 
Default auto.") + ->default_str("auto"); + addValidateSizeOption("--buffer-size", &validateOptions.bufferSize, + "Size of each validate buffer; accepts bytes or k/K/m/M suffixes (max 512M)") + ->default_str("512M"); + addValidateSizeOption("--offset", &validateOptions.offset, + "Distance between logical validate buffer positions; accepts bytes or k/K/m/M suffixes") + ->default_str("512M"); + addValidateSizeOption("--starting-offset", &validateOptions.startingOffset, + "Offset from each memory-space base for logical position 0; accepts bytes or k/K/m/M suffixes") + ->default_str("0"); + auto* rawTransferFlag = validateCommand->add_flag("--raw-transfer-test", validateOptions.rawTransferTest, + "Use libslash raw QDMA transfers instead of VRTD buffers (implies --no-reset)"); + auto* useQdmaDriverFlag = validateCommand->add_flag("--use-qdma-driver", validateOptions.useQdmaDriver, + "Run the raw transfer test over the off-the-shelf Xilinx QDMA driver " + "(/dev/qdma-MM-) instead of SLASH; requires the stock qdma driver " + "bound to the board. 
Implies --no-reset; mutually exclusive with --raw-transfer-test"); + rawTransferFlag->excludes(useQdmaDriverFlag); + useQdmaDriverFlag->excludes(rawTransferFlag); + auto* ddrOnlyFlag = validateCommand->add_flag("--ddr-only", validateOptions.ddrOnly, + "Run only DDR memory tests (skip HBM)"); + auto* hbmOnlyFlag = validateCommand->add_flag("--hbm-only", validateOptions.hbmOnly, + "Run only HBM memory tests (skip DDR)"); + ddrOnlyFlag->excludes(hbmOnlyFlag); + hbmOnlyFlag->excludes(ddrOnlyFlag); + const std::map channelAllocationMap{ + {"auto", Validate::Options::ChannelAllocation::Auto}, + {"paired", Validate::Options::ChannelAllocation::Paired}, + }; + validateCommand->add_option("--channel-allocation", validateOptions.channelAllocation, + "Raw-transfer NoC channel/memory placement (raw modes only): " + "auto (interleaved: mm-channel=qid&1, linear addressing; default) or " + "paired (couple mm-channel to a distinct memory region/NSU per " + "--channel-region-stride, mirroring dma-perf offset_ch0/offset_ch1)") + ->transform(CLI::CheckedTransformer(channelAllocationMap, CLI::ignore_case)) + ->default_str("auto"); + addValidateSizeOption("--channel-region-stride", &validateOptions.channelRegionStride, + "In --channel-allocation paired mode, byte distance between the two per-channel " + "memory regions (NSU/pseudo-channel stride); accepts k/K/m/M/g/G suffixes") + ->default_str("16G"); + validateCommand->add_option_function("--ring-size-index", + [&validateOptions](uint32_t value) { + validateOptions.ringSizeIndex = value; + }, + "Raw-transfer queue descriptor-ring size index (0-15). 
Overrides the backend default.") + ->check(CLI::Range(0u, 15u)) + ->default_str("backend default"); + validateCommand->add_option("--bandwidth-iterations", validateOptions.bandwidthIterations, + "Raw-transfer bandwidth mode only: repeat each whole-buffer transfer this many times") + ->default_val(1)->check(CLI::Range(static_cast(1), + std::numeric_limits::max())); + validateCommand->add_option("--bandwidth-duration", validateOptions.bandwidthDuration, + "Raw-transfer bandwidth mode only: repeat whole-buffer transfers for this many seconds " + "(0 disables duration mode)") + ->default_val(0.0)->check(CLI::NonNegativeNumber); // -- debug (low-level debug utilities) -- auto* debugCommand = app.add_subcommand("debug", "Low-level debug utilities"); diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 605e9abc..60f28159 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -33,22 +33,733 @@ /// TODO: Decide whether vrt::Device should gain a vrtbin-less constructor so /// that commands like validate can go through the standard vrt:: layer. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "validate.hpp" +#include +#include +#include +#include #include +#include #include +#include #include +#include +#include #include #include +#include +#include +#include +#include #include +#include #include +#include +#include +#include + #include #include "bdf.hpp" +#include "raw_transfer.hpp" + +#ifdef SMI_ENABLE_QDMA_DRIVER_BACKEND +#include "qdma_driver_backend.hpp" +#endif + +extern "C" { +#include +} + +namespace { + +using smi::raw::throwSystemError; + +/// Region constants mirror vrt/vrtd/src/allocator.h, which is private. 
+static constexpr uint64_t HBM_BASE = 0x4000000000ULL; +static constexpr uint64_t DDR_BASE = 0x60000000000ULL; +static constexpr uint64_t MEM_REGION_SIZE = 512ULL * 1024 * 1024; +static constexpr uint64_t MEMORY_SPACE_SIZE = 64ULL * MEM_REGION_SIZE; +static constexpr uint64_t MAX_BUFFER_SIZE = MEM_REGION_SIZE; +static constexpr uint64_t TRANSFER_ALIGNMENT = 4096ULL; + +static constexpr uint32_t QDMA_Q_MODE_MM = 0; +static constexpr uint32_t QDMA_DIR_H2C = 0x1; +static constexpr uint32_t QDMA_DIR_C2H = 0x2; +static constexpr uint32_t QDMA_RING_SZ_IDX = 0; + +/// Required alignment for placement sizes/offsets: the QDMA transfer alignment +/// (4 KiB base pages). +static uint64_t requiredAlignment(const Validate::Options& options) { + (void)options; + return TRANSFER_ALIGNMENT; +} + +/// Per-buffer AXI-MM channel selection. A single-element list applies to every +/// buffer; otherwise the list has exactly one entry per logical position +/// (validated in validatePlacement) and is indexed directly. +static Validate::Options::MmChannel mmChannelForPosition(const Validate::Options& options, + uint64_t position) { + const auto& list = options.mmChannels; + return list.size() == 1 ? list.front() : list[position]; +} + +/// Map the per-buffer channel selection to the vrtd channel enum. +static vrtd::MmChannel vrtdMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return vrtd::MmChannel::Ch0; + case Validate::Options::MmChannel::Ch1: return vrtd::MmChannel::Ch1; + case Validate::Options::MmChannel::Auto: + default: return vrtd::MmChannel::Auto; + } +} + +/// Map the per-buffer channel selection to the SLASH UAPI channel enum. 
+static slash_qdma_mm_channel slashMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return SLASH_QDMA_MM_CHANNEL_0; + case Validate::Options::MmChannel::Ch1: return SLASH_QDMA_MM_CHANNEL_1; + case Validate::Options::MmChannel::Auto: + default: return SLASH_QDMA_MM_CHANNEL_AUTO; + } +} + +/// Map the per-buffer channel selection to a concrete channel for the +/// off-the-shelf QDMA driver; -1 means auto (queue spreads by qid % channels). +static int qdmaDriverMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return 0; + case Validate::Options::MmChannel::Ch1: return 1; + case Validate::Options::MmChannel::Auto: + default: return -1; + } +} + +static std::string trim(std::string_view text) { + size_t first = 0; + while (first < text.size() && + std::isspace(static_cast(text[first]))) { + ++first; + } + + size_t last = text.size(); + while (last > first && + std::isspace(static_cast(text[last - 1]))) { + --last; + } + + return std::string{text.substr(first, last - first)}; +} + +static uint64_t parseByteSizeText(std::string_view text) { + std::string value = trim(text); + if (value.empty()) { + throw std::invalid_argument("value must not be empty"); + } + + uint64_t multiplier = 1; + if (!value.empty() && (value.back() == 'b' || value.back() == 'B')) { + value.pop_back(); + } + if (!value.empty()) { + const char suffix = value.back(); + if (suffix == 'k' || suffix == 'K') { + multiplier = 1024ULL; + value.pop_back(); + } else if (suffix == 'm' || suffix == 'M') { + multiplier = 1024ULL * 1024ULL; + value.pop_back(); + } else if (suffix == 'g' || suffix == 'G') { + multiplier = 1024ULL * 1024ULL * 1024ULL; + value.pop_back(); + } + } + + value = trim(value); + if (value.empty() || value.front() == '-' || value.front() == '+') { + throw 
std::invalid_argument("value must be an unsigned byte count"); + } + + size_t parsed = 0; + uint64_t bytes = 0; + try { + bytes = std::stoull(value, &parsed, 0); + } catch (const std::exception&) { + throw std::invalid_argument("value must be an unsigned byte count"); + } + + if (parsed != value.size()) { + throw std::invalid_argument("unrecognized byte-size suffix"); + } + if (bytes > std::numeric_limits::max() / multiplier) { + throw std::invalid_argument("byte-size value is too large"); + } + + return bytes * multiplier; +} + +static bool isAligned(uint64_t value, uint64_t alignment) { + return (value % alignment) == 0; +} + +static bool checkAligned(const char* name, uint64_t value, uint64_t alignment) { + if (!isAligned(value, alignment)) { + std::cerr << "validate: " << name << " must be " << alignment + << "-byte aligned" << std::endl; + return false; + } + return true; +} + +static bool checkMemoryPlacementRange(const char* memoryName, + const Validate::Options& options, + uint64_t positions) { + if (positions == 0) { + return true; + } + + const uint64_t lastPosition = positions - 1; + if (lastPosition != 0 && + options.offset > (std::numeric_limits::max() - options.startingOffset) / + lastPosition) { + std::cerr << "validate: " << memoryName + << " placement overflows 64-bit address arithmetic" << std::endl; + return false; + } + + const uint64_t lastStart = options.startingOffset + lastPosition * options.offset; + if (lastStart > MEMORY_SPACE_SIZE || options.bufferSize > MEMORY_SPACE_SIZE - lastStart) { + std::cerr << "validate: " << memoryName << " placement exceeds available " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space" + << std::endl; + return false; + } + + return true; +} + +/// Paired-mode per-channel region stride (NSU / pseudo-channel spacing), +/// resolving 0 to half the per-memory address space. +static uint64_t pairedRegionStride(const Validate::Options& options) { + return options.channelRegionStride != 0 ? 
options.channelRegionStride + : (MEMORY_SPACE_SIZE / 2); +} + +/// Placement check for Paired channel allocation: even/odd positions occupy two +/// regions `pairedRegionStride()` bytes apart, each packed by in-region index. +/// Verifies neither region overflows into the next nor past the memory space. +static bool checkMemoryPlacementRangePaired(const char* memoryName, + const Validate::Options& options, + uint64_t positions) { + if (positions == 0) { + return true; + } + + const uint64_t stride = pairedRegionStride(options); + const uint64_t alignment = requiredAlignment(options); + if (stride == 0 || (stride % alignment) != 0) { + std::cerr << "validate: --channel-region-stride must be a non-zero multiple of " + << alignment << " bytes" << std::endl; + return false; + } + if (stride > MEMORY_SPACE_SIZE) { + std::cerr << "validate: --channel-region-stride exceeds the " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB per-memory address space" + << std::endl; + return false; + } + + // Highest in-region index used across both regions (positions 0..positions-1, + // split even/odd, each using index = position >> 1). + const uint64_t maxIndex = (positions - 1) >> 1; + if (maxIndex != 0 && + options.offset > (std::numeric_limits::max() - options.startingOffset) / maxIndex) { + std::cerr << "validate: " << memoryName + << " paired placement overflows 64-bit address arithmetic" << std::endl; + return false; + } + const uint64_t lastStart = options.startingOffset + maxIndex * options.offset; + + // Each region must hold its last buffer without spilling into the next region. + if (lastStart > stride || options.bufferSize > stride - lastStart) { + std::cerr << "validate: " << memoryName + << " paired placement overflows the per-channel region (stride " << stride + << " bytes); reduce --threads/--buffer-size/--offset or raise" + " --channel-region-stride" << std::endl; + return false; + } + // Region 1 sits one stride higher and must still fit the memory space. 
+ if (lastStart + options.bufferSize > MEMORY_SPACE_SIZE - stride) { + std::cerr << "validate: " << memoryName + << " paired placement exceeds available " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space" + << std::endl; + return false; + } + return true; +} + +static bool validatePlacement(const Validate::Options& options) { + const uint64_t positions = 2ULL * options.threads; + if (options.mmChannels.size() != 1 && options.mmChannels.size() != positions) { + std::cerr << "validate: --mm-channel list must have exactly 1 or " << positions + << " entries (one per buffer position = 2 x --threads); got " + << options.mmChannels.size() << std::endl; + return false; + } + + if (options.bufferSize == 0 || options.bufferSize > MAX_BUFFER_SIZE) { + std::cerr << "validate: --buffer-size must be in the range 1..512M" << std::endl; + return false; + } + if (options.offset == 0) { + std::cerr << "validate: --offset must be greater than zero" << std::endl; + return false; + } + const uint64_t alignment = requiredAlignment(options); + if (!checkAligned("--buffer-size", options.bufferSize, alignment) || + !checkAligned("--offset", options.offset, alignment) || + !checkAligned("--starting-offset", options.startingOffset, alignment)) { + return false; + } + if (options.offset < options.bufferSize) { + std::cerr << "validate: --offset must be at least --buffer-size so buffers do not overlap" + << std::endl; + return false; + } + + const bool paired = + options.channelAllocation == Validate::Options::ChannelAllocation::Paired; + if (paired && !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: --channel-allocation paired only applies to the raw transfer" + " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } + if ((options.bandwidthIterations > 1 || options.bandwidthDuration > 0.0) && + !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: 
--bandwidth-iterations/--bandwidth-duration only apply to the raw transfer" + " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } + if (options.ringSizeIndex.has_value() && + !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: --ring-size-index only applies to the raw transfer" + " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } + if (options.bandwidthDuration < 0.0) { + std::cerr << "validate: --bandwidth-duration must be non-negative" << std::endl; + return false; + } + if (options.ringSizeIndex.has_value() && *options.ringSizeIndex > 15) { + std::cerr << "validate: --ring-size-index must be in the range 0..15" << std::endl; + return false; + } + + const auto checkRange = paired ? checkMemoryPlacementRangePaired : checkMemoryPlacementRange; + if (!options.ddrOnly && !checkRange("HBM", options, positions)) { + return false; + } + if (!options.hbmOnly && !checkRange("DDR", options, positions)) { + return false; + } + + return true; +} + +static uint64_t addressFor(uint64_t memoryBase, + const Validate::Options& options, + uint64_t position) { + return memoryBase + options.startingOffset + position * options.offset; +} + +/// Device address for a raw-transfer buffer, honouring the channel-allocation +/// strategy. In Paired mode the mm-channel (position&1 -- which SLASH maps to +/// the SW-context host_id and hence the CPM5 NoC NMU) is coupled to a distinct +/// memory region (NSU): even positions land in region 0, odd positions in +/// region 1, pairedRegionStride() bytes higher, each packed by its in-region +/// index. This mirrors dma-perf's offset_ch0/offset_ch1 so the two NMUs drive +/// independent memory endpoints instead of converging on one. 
+static uint64_t rawAddressFor(uint64_t memoryBase, + const Validate::Options& options, + uint64_t position) { + if (options.channelAllocation == Validate::Options::ChannelAllocation::Paired) { + const uint64_t channel = position & 1ULL; + const uint64_t inRegionIndex = position >> 1; + return memoryBase + channel * pairedRegionStride(options) + + options.startingOffset + inRegionIndex * options.offset; + } + return addressFor(memoryBase, options, position); +} + +/// Print which raw-transfer channel-allocation strategy is in effect. +static void printChannelAllocation(const Validate::Options& options) { + if (options.channelAllocation == Validate::Options::ChannelAllocation::Paired) { + std::cout << "Channel allocation: paired (even positions -> mm-channel 0 / region 0, " + "odd -> mm-channel 1 / region 1; region stride 0x" + << std::hex << pairedRegionStride(options) << std::dec << " bytes)" << std::endl; + } else { + std::cout << "Channel allocation: auto (mm-channel = qid&1, linear addressing)" << std::endl; + } +} + +/// Print the raw-transfer queue ring-size override, when one was requested. +static void printRingSizeIndex(const Validate::Options& options) { + if (options.ringSizeIndex.has_value()) { + std::cout << "QDMA ring size index: " << *options.ringSizeIndex << std::endl; + } +} + +/// Print the per-buffer AXI-MM channel selection in effect. +static void printMmChannel(const Validate::Options& options) { + std::cout << "MM channel: "; + for (size_t i = 0; i < options.mmChannels.size(); ++i) { + if (i != 0) { + std::cout << ","; + } + switch (options.mmChannels[i]) { + case Validate::Options::MmChannel::Ch0: std::cout << "0"; break; + case Validate::Options::MmChannel::Ch1: std::cout << "1"; break; + case Validate::Options::MmChannel::Auto: + default: std::cout << "auto"; break; + } + } + std::cout << (options.mmChannels.size() == 1 ? 
" (all buffers)" : " (per buffer position)") + << std::endl; +} + +static bool checkHostMemoryBudget(const Validate::Options& options) { + const uint64_t maxConcurrentBuffers = (!options.ddrOnly && !options.hbmOnly) + ? 4ULL * options.threads + : 2ULL * options.threads; + const uint64_t requiredBytes = maxConcurrentBuffers * options.bufferSize; + + const long pageSize = sysconf(_SC_PAGESIZE); + const long availablePages = sysconf(_SC_AVPHYS_PAGES); + + if (pageSize <= 0 || availablePages <= 0) { + std::cerr << "Warning: unable to estimate available host memory for validate; " + << "peak mapped buffer footprint is " + << (requiredBytes / (1024ULL * 1024ULL)) << " MiB." << std::endl; + return true; + } + + const auto availableBytes = static_cast(pageSize) * + static_cast(availablePages); + if (requiredBytes > availableBytes) { + std::cerr << "validate: requested test can map up to " + << (requiredBytes / (1024ULL * 1024ULL)) << " MiB of host buffers, " + << "but only about " << (availableBytes / (1024ULL * 1024ULL)) + << " MiB is currently available. Reduce --threads or use --ddr-only/--hbm-only." + << std::endl; + return false; + } + + return true; +} + +static void warnIfNotRoot(const char* mode) { + if (geteuid() != 0) { + std::cerr << "Warning: " << mode + << " usually needs root or udev-granted access to QDMA device nodes/sysfs." 
+ << std::endl; + } +} + +std::string readDevNameFromUevent(const std::filesystem::path& miscPath) { + std::ifstream uevent(miscPath / "uevent"); + if (!uevent.is_open()) { + throw std::runtime_error("Failed to open " + (miscPath / "uevent").string()); + } + + std::string line; + while (std::getline(uevent, line)) { + static constexpr std::string_view key{"DEVNAME="}; + if (!line.starts_with(key)) { + continue; + } + + std::string devName = line.substr(key.size()); + while (!devName.empty() && (devName.back() == '\n' || devName.back() == '\r')) { + devName.pop_back(); + } + return "/dev/" + devName; + } + + throw std::runtime_error("No DEVNAME entry found in " + (miscPath / "uevent").string()); +} + +std::string resolveQdmaDevicePath(const std::string& boardBdf) { + static const std::filesystem::path MISC_PATH{"/sys/class/misc"}; + + const std::string exactName = "slash_qdma_ctl_" + boardBdf + ".1"; + const auto exactPath = MISC_PATH / exactName; + if (std::filesystem::exists(exactPath)) { + return readDevNameFromUevent(exactPath); + } + + const std::string prefix = "slash_qdma_ctl_" + boardBdf + "."; + std::vector matches; + for (const auto& entry : std::filesystem::directory_iterator(MISC_PATH)) { + const std::string name = entry.path().filename().string(); + if (name.starts_with(prefix)) { + matches.push_back(entry.path()); + } + } + + if (matches.empty()) { + throw std::runtime_error( + "No QDMA misc device found for board " + boardBdf + + " (looked for /sys/class/misc/" + prefix + "*)"); + } + + std::sort(matches.begin(), matches.end()); + if (matches.size() > 1) { + std::cerr << "Warning: multiple QDMA devices found for " << boardBdf + << "; using " << matches.front().filename().string() << std::endl; + } + + return readDevNameFromUevent(matches.front()); +} + +class RawQdmaDevice { +public: + explicit RawQdmaDevice(const std::string& path) : qdma_{slash_qdma_open(path.c_str())} { + if (qdma_ == nullptr) { + throwSystemError("Failed to open QDMA device " + 
path); + } + } + + RawQdmaDevice(const RawQdmaDevice&) = delete; + RawQdmaDevice& operator=(const RawQdmaDevice&) = delete; + + RawQdmaDevice(RawQdmaDevice&& other) noexcept : qdma_{other.qdma_} { + other.qdma_ = nullptr; + } + + RawQdmaDevice& operator=(RawQdmaDevice&& other) noexcept { + if (this != &other) { + cleanup(); + qdma_ = other.qdma_; + other.qdma_ = nullptr; + } + return *this; + } + + ~RawQdmaDevice() { + cleanup(); + } + + slash_qdma* get() const { + return qdma_; + } + +private: + void cleanup() { + if (qdma_ != nullptr) { + (void)slash_qdma_close(qdma_); + qdma_ = nullptr; + } + } + + slash_qdma* qdma_ = nullptr; +}; + +class RawTransferBuffer { +public: + RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size, + slash_qdma_mm_channel mmChannel, + uint32_t ringSizeIndex) + : qdma_{qdma}, physAddr_{physAddr}, size_{size}, + mmChannel_{mmChannel}, ringSizeIndex_{ringSizeIndex} { + try { + createBuffer(); + createQpair(); + } catch (...) { + cleanup(); + throw; + } + } + + RawTransferBuffer(const RawTransferBuffer&) = delete; + RawTransferBuffer& operator=(const RawTransferBuffer&) = delete; + + RawTransferBuffer(RawTransferBuffer&& other) noexcept { + moveFrom(other); + } + + RawTransferBuffer& operator=(RawTransferBuffer&& other) noexcept { + if (this != &other) { + cleanup(); + moveFrom(other); + } + return *this; + } + + ~RawTransferBuffer() { + cleanup(); + } + + void* data() { + return data_; + } + + uint64_t getSize() const { + return size_; + } + + void syncToDevice(uint64_t offset, uint64_t size) { + validateSyncRange(offset, size); + transfer(offset, size, /*toDevice=*/true); + } + + void syncFromDevice(uint64_t offset, uint64_t size) { + validateSyncRange(offset, size); + transfer(offset, size, /*toDevice=*/false); + } + +private: + void moveFrom(RawTransferBuffer& other) noexcept { + qdma_ = other.qdma_; + fd_ = other.fd_; + qid_ = other.qid_; + qpairCreated_ = other.qpairCreated_; + qpairStarted_ = other.qpairStarted_; + 
buf_ = other.buf_; + data_ = other.data_; + physAddr_ = other.physAddr_; + size_ = other.size_; + transferStepSize_ = other.transferStepSize_; + mmChannel_ = other.mmChannel_; + ringSizeIndex_ = other.ringSizeIndex_; + + other.qdma_ = nullptr; + other.fd_ = -1; + other.qid_ = 0; + other.qpairCreated_ = false; + other.qpairStarted_ = false; + other.buf_ = slash_qdma_buffer{}; + other.data_ = nullptr; + other.physAddr_ = 0; + other.size_ = 0; + other.transferStepSize_ = 0; + other.ringSizeIndex_ = QDMA_RING_SZ_IDX; + } + + void createBuffer() { + // The kernel owns the DMA buffer (pages + SGL + DMA map built once at + // create time); we mmap it for CPU access via buf_.addr. + if (slash_qdma_buffer_create(qdma_, size_, &buf_) != 0) { + throwSystemError("Failed to create raw transfer DMA buffer"); + } + data_ = buf_.addr; + transferStepSize_ = smi::raw::BASE_TRANSFER_STEP_SIZE; + + // Pre-fault the mapping so the page-fault cost stays out of the timed + // transfer loop. + auto* touch = static_cast(data_); + for (uint64_t off = 0; off < size_; off += transferStepSize_) { + touch[off] = 0; + } + } + + void createQpair() { + if (qdma_ == nullptr || size_ == 0) { + throw std::invalid_argument("Invalid raw transfer buffer arguments"); + } + + struct slash_qdma_qpair_add req{}; + req.size = sizeof(req); + req.mode = QDMA_Q_MODE_MM; + req.dir_mask = QDMA_DIR_H2C | QDMA_DIR_C2H; + req.mm_channel = mmChannel_; + req.h2c_ring_sz = ringSizeIndex_; + req.c2h_ring_sz = ringSizeIndex_; + req.cmpt_ring_sz = ringSizeIndex_; + + if (slash_qdma_qpair_add(qdma_, &req) != 0) { + throwSystemError("Failed to add raw transfer QDMA queue pair"); + } + qid_ = req.qid; + qpairCreated_ = true; + + if (slash_qdma_qpair_start(qdma_, qid_) != 0) { + throwSystemError("Failed to start raw transfer QDMA queue pair"); + } + qpairStarted_ = true; + + fd_ = slash_qdma_qpair_get_fd(qdma_, qid_, O_CLOEXEC); + if (fd_ < 0) { + throwSystemError("Failed to get raw transfer QDMA queue fd"); + } + } + + void 
validateSyncRange(uint64_t offset, uint64_t size) const { + smi::raw::validateSyncRange(offset, size, size_, physAddr_, transferStepSize_); + } + + void transfer(uint64_t offset, uint64_t size, bool toDevice) { + // Issue via the array transfer ioctl with a single sub-transfer on this + // buffer's queue pair (qpair_index 0). Channel parallelism for the + // bandwidth test comes from running many buffers concurrently, each + // pinned to a channel by mm_channel (see the channel-allocation knobs). + struct slash_qdma_subxfer xfer{}; + xfer.qpair_index = 0; + xfer.direction = toDevice ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; + xfer.buf_fd = buf_.fd; + xfer.buf_offset = offset; + xfer.dev_addr = physAddr_ + offset; + xfer.length = size; -/// Buffer size for each allocation (64 MB — one allocator subregion). -static constexpr uint64_t BUFFER_SIZE = 64ULL * 1024 * 1024; + ssize_t n = slash_qdma_qpair_transfer_batch(fd_, &xfer, 1); + if (n < 0) { + throwSystemError(toDevice ? "Raw QDMA write failed" + : "Raw QDMA read failed"); + } + if (static_cast(n) != size) { + throw std::runtime_error("Raw QDMA transfer moved fewer bytes than requested"); + } + } + + void cleanup() { + if (fd_ >= 0) { + (void)close(fd_); + fd_ = -1; + } + if (qpairStarted_) { + (void)slash_qdma_qpair_stop(qdma_, qid_); + qpairStarted_ = false; + } + if (qpairCreated_) { + (void)slash_qdma_qpair_del(qdma_, qid_); + qpairCreated_ = false; + } + if (buf_.addr != nullptr) { + (void)slash_qdma_buffer_destroy(&buf_); + buf_ = slash_qdma_buffer{}; + } + data_ = nullptr; + } + + slash_qdma* qdma_ = nullptr; + int fd_ = -1; + uint32_t qid_ = 0; + bool qpairCreated_ = false; + bool qpairStarted_ = false; + slash_qdma_buffer buf_{}; + void* data_ = nullptr; + uint64_t physAddr_ = 0; + uint64_t size_ = 0; + uint64_t transferStepSize_ = 0; + slash_qdma_mm_channel mmChannel_ = SLASH_QDMA_MM_CHANNEL_AUTO; + uint32_t ringSizeIndex_ = QDMA_RING_SZ_IDX; +}; /// Fill @p buf with a deterministic pattern seeded by 
@p seed. static void fillPattern(void* buf, uint64_t size, uint32_t seed) { @@ -74,12 +785,19 @@ static bool verifyPattern(const void* buf, uint64_t size, uint32_t seed) { /// Run data integrity on every buffer: write pattern → sync to device → /// clear host → sync from device → verify. +/// +/// Output policy: per-buffer FAIL lines are printed as failures occur; OK +/// buffers are silent. A single summary line ("N/N OK" or "M/N OK, K +/// FAIL") is printed at the end. +/// /// @return true if all buffers pass. -static bool testDataIntegrity(std::vector& buffers, +template +static bool testDataIntegrity(std::vector& buffers, const std::string& label) { - bool allPassed = true; + const size_t total = buffers.size(); + size_t passed = 0; - for (size_t i = 0; i < buffers.size(); ++i) { + for (size_t i = 0; i < total; ++i) { auto& buf = buffers[i]; uint32_t seed = static_cast(i); uint64_t size = buf.getSize(); @@ -90,73 +808,726 @@ static bool testDataIntegrity(std::vector& buffers, std::memset(buf.data(), 0, size); buf.syncFromDevice(0, size); - bool ok = verifyPattern(buf.data(), size, seed); - std::cout << " " << label << i << ": " - << (ok ? 
"OK" : "FAIL") << std::endl; - - if (!ok) { - allPassed = false; + if (verifyPattern(buf.data(), size, seed)) { + ++passed; + } else { + std::cout << " " << label << i << ": FAIL" << std::endl; } } - return allPassed; + if (passed == total) { + std::cout << " " << total << "/" << total << " OK" << std::endl; + } else { + std::cout << " " << passed << "/" << total << " OK, " + << (total - passed) << " FAIL" << std::endl; + } + + return passed == total; +} + +static double mbPerSecond(uint64_t bytes, std::chrono::duration elapsed) { + const double totalMB = static_cast(bytes) / (1024.0 * 1024.0); + return totalMB / elapsed.count(); +} + +static void printBandwidthMetric(const char* label, double mbps) { + std::cout << " " << label << ": " << std::fixed << std::setprecision(2) + << mbps << " MB/s" << std::endl; +} + +struct BandwidthRepeatOptions { + uint64_t iterations = 1; + std::chrono::duration duration{0.0}; + + bool durationMode() const { + return duration.count() > 0.0; + } + + bool isRepeated() const { + return durationMode() || iterations > 1; + } +}; + +static BandwidthRepeatOptions repeatOptionsFromValidate(const Validate::Options& options) { + BandwidthRepeatOptions repeat; + repeat.iterations = std::max(1, options.bandwidthIterations); + repeat.duration = std::chrono::duration(options.bandwidthDuration); + return repeat; } -/// Measure aggregate write and read bandwidth across all buffers in parallel -/// (one std::thread per buffer). 
-static void testBandwidth(std::vector& buffers) { +static void printBandwidthRepeatMode(const BandwidthRepeatOptions& repeat) { + if (repeat.durationMode()) { + std::cout << "Bandwidth mode: duration " << std::fixed << std::setprecision(3) + << repeat.duration.count() << " s" << std::endl; + } else if (repeat.iterations > 1) { + std::cout << "Bandwidth mode: " << repeat.iterations << " iterations" << std::endl; + } +} + +template +static uint64_t fillBuffers(std::vector& buffers, int value) { uint64_t totalBytes = 0; for (auto& buf : buffers) { - std::memset(buf.data(), 0xAB, buf.getSize()); + std::memset(buf.data(), value, buf.getSize()); totalBytes += buf.getSize(); } + return totalBytes; +} - // -- Write (H2C) bandwidth -- - auto writeStart = std::chrono::steady_clock::now(); - { - std::vector threads; - threads.reserve(buffers.size()); - for (auto& buf : buffers) { - threads.emplace_back([&buf] { - buf.syncToDevice(0, buf.getSize()); - }); +template +static void launchTransferThreads(std::vector& buffers, + bool toDevice, + std::vector& threads, + std::vector& errors, + size_t errorOffset) { + for (size_t i = 0; i < buffers.size(); ++i) { + threads.emplace_back([&buffers, &errors, i, errorOffset, toDevice] { + try { + if (toDevice) { + buffers[i].syncToDevice(0, buffers[i].getSize()); + } else { + buffers[i].syncFromDevice(0, buffers[i].getSize()); + } + } catch (...) 
{ + errors[errorOffset + i] = std::current_exception(); + } + }); + } +} + +template +static void runTransfers(std::vector& buffers, bool toDevice) { + std::vector threads; + std::vector errors(buffers.size()); + threads.reserve(buffers.size()); + + launchTransferThreads(buffers, toDevice, threads, errors, 0); + + for (auto& t : threads) { + t.join(); + } + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); } - for (auto& t : threads) { - t.join(); + } +} + +static uint64_t joinRepeatedTransferThreads(std::vector& threads, + std::vector& errors, + const std::vector& bytes) { + for (auto& t : threads) { + t.join(); + } + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); } } - auto writeEnd = std::chrono::steady_clock::now(); - // -- Read (C2H) bandwidth -- - auto readStart = std::chrono::steady_clock::now(); - { - std::vector threads; - threads.reserve(buffers.size()); + uint64_t totalBytes = 0; + for (uint64_t value : bytes) { + totalBytes += value; + } + return totalBytes; +} + +template +static std::pair> +runRepeatedTransfers(std::vector& buffers, + bool toDevice, + const BandwidthRepeatOptions& repeat) { + std::vector threads; + std::vector errors(buffers.size()); + std::vector bytes(buffers.size(), 0); + threads.reserve(buffers.size()); + + const auto start = std::chrono::steady_clock::now(); + const auto deadline = start + repeat.duration; + + for (size_t i = 0; i < buffers.size(); ++i) { + threads.emplace_back([&buffers, &errors, &bytes, i, toDevice, repeat, deadline] { + try { + const uint64_t size = buffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + if (toDevice) { + buffers[i].syncToDevice(0, size); + } else { + buffers[i].syncFromDevice(0, size); + } + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + if (toDevice) { + buffers[i].syncToDevice(0, size); + } else { + 
buffers[i].syncFromDevice(0, size); + } + ++completed; + } + } + + bytes[i] = completed * size; + } catch (...) { + errors[i] = std::current_exception(); + } + }); + } + + const uint64_t totalBytes = joinRepeatedTransferThreads(threads, errors, bytes); + const auto end = std::chrono::steady_clock::now(); + return {totalBytes, end - start}; +} + +template +static double testSingleDirectionBandwidth(std::vector& buffers, + bool toDevice, + const BandwidthRepeatOptions& repeat = {}) { + (void)fillBuffers(buffers, toDevice ? 0xAB : 0xCD); + + if (!toDevice) { + runTransfers(buffers, /*toDevice=*/true); for (auto& buf : buffers) { - threads.emplace_back([&buf] { - buf.syncFromDevice(0, buf.getSize()); - }); + std::memset(buf.data(), 0, buf.getSize()); } - for (auto& t : threads) { - t.join(); + } + + const auto [totalBytes, elapsed] = runRepeatedTransfers(buffers, toDevice, repeat); + + return mbPerSecond(totalBytes, elapsed); +} + +template +static void testBidirectionalBandwidth(std::vector& writeBuffers, + std::vector& readBuffers, + const BandwidthRepeatOptions& repeat = {}) { + (void)fillBuffers(writeBuffers, 0xAB); + (void)fillBuffers(readBuffers, 0xCD); + + // Prime device memory before timing so the C2H side reads initialized data. 
+ runTransfers(readBuffers, /*toDevice=*/true); + for (auto& buf : readBuffers) { + std::memset(buf.data(), 0, buf.getSize()); + } + + std::vector threads; + std::vector errors(writeBuffers.size() + readBuffers.size()); + std::vector writeThreadBytes(writeBuffers.size(), 0); + std::vector readThreadBytes(readBuffers.size(), 0); + threads.reserve(errors.size()); + + const auto start = std::chrono::steady_clock::now(); + const auto deadline = start + repeat.duration; + + for (size_t i = 0; i < writeBuffers.size(); ++i) { + threads.emplace_back([&writeBuffers, &errors, &writeThreadBytes, i, repeat, deadline] { + try { + const uint64_t size = writeBuffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + writeBuffers[i].syncToDevice(0, size); + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + writeBuffers[i].syncToDevice(0, size); + ++completed; + } + } + + writeThreadBytes[i] = completed * size; + } catch (...) { + errors[i] = std::current_exception(); + } + }); + } + for (size_t i = 0; i < readBuffers.size(); ++i) { + threads.emplace_back([&readBuffers, &errors, &readThreadBytes, i, + repeat, deadline, errorOffset = writeBuffers.size()] { + try { + const uint64_t size = readBuffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + readBuffers[i].syncFromDevice(0, size); + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + readBuffers[i].syncFromDevice(0, size); + ++completed; + } + } + + readThreadBytes[i] = completed * size; + } catch (...) 
{ + errors[errorOffset + i] = std::current_exception(); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + const auto end = std::chrono::steady_clock::now(); + + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); } } - auto readEnd = std::chrono::steady_clock::now(); - double writeSec = std::chrono::duration(writeEnd - writeStart).count(); - double readSec = std::chrono::duration(readEnd - readStart).count(); - double totalMB = static_cast(totalBytes) / (1024.0 * 1024.0); + const auto elapsed = end - start; + uint64_t writeBytes = 0; + uint64_t readBytes = 0; + for (uint64_t value : writeThreadBytes) { + writeBytes += value; + } + for (uint64_t value : readThreadBytes) { + readBytes += value; + } + const double writeMBps = mbPerSecond(writeBytes, elapsed); + const double readMBps = mbPerSecond(readBytes, elapsed); + + printBandwidthMetric("Read", readMBps); + printBandwidthMetric("Write", writeMBps); + printBandwidthMetric("Total", readMBps + writeMBps); +} + +template +static void testBandwidthSuite(std::vector& singleDirectionBuffers, + const std::string& label, + const std::string& backendSuffix, + const BandwidthRepeatOptions& repeat = {}) { + std::cout << "Testing " << label << " read bandwidth (" + << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." << std::endl; + printBandwidthMetric("Read", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/false, repeat)); + + std::cout << "Testing " << label << " write bandwidth (" + << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." 
<< std::endl; + printBandwidthMetric("Write", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/true, repeat)); +} + +template +static void testBidirectionalBandwidthSuite(std::vector& bidirectionalWriteBuffers, + std::vector& bidirectionalReadBuffers, + const std::string& label, + const std::string& backendSuffix, + const BandwidthRepeatOptions& repeat = {}) { + std::cout << "Testing " << label << " bidirectional bandwidth (" + << (bidirectionalWriteBuffers.size() + bidirectionalReadBuffers.size()) + << " threads" << backendSuffix << ")..." << std::endl; + testBidirectionalBandwidth(bidirectionalWriteBuffers, bidirectionalReadBuffers, repeat); +} + +static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device, + const Validate::Options& options, + uint64_t position) { + if (options.placementExplicit) { + return device.openRawBuffer(addressFor(HBM_BASE, options, position), + options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdMmChannel(options, position)); + } + + return device.openHbmBuffer(static_cast(position), options.bufferSize, + vrtd::BufferAllocDir::Bidirectional, + vrtdMmChannel(options, position)); +} + +static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, + const Validate::Options& options, + uint64_t position) { + if (options.placementExplicit) { + return device.openRawBuffer(addressFor(DDR_BASE, options, position), + options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdMmChannel(options, position)); + } - std::cout << " Write: " << std::fixed << std::setprecision(2) - << (totalMB / writeSec) << " MB/s" << std::endl; - std::cout << " Read: " << std::fixed << std::setprecision(2) - << (totalMB / readSec) << " MB/s" << std::endl; + return device.openDdrBuffer(options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdMmChannel(options, position)); +} + +static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { + const unsigned N = options.threads; + const 
BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options); + + if (!options.noReset) { + std::cout << "Raw transfer mode skips reset; continuing without VRTD reset." << std::endl; + } + warnIfNotRoot("SLASH raw transfer mode"); + + const std::string qdmaPath = resolveQdmaDevicePath(bdf); + std::cout << "Using raw QDMA device " << qdmaPath << "..." << std::endl; + printChannelAllocation(options); + printMmChannel(options); + printRingSizeIndex(options); + printBandwidthRepeatMode(repeat); + + RawQdmaDevice qdma(qdmaPath); + const uint32_t ringSizeIndex = options.ringSizeIndex.value_or(QDMA_RING_SZ_IDX); + + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions, raw QDMA)..." << std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), + options.bufferSize, + slashMmChannel(options, i), ringSizeIndex); + } + + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(hbmBuffers, "HBM", ", raw QDMA", repeat); + } + { + // Bidirectional HBM: positions interleave R/W across regions + // 0..2N-1. Reads land on even regions, writes on odd regions. 
+ std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.emplace_back(qdma.get(), + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize, + slashMmChannel(options, 2 * i), ringSizeIndex); + hbmWriteBuffers.emplace_back(qdma.get(), + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize, + slashMmChannel(options, 2 * i + 1), ringSizeIndex); + } + + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA", repeat); + } + } + + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers, raw QDMA)..." << std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), + options.bufferSize, + slashMmChannel(options, i), ringSizeIndex); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ", raw QDMA", repeat); + } + { + // Bidirectional DDR: positions interleave R/W across slot indices + // 0..2N-1 of the DDR address space. 
+ std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrReadBuffers.emplace_back(qdma.get(), + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize, + slashMmChannel(options, 2 * i), ringSizeIndex); + ddrWriteBuffers.emplace_back(qdma.get(), + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize, + slashMmChannel(options, 2 * i + 1), ringSizeIndex); + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA", repeat); + } + } + + if (!options.ddrOnly && !options.hbmOnly) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), + options.bufferSize, + slashMmChannel(options, i), ringSizeIndex); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), + options.bufferSize, + slashMmChannel(options, i), ringSizeIndex); + } + + testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA", repeat); + } + { + // Bidirectional HBM+DDR: 4N positions total. Positions 0..2N-1 + // are HBM (interleaved R/W across regions 0..2N-1); positions + // 2N..4N-1 are DDR (interleaved R/W across DDR slots 0..2N-1). + // Channel = (p / 2) & 1 throughout. 
+ std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma.get(), + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize, + slashMmChannel(options, 2 * i), ringSizeIndex); + parWriteBuffers.emplace_back(qdma.get(), + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize, + slashMmChannel(options, 2 * i + 1), ringSizeIndex); + } + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma.get(), + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize, + slashMmChannel(options, 2 * i), ringSizeIndex); + parWriteBuffers.emplace_back(qdma.get(), + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize, + slashMmChannel(options, 2 * i + 1), ringSizeIndex); + } + + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA", repeat); + } + } + + return 0; +} + +/// Raw integrity + bandwidth test driven over the off-the-shelf Xilinx QDMA +/// driver instead of SLASH. smi provisions queues itself (qmax + netlink +/// add/start) and transfers over the per-queue char devices. +static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& options) { +#ifndef SMI_ENABLE_QDMA_DRIVER_BACKEND + (void)bdf; + (void)options; + std::cerr << "validate: this v80-smi build was configured without " + << "--use-qdma-driver support. Rebuild with " + << "-DSMI_ENABLE_QDMA_DRIVER_BACKEND=ON." << std::endl; + return 1; +#else + const unsigned N = options.threads; + const BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options); + + if (!options.noReset) { + std::cout << "QDMA-driver raw mode skips reset; continuing without VRTD reset." << std::endl; + } + warnIfNotRoot("QDMA-driver raw mode"); + + const bool runParallel = !options.ddrOnly && !options.hbmOnly; + + std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." 
<< std::endl; + printChannelAllocation(options); + printMmChannel(options); + printRingSizeIndex(options); + printBandwidthRepeatMode(repeat); + smi::qdma_driver::QdmaDriverDevice qdma(bdf, options.ringSizeIndex); + std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl; + qdma.ensureQmax(runParallel ? 4 * N : 2 * N); + + const unsigned mmChannels = qdma.mmChannelMax(); + if (mmChannels > 1) { + std::cout << "Distributing queues across " << mmChannels + << " MM channels (channel = qid % " << mmChannels << ")." << std::endl; + } else { + std::cout << "Device exposes a single MM channel; all queues on channel 0." << std::endl; + } + + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions, QDMA driver)..." << std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), + options.bufferSize, + qdmaDriverMmChannel(options, i)); + } + + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(hbmBuffers, "HBM", ", QDMA driver", repeat); + } + { + std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.emplace_back(qdma, i, + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i)); + hbmWriteBuffers.emplace_back(qdma, N + i, + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i + 1)); + } + + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver", repeat); + } + } + + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers, QDMA driver)..." 
<< std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i), + options.bufferSize, + qdmaDriverMmChannel(options, i)); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ", QDMA driver", repeat); + } + { + std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrReadBuffers.emplace_back(qdma, i, + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i)); + ddrWriteBuffers.emplace_back(qdma, N + i, + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i + 1)); + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver", repeat); + } + } + + if (runParallel) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), + options.bufferSize, + qdmaDriverMmChannel(options, i)); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i), + options.bufferSize, + qdmaDriverMmChannel(options, i)); + } + + testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver", repeat); + } + { + std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma, i, + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i)); + parWriteBuffers.emplace_back(qdma, 2 * N + i, + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i + 1)); + } + 
for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma, N + i, + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i)); + parWriteBuffers.emplace_back(qdma, 3 * N + i, + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize, + qdmaDriverMmChannel(options, 2 * i + 1)); + } + + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver", repeat); + } + } + + return 0; +#endif +} + +} // namespace + +uint64_t Validate::parseByteSizeOption(const std::string& text) { + return parseByteSizeText(text); +} + +std::vector Validate::parseMmChannelSpec(const std::string& text) { + std::vector result; + size_t start = 0; + while (true) { + const size_t comma = text.find(',', start); + std::string token = trim(comma == std::string::npos ? text.substr(start) + : text.substr(start, comma - start)); + std::transform(token.begin(), token.end(), token.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (token == "auto") { + result.push_back(Options::MmChannel::Auto); + } else if (token == "0") { + result.push_back(Options::MmChannel::Ch0); + } else if (token == "1") { + result.push_back(Options::MmChannel::Ch1); + } else { + throw std::invalid_argument("mm-channel entries must be auto, 0, or 1"); + } + if (comma == std::string::npos) { + break; + } + start = comma + 1; + } + if (result.empty()) { + throw std::invalid_argument("mm-channel spec must not be empty"); + } + return result; } int Validate::run(const Options& options) { std::string bdf = resolveBoardBdf(options.bdf, "validate"); unsigned N = options.threads; + if (!validatePlacement(options)) { + return 1; + } + + if (!checkHostMemoryBudget(options)) { + return 1; + } + + if (options.rawTransferTest) { + return runRawTransferTest(bdf, options); + } + + if (options.useQdmaDriver) { + return runQdmaDriverTest(bdf, options); + } + // -- Step 1: (Optional) Reset the device via vrtd -- if 
(!options.noReset) { std::cout << "Resetting device " << bdf << "..." << std::endl; @@ -171,43 +1542,117 @@ int Validate::run(const Options& options) { vrtd::Session session; auto device = session.getDeviceByBdf(bdf); + printMmChannel(options); + // -- Step 2: HBM — integrity then bandwidth -- - std::cout << "Testing HBM data integrity (" << N << " regions)..." << std::endl; - { - std::vector hbmBuffers; - hbmBuffers.reserve(N); - for (unsigned i = 0; i < N; ++i) { - hbmBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); - } + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions)..." << std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.push_back(openValidateHbmBuffer(device, options, i)); + } + + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } - if (!testDataIntegrity(hbmBuffers, "HBM")) { - std::cerr << "HBM data integrity check failed" << std::endl; - return 1; + testBandwidthSuite(hbmBuffers, "HBM", ""); } + // HBM buffers released. + { + std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i)); + hbmWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1)); + } - std::cout << "Testing HBM bandwidth (" << N << " threads)..." << std::endl; - testBandwidth(hbmBuffers); + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ""); + } + // Bidirectional HBM buffers released. } - // HBM buffers released. // -- Step 3: DDR — integrity then bandwidth -- - std::cout << "Testing DDR data integrity (" << N << " buffers)..." 
<< std::endl; - { - std::vector ddrBuffers; - ddrBuffers.reserve(N); - for (unsigned i = 0; i < N; ++i) { - ddrBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers)..." << std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ""); + } + // DDR buffers released. + { + std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + if (options.placementExplicit) { + ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i)); + ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1)); + } else { + ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, i)); + ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ""); } + // Bidirectional DDR buffers released. + } + + // -- Step 4: HBM + DDR in parallel -- + if (!options.ddrOnly && !options.hbmOnly) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.push_back(openValidateHbmBuffer(device, options, i)); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } - if (!testDataIntegrity(ddrBuffers, "DDR")) { - std::cerr << "DDR data integrity check failed" << std::endl; - return 1; + testBandwidthSuite(parBuffers, "HBM+DDR", ""); } + // Parallel single-direction buffers released. 
+ { + std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i)); + parWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1)); + } + for (unsigned i = 0; i < N; ++i) { + if (options.placementExplicit) { + parReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i)); + parWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1)); + } else { + parWriteBuffers.push_back(openValidateDdrBuffer(device, options, i)); + parReadBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } + } - std::cout << "Testing DDR bandwidth (" << N << " threads)..." << std::endl; - testBandwidth(ddrBuffers); + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ""); + } + // Parallel bidirectional buffers released. } - // DDR buffers released. return 0; } diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index 2e5d1f8e..ef15a174 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -24,11 +24,15 @@ /// @file validate.hpp /// @brief Declaration of the Validate command. /// -/// The Validate command resets a V80 board and then exercises DDR and HBM -/// memory via PCIe by running data integrity checks followed by parallel -/// bandwidth measurements. +/// The Validate command optionally resets a V80 board and then exercises DDR +/// and HBM memory via PCIe by running data integrity checks followed by +/// parallel bandwidth measurements. Raw transfer modes skip reset and bypass +/// the default VRTD buffer path. +#include +#include #include +#include /// @brief Static entry-point for the validate command. /// @@ -39,15 +43,69 @@ class Validate { public: /// @brief Options parsed from the CLI for the validate command. struct Options { + /// @brief How raw-transfer buffers map QDMA MM/NoC channels onto memory. 
+ /// + /// On CPM5 the host-side NoC ingress port (NMU) is selected per queue by + /// the SW-context mm-channel/host_id (SLASH uses qid&1), while the + /// memory-side NoC egress endpoint (NSU / pseudo-channel) is selected by + /// the device address. Sustaining both NMUs requires also spreading + /// across two NSUs; otherwise both ports converge on one memory endpoint + /// and bandwidth caps at a single path. This mirrors the off-the-shelf + /// dma-perf knobs offset_ch0/offset_ch1. + enum class ChannelAllocation { + Auto, ///< Interleaved: driver picks mm-channel (qid&1), addresses linear. Default; current behaviour. + Paired, ///< Couple mm-channel to a distinct memory region: even positions -> region 0, odd -> region 1. + }; + + /// @brief Per-queue AXI-MM/NoC channel selection for a buffer. + /// + /// Auto lets the driver stripe by qid&1; Ch0/Ch1 pin the queue to a + /// single AXI-MM channel (and hence NoC channel). Applies to the VRTD, + /// raw SLASH, and off-the-shelf QDMA-driver backends. + enum class MmChannel { + Auto, ///< Driver stripes by qid&1 (default). + Ch0, ///< Pin to AXI-MM/NoC channel 0. + Ch1, ///< Pin to AXI-MM/NoC channel 1. + }; + std::string bdf; ///< BDF (Bus:Device.Function) address of the target device. unsigned threads = 8; ///< Number of parallel buffers/threads (1-64). bool noReset = false; ///< Skip the device reset step before running memory tests. + bool ddrOnly = false; ///< Skip HBM phase (mutually exclusive with hbmOnly). + bool hbmOnly = false; ///< Skip DDR phase (mutually exclusive with ddrOnly). + bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. + bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. + /// Per-buffer AXI-MM channel selection, indexed by buffer position + /// modulo size (a single entry applies to every buffer). Default auto. 
+ std::vector mmChannels{MmChannel::Auto}; + uint64_t bufferSize = 512ULL * 1024ULL * 1024ULL; ///< Size of each test buffer. + uint64_t offset = 512ULL * 1024ULL * 1024ULL; ///< Distance between logical buffer positions. + uint64_t startingOffset = 0; ///< Offset from memory-space base for position 0. + bool placementExplicit = false; ///< True when any placement option was provided. + /// Raw-transfer NoC channel/memory placement strategy (raw modes only). + ChannelAllocation channelAllocation = ChannelAllocation::Auto; + /// Paired-mode byte distance between the two per-channel memory regions + /// (the NSU / pseudo-channel stride). Default 16 GiB == MEMORY_SPACE_SIZE/2, + /// which matches the dma-perf HBM offset_ch1-offset_ch0 spacing. + uint64_t channelRegionStride = 16ULL * 1024ULL * 1024ULL * 1024ULL; + /// Number of whole-buffer transfers per buffer in raw bandwidth phases. + uint64_t bandwidthIterations = 1; + /// Raw bandwidth phase duration in seconds. 0 means use fixed iterations. + double bandwidthDuration = 0.0; + /// Optional descriptor-ring size index for raw QDMA queue creation. + std::optional ringSizeIndex; }; /// @brief Executes the validate command. /// @param options Populated options struct. /// @return Exit code (0 on success). static int run(const Options& options); + + /// @brief Parse a byte-size option accepting bare values and k/K/m/M suffixes. + static uint64_t parseByteSizeOption(const std::string& text); + + /// @brief Parse an --mm-channel spec: a single auto|0|1 or a comma-separated list. 
+ static std::vector parseMmChannelSpec(const std::string& text); }; #endif // SMI_VALIDATE_HPP diff --git a/vrt/src/qdma/qdma_intf.cpp b/vrt/src/qdma/qdma_intf.cpp index 780fb766..66454c60 100644 --- a/vrt/src/qdma/qdma_intf.cpp +++ b/vrt/src/qdma/qdma_intf.cpp @@ -20,6 +20,8 @@ #include +#include + #include #include @@ -56,61 +58,41 @@ QdmaIntf::~QdmaIntf() { } } +namespace { +constexpr uint64_t kQdmaPage = 4096ULL; +inline uint64_t roundUpToPage(uint64_t v) { return (v + kQdmaPage - 1) & ~(kQdmaPage - 1); } +} // namespace + ssize_t QdmaIntf::write_from_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) { if (qpairFd < 0) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "QDMA streaming not initialized"); return -EIO; } - int fd = qpairFd; - ssize_t rc; - uint64_t count = 0; - char* buf = buffer; - off_t offset = base; - - do { /* Support zero byte transfer */ - uint64_t bytes = size - count; - - if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE; - - if (offset) { - rc = lseek(fd, offset, SEEK_SET); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not write to {}", fname); - return -EIO; - } - if (rc != offset) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not write to {}", fname); - return -EIO; - } - } - - /* write data to file from memory buffer */ - rc = write(fd, buf, bytes); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", - fname); - return -EIO; - } - if (rc != bytes) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", - fname); - return -EIO; - } - - count += bytes; - buf += bytes; - offset += bytes; - } while (count < size); - - if (count != size) { + if (size == 0) { + return 0; + } + + // The kernel buffer owns its DMA-mapped pages; stage the caller's data into + // the mapping, then transfer whole pages. 
+ const uint64_t aligned = roundUpToPage(size); + struct slash_qdma_buffer buf{}; + if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) { + utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, + "Could not create QDMA write buffer for {}", fname); + return -EIO; + } + std::memcpy(buf.addr, buffer, size); + + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned, + SLASH_QDMA_XFER_H2C); + (void)slash_qdma_buffer_destroy(&buf); + if (rc != (ssize_t)aligned) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", fname); return -EIO; } - return count; + return (ssize_t)size; } ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) { @@ -119,55 +101,30 @@ ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, "QDMA streaming not initialized"); return -EIO; } - int fd = qpairFd; - ssize_t rc; - uint64_t count = 0; - char* buf = buffer; - off_t offset = base; - - do { /* Support zero byte transfer */ - uint64_t bytes = size - count; - - if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE; - - if (offset) { - rc = lseek(fd, offset, SEEK_SET); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - if (rc != offset) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - } - - /* read data from file into memory buffer */ - rc = read(fd, buf, bytes); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - if (rc != bytes) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - - count += bytes; - buf += bytes; - offset += bytes; - } while (count < size); - - if (count != size) { + if (size == 0) { + return 0; + } + + const uint64_t aligned 
= roundUpToPage(size); + struct slash_qdma_buffer buf{}; + if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) { + utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, + "Could not create QDMA read buffer for {}", fname); + return -EIO; + } + + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned, + SLASH_QDMA_XFER_C2H); + if (rc == (ssize_t)aligned) { + std::memcpy(buffer, buf.addr, size); + } + (void)slash_qdma_buffer_destroy(&buf); + if (rc != (ssize_t)aligned) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not read from {}", fname); return -EIO; } - return count; + return (ssize_t)size; } void QdmaIntf::write_buff(char* buffer, uint64_t start_addr, uint64_t size) { diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h index 4749afd4..a543b815 100644 --- a/vrt/vrtd/include/vrtd/wire.h +++ b/vrt/vrtd/include/vrtd/wire.h @@ -276,8 +276,9 @@ struct vrtd_resp_qdma_qpair_op { /** * @brief Request a read/write file descriptor for a QDMA qpair. * - * The qpair FD is sent out-of-band via SCM_RIGHTS when - * @ref vrtd_resp_header::ret == VRTD_RET_OK. + * One or more qpair FDs are sent out-of-band via SCM_RIGHTS when + * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports the + * number of descriptors attached. */ struct vrtd_req_qdma_qpair_get_fd { uint32_t dev_number; ///< Device index (0-based). @@ -299,6 +300,7 @@ struct vrtd_req_buffer_open { uint32_t dev_number; ///< Device index (0-based). uint32_t alloc_type; ///< One of enum vrtd_alloc_type. uint32_t alloc_dir; ///< One of enum vrtd_alloc_dir. + uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel). uint64_t alloc_arg; ///< Allocation argument (HBM region index for HBM). uint64_t size; ///< Requested size in bytes. 
} __attribute__((packed)); @@ -306,6 +308,15 @@ struct vrtd_req_buffer_open { struct vrtd_resp_buffer_open { uint64_t size; ///< Allocated size in bytes (rounded up to subregion). uint64_t phys_addr; ///< Device physical address of the allocation. + /** + * Number of QDMA queue pairs (AXI-MM/NoC channels) owned by the single + * transfer FD sent via SCM_RIGHTS (1 or 2). When two qpairs are bound + * (an mm_channel == AUTO request), their qpair_index ordering is fixed: + * index 0 is pinned to channel 0 and index 1 to channel 1, so the client + * can apply the V80 placement policy deterministically. Exactly one FD is + * always sent regardless of this count. + */ + uint32_t qpair_count; } __attribute__((packed)); /** @@ -327,18 +338,25 @@ struct vrtd_resp_buffer_close { * Bypasses the allocator entirely — the caller is responsible for ensuring the * address is valid and not in use. Requires the @c raw-mem-access permission. * - * The qpair FD is sent out-of-band via SCM_RIGHTS when - * @ref vrtd_resp_header::ret == VRTD_RET_OK. + * A single transfer FD is sent out-of-band via SCM_RIGHTS when + * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports how + * many queue pairs (channels) that FD owns. */ struct vrtd_req_buffer_open_raw { uint32_t dev_number; ///< Device index (0-based). uint32_t alloc_dir; ///< One of enum vrtd_alloc_dir. + uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel). uint64_t phys_addr; ///< Caller-specified device physical address (bypasses allocator). uint64_t size; ///< Size in bytes. } __attribute__((packed)); struct vrtd_resp_buffer_open_raw { - uint8_t zero; ///< Placeholder; all data is carried via SCM_RIGHTS. + /** + * Number of QDMA queue pairs (channels) owned by the single transfer FD + * sent via SCM_RIGHTS (1 or 2). Same qpair_index-to-channel ordering as + * @ref vrtd_resp_buffer_open. 
+ */ + uint32_t qpair_count; } __attribute__((packed)); /** diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index 76cf2541..0e6ce0ec 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -54,6 +54,18 @@ extern "C" { struct vrtd_buffer; +/** + * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. + * + * Sent to the daemon, which forwards it to the SLASH driver's qpair-add ioctl + * (the values mirror enum slash_qdma_mm_channel). + */ +enum vrtd_mm_channel { + VRTD_MM_CHANNEL_AUTO = 0, ///< Stripe across channels by (qid & 1). + VRTD_MM_CHANNEL_0 = 1, ///< Pin to AXI-MM/NoC channel 0. + VRTD_MM_CHANNEL_1 = 2, ///< Pin to AXI-MM/NoC channel 1. +}; + /** * @brief Connect to the vrtd UNIX domain socket. @@ -291,9 +303,10 @@ enum vrtd_ret vrtd_qdma_qpair_del( ); /** - * @brief Obtain a read/write file descriptor for a QDMA qpair. + * @brief Obtain an ioctl-only file descriptor for a QDMA qpair. * - * The descriptor can be used with read()/write() for C2H/H2C data transfer. + * The descriptor can be used with registered-buffer transfer ioctls for + * C2H/H2C data transfer. * * @param fd Connected vrtd socket file descriptor. * @param dev Device index (0‑based). @@ -325,6 +338,7 @@ enum vrtd_ret vrtd_qdma_qpair_get_fd( * @param alloc_dir QDMA direction (one of enum vrtd_alloc_dir). * @param alloc_arg Allocation argument (HBM region index for HBM). * @param size_in Requested size in bytes. + * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). * @param buffer_out Output pointer to receive the allocated buffer handle. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. 
@@ -338,6 +352,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_mm_channel mm_channel, struct vrtd_buffer **buffer_out ); @@ -352,6 +367,7 @@ enum vrtd_ret vrtd_buffer_open( * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. * @param alloc_dir One of #vrtd_alloc_dir. + * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). * @param buffer_out Output parameter set to the new buffer handle on success. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. @@ -364,6 +380,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_mm_channel mm_channel, struct vrtd_buffer **buffer_out ); @@ -514,8 +531,17 @@ struct vrtd_buffer { uint64_t size; uint64_t phys_addr; + /* Single transfer fd that owns @qpair_count queue pairs (channels). */ int qpair_fd; + /* Number of queue pairs (channels) the fd owns; selects 1- or 2-way split. */ + uint32_t qpair_count; + /* Kernel-owned DMA buffer fd backing @buf (from slash_qdma_qpair_buffer_create). */ + int buffer_fd; + enum slash_qdma_transfer_hint transfer_hint; + /* CPU mapping of the kernel buffer (mmap of @buffer_fd). */ void *buf; + /* Internal DMA granule for the host mapping (4 KiB base pages). */ + uint64_t transfer_step_size; }; enum vrtd_ret vrtd_buffer_create_raw( @@ -527,9 +553,25 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t size, uint64_t phys_addr, int qpair_fd, + uint32_t qpair_count, struct vrtd_buffer **buffer_out ); +/** + * @brief Synchronize a range from the local host buffer to the device. + * + * The requested range may be smaller than the QDMA transfer granule. libvrtd + * handles any required internal alignment. 
Bidirectional buffers preserve + * device bytes outside the requested range with an internal read-modify-write; + * host-to-device-only buffers keep the historical behavior of expanding the + * transfer to the backing DMA granule. + */ +enum vrtd_ret vrtd_buffer_sync_to_device( + struct vrtd_buffer *buffer, + uint64_t offset, + uint64_t size +); + /** * @brief Destroy a local buffer handle. * @@ -540,12 +582,13 @@ enum vrtd_ret vrtd_buffer_destroy( struct vrtd_buffer *buffer ); -enum vrtd_ret vrtd_buffer_sync_to_device( - struct vrtd_buffer *buffer, - uint64_t offset, - uint64_t size -); - +/** + * @brief Synchronize a range from the device into the local host buffer. + * + * The requested range may be smaller than the QDMA transfer granule. libvrtd + * handles any required internal alignment and preserves bytes outside the + * requested host range. + */ enum vrtd_ret vrtd_buffer_sync_from_device( struct vrtd_buffer *buffer, uint64_t offset, diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index b810de2c..87573074 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -24,14 +24,14 @@ * DMA buffer lifecycle management for the vrtd C client library. * * Buffers are host-side memory regions used for DMA transfers to/from - * the FPGA. Each buffer is backed by an anonymous mmap (preferring - * 2 MB hugepages for TLB efficiency, with automatic fallback to - * regular pages) and associated with a QDMA queue pair fd for - * performing the actual H2C / C2H transfers. + * the FPGA. Each buffer is backed by an anonymous mmap of 4 KiB base pages + * (transparent hugepages disabled) and associated with a QDMA queue pair fd + * for performing the actual H2C / C2H transfers. * - * Sync operations (sync_to_device / sync_from_device) transfer data - * between the host buffer and FPGA memory in TRANSFER_STEP_SIZE (4 KB) - * chunks using positional I/O on the QDMA qpair fd. 
+ * Sync operations (sync_to_device / sync_from_device) accept arbitrary + * in-buffer ranges. Internally, the QDMA fd requires page-aligned transfer + * ranges, so libvrtd expands partial requests to the mapping granule and uses + * a staging buffer when needed to preserve host-side partial-range semantics. * * Buffer lifecycle: * 1. vrtd_buffer_open() -- daemon allocates, returns qpair fd @@ -44,25 +44,179 @@ #include +#include + +#include "v80_policy.h" + #include #include +#include +#include #include #include +#include #include +#include +#include #include #include -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 +#define BASE_TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K + +/* + * Per-sync timing instrumentation. + * + * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with + * -DSLASH_QDMA_TIMING=1), the sync_to/from_device paths log the wall-clock + * cost of each transfer ioctl plus the aggregate per-sync time and + * effective bandwidth. This is the userspace counterpart to the kernel's + * SLASH_QDMA_TIMING breakdown. + */ +#ifndef SLASH_QDMA_TIMING +#define SLASH_QDMA_TIMING 0 #endif -#ifndef MAP_HUGE_2MB -#define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT) +#if SLASH_QDMA_TIMING +static inline uint64_t vrtd_now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} #endif -#define TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K +/* + * Issue a buffer transfer of [offset, offset + size) as a single batched ioctl + * per round, fanning the range across the fd's queue pairs (channels) according + * to the placement policy so both NoC channels run concurrently in-kernel. + * + * The QDMA transfer descriptor's length is a 32-bit byte count, so each + * segment is chunked to stay within that limit while preserving step alignment; + * every chunk round issues one ioctl covering all active channels. 
+ */ +static int vrtd_transfer_registered( + int qpair_fd, + uint32_t qpair_count, + enum slash_qdma_transfer_hint transfer_hint, + int buf_fd, + uint64_t phys_addr, + uint64_t offset, + uint64_t size, + uint64_t step, + bool to_device +) { + uint32_t direction = to_device ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; + + if (size == 0) { + return 0; + } + + if (qpair_fd < 0 || qpair_count == 0) { + return -EINVAL; + } + + if (step == 0 || (offset % step) != 0 || (size % step) != 0) { + return -EINVAL; + } + + /* + * Decide how the transfer maps onto the available queue pairs. V80 applies + * the placement-aware policy (DDR halved, HBM routed by the half-memory + * boundary); any other hint keeps everything on the primary qpair. + */ + struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t nseg; + + if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_V80) { + nseg = vrtd_plan_v80(phys_addr, offset, size, step, qpair_count, segs); + } else { + segs[0].qpair_index = 0; + segs[0].offset = offset; + segs[0].size = size; + nseg = 1; + } + + /* Clamp any planned qpair_index to the qpairs the fd actually owns. */ + for (uint32_t i = 0; i < nseg; ++i) { + if (segs[i].qpair_index >= qpair_count) { + segs[i].qpair_index = 0; + } + } + + /* Per-channel descriptor length is 32-bit; keep chunks step-aligned. */ + uint64_t max_chunk = 0xFFFFF000ULL; + max_chunk -= max_chunk % step; + if (max_chunk == 0) { + return -EINVAL; + } + + uint64_t done[VRTD_V80_MAX_SEGS] = {0}; + for (;;) { + struct slash_qdma_subxfer xfers[VRTD_V80_MAX_SEGS]; + uint32_t map_seg[VRTD_V80_MAX_SEGS]; + uint32_t count = 0; + + for (uint32_t i = 0; i < nseg; ++i) { + uint64_t remaining = segs[i].size - done[i]; + uint64_t chunk; + uint64_t xfer_offset; + + if (remaining == 0) { + continue; + } + chunk = remaining > max_chunk ? 
max_chunk : remaining; + xfer_offset = segs[i].offset + done[i]; + + memset(&xfers[count], 0, sizeof(xfers[count])); + xfers[count].qpair_index = segs[i].qpair_index; + xfers[count].direction = direction; + xfers[count].buf_fd = buf_fd; + xfers[count].buf_offset = xfer_offset; + xfers[count].dev_addr = phys_addr + xfer_offset; + xfers[count].length = chunk; + map_seg[count] = i; + count++; + } + + if (count == 0) { + break; + } + + ssize_t ret = slash_qdma_qpair_transfer_batch(qpair_fd, xfers, count); + if (ret < 0) { + return -EIO; + } + + for (uint32_t c = 0; c < count; ++c) { + done[map_seg[c]] += xfers[c].length; + } + } + + return 0; +} + +/* + * Transfer [0, size) of a separate kernel buffer (@bounce) against the device + * starting at @phys_addr. Used for partial-range read-modify-write staging. + */ +static int vrtd_bounce_transfer( + const struct vrtd_buffer *buffer, + const struct slash_qdma_buffer *bounce, + uint64_t phys_addr, + uint64_t size, + bool to_device +) { + if (buffer == NULL || bounce == NULL || buffer->qpair_count == 0 || + buffer->qpair_fd < 0) { + return -EINVAL; + } + + return vrtd_transfer_registered(buffer->qpair_fd, buffer->qpair_count, + buffer->transfer_hint, bounce->fd, + phys_addr, 0, size, + BASE_TRANSFER_STEP_SIZE, to_device); +} enum vrtd_ret vrtd_buffer_create_raw( int sock_fd, @@ -73,6 +227,7 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t size, uint64_t phys_addr, int qpair_fd, + uint32_t qpair_count, struct vrtd_buffer **buffer_out ) { if (buffer_out == NULL) { @@ -84,31 +239,44 @@ enum vrtd_ret vrtd_buffer_create_raw( return VRTD_RET_INTERNAL_ERROR; } - buffer->buf = mmap( - NULL, /* address (let the kernel choose) */ - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, - -1, /* fd */ - 0 /* offset */ - ); - if (buffer->buf == MAP_FAILED) { - // Huge pages are an optimization, not a hard requirement. 
- // Fall back to normal anonymous mapping when hugepage mmap fails. - buffer->buf = mmap( - NULL, /* address (let the kernel choose) */ - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, - -1, /* fd */ - 0 /* offset */ - ); - if (buffer->buf == MAP_FAILED) { - free(buffer); - return VRTD_RET_INTERNAL_ERROR; - } + buffer->buf = NULL; + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; + buffer->qpair_fd = -1; + buffer->qpair_count = 0; + buffer->buffer_fd = -1; + buffer->transfer_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; + + if (qpair_fd < 0 || qpair_count == 0 || qpair_count > 2) { + free(buffer); + return VRTD_RET_BAD_LIB_CALL; } + /* + * The kernel owns the DMA buffer: it allocates 4 KiB base pages, builds the + * SGL, and DMA-maps everything once at create time, then hands back a + * mappable fd. We mmap that fd for CPU access (buffer->buf). + */ + struct slash_qdma_buffer sbuf; + memset(&sbuf, 0, sizeof(sbuf)); + if (slash_qdma_qpair_buffer_create(qpair_fd, size, &sbuf) != 0) { + free(buffer); + return VRTD_RET_INTERNAL_ERROR; + } + + buffer->buf = sbuf.addr; + buffer->buffer_fd = sbuf.fd; + buffer->transfer_hint = sbuf.transfer_hint; + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; +#if SLASH_QDMA_TIMING + syslog( + LOG_INFO, + "libvrtd: buffer kernel mapping size=%llu phys_addr=0x%llx step=%llu", + (unsigned long long)size, + (unsigned long long)phys_addr, + (unsigned long long)buffer->transfer_step_size + ); +#endif + buffer->sock_fd = sock_fd; buffer->dev = dev; buffer->alloc_type = alloc_type; @@ -116,13 +284,69 @@ enum vrtd_ret vrtd_buffer_create_raw( buffer->alloc_arg = alloc_arg; buffer->size = size; buffer->phys_addr = phys_addr; - buffer->qpair_fd = qpair_fd; + buffer->qpair_fd = qpair_fd; + buffer->qpair_count = qpair_count; *buffer_out = buffer; return VRTD_RET_OK; } +static enum vrtd_ret vrtd_buffer_prepare_sync_range( + const struct vrtd_buffer *buffer, + uint64_t offset, + uint64_t size, + uint64_t 
*aligned_offset_out, + uint64_t *aligned_size_out, + bool *needs_bounce_out +) { + uint64_t step; + uint64_t end; + uint64_t aligned_offset; + uint64_t aligned_end; + + if (buffer == NULL || aligned_offset_out == NULL || + aligned_size_out == NULL || needs_bounce_out == NULL) { + return VRTD_RET_BAD_LIB_CALL; + } + + step = buffer->transfer_step_size; + if (step == 0) { + return VRTD_RET_INVALID_ARGUMENT; + } + + if (offset > buffer->size || size > buffer->size - offset) { + return VRTD_RET_INVALID_ARGUMENT; + } + + if (size == 0) { + *aligned_offset_out = offset; + *aligned_size_out = 0; + *needs_bounce_out = false; + return VRTD_RET_OK; + } + + if ((buffer->size % step) != 0 || (buffer->phys_addr % step) != 0) { + return VRTD_RET_INVALID_ARGUMENT; + } + + end = offset + size; + aligned_offset = offset - (offset % step); + if (end > UINT64_MAX - (step - 1)) { + return VRTD_RET_INVALID_ARGUMENT; + } + aligned_end = ((end + step - 1) / step) * step; + if (aligned_end > buffer->size) { + return VRTD_RET_INVALID_ARGUMENT; + } + + *aligned_offset_out = aligned_offset; + *aligned_size_out = aligned_end - aligned_offset; + *needs_bounce_out = (aligned_offset != offset || aligned_end != end); + + return VRTD_RET_OK; +} + enum vrtd_ret vrtd_buffer_destroy( struct vrtd_buffer *buffer ) { @@ -130,12 +354,19 @@ enum vrtd_ret vrtd_buffer_destroy( return VRTD_RET_BAD_LIB_CALL; } - if (buffer->qpair_fd >= 0) { - (void) close(buffer->qpair_fd); + if (buffer->buf != NULL && buffer->size != 0) { + (void) munmap(buffer->buf, buffer->size); + buffer->buf = NULL; } - if (buffer->buf != NULL) { - (void) munmap(buffer->buf, buffer->size); + if (buffer->buffer_fd >= 0) { + (void) close(buffer->buffer_fd); + buffer->buffer_fd = -1; + } + + if (buffer->qpair_fd >= 0) { + (void) close(buffer->qpair_fd); + buffer->qpair_fd = -1; } free(buffer); @@ -189,31 +420,77 @@ enum vrtd_ret vrtd_buffer_sync_to_device( return VRTD_RET_INVALID_ARGUMENT; } + assert(buffer->qpair_count > 0); 
assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); - assert(buffer->size % TRANSFER_STEP_SIZE == 0); - assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0); + uint64_t aligned_offset = 0; + uint64_t aligned_size = 0; + bool needs_bounce = false; + enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range( + buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce); + if (range_ret != VRTD_RET_OK) { + return range_ret; + } + if (aligned_size == 0) { + return VRTD_RET_OK; + } - uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE); - uint64_t end_offset = offset + size; + uint64_t step = buffer->transfer_step_size; +#if SLASH_QDMA_TIMING + uint64_t sync_start_ns = vrtd_now_ns(); +#endif - off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET); - if (ret == -1) { + int transfer_ret; + if (needs_bounce && buffer->alloc_dir == VRTD_ALLOC_DIR_BIDIRECTIONAL) { + struct slash_qdma_buffer bounce; + memset(&bounce, 0, sizeof(bounce)); + if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size, + &bounce) != 0) { + return VRTD_RET_INTERNAL_ERROR; + } + + transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, + aligned_size, false); + if (transfer_ret == 0) { + memcpy( + (uint8_t *)bounce.addr + (offset - aligned_offset), + (uint8_t *)buffer->buf + offset, + size + ); + transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, + aligned_size, true); + } + (void) slash_qdma_buffer_destroy(&bounce); + } else { + /* + * Host-to-device-only buffers cannot read the surrounding device + * granule for a read-modify-write, so keep the historical behavior: + * expand partial syncs to the backing DMA granule. 
+ */ + transfer_ret = vrtd_transfer_registered( + buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint, + buffer->buffer_fd, buffer->phys_addr, + aligned_offset, aligned_size, step, true); + } + if (transfer_ret != 0) { return VRTD_RET_INTERNAL_ERROR; } - for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) { - ssize_t bytes_written = 0; - while (bytes_written < TRANSFER_STEP_SIZE) { - ssize_t bw = write(buffer->qpair_fd, - (uint8_t *) buffer->buf + curr_offset + bytes_written, - TRANSFER_STEP_SIZE - bytes_written); - if (bw == -1) { - return VRTD_RET_INTERNAL_ERROR; - } - bytes_written += bw; - } +#if SLASH_QDMA_TIMING + { + uint64_t total_ns = vrtd_now_ns() - sync_start_ns; + double mb = (double) size / (1024.0 * 1024.0); + double sec = (double) total_ns / 1e9; + syslog(LOG_INFO, + "libvrtd: timing H2C sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)", + (unsigned long long) offset, (unsigned long long) size, + (unsigned long long) aligned_offset, (unsigned long long) aligned_size, + (unsigned long long) step, (unsigned long long) total_ns, + sec > 0.0 ? 
mb / sec : 0.0); } +#endif return VRTD_RET_OK; } @@ -231,31 +508,69 @@ enum vrtd_ret vrtd_buffer_sync_from_device( return VRTD_RET_INVALID_ARGUMENT; } + assert(buffer->qpair_count > 0); assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); - assert(buffer->size % TRANSFER_STEP_SIZE == 0); - assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0); + uint64_t aligned_offset = 0; + uint64_t aligned_size = 0; + bool needs_bounce = false; + enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range( + buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce); + if (range_ret != VRTD_RET_OK) { + return range_ret; + } + if (aligned_size == 0) { + return VRTD_RET_OK; + } + + uint64_t step = buffer->transfer_step_size; +#if SLASH_QDMA_TIMING + uint64_t sync_start_ns = vrtd_now_ns(); +#endif - uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE); - uint64_t end_offset = offset + size; + int transfer_ret; + if (needs_bounce) { + struct slash_qdma_buffer bounce; + memset(&bounce, 0, sizeof(bounce)); + if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size, + &bounce) != 0) { + return VRTD_RET_INTERNAL_ERROR; + } - off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET); - if (ret == -1) { + transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, + aligned_size, false); + if (transfer_ret == 0) { + memcpy( + (uint8_t *)buffer->buf + offset, + (uint8_t *)bounce.addr + (offset - aligned_offset), + size + ); + } + (void) slash_qdma_buffer_destroy(&bounce); + } else { + transfer_ret = vrtd_transfer_registered( + buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint, + buffer->buffer_fd, buffer->phys_addr, + aligned_offset, aligned_size, step, false); + } + if (transfer_ret != 0) { return VRTD_RET_INTERNAL_ERROR; } - for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) { - ssize_t bytes_read = 0; - while (bytes_read < 
TRANSFER_STEP_SIZE) { - ssize_t br = read(buffer->qpair_fd, - (uint8_t *) buffer->buf + curr_offset + bytes_read, - TRANSFER_STEP_SIZE - bytes_read); - if (br == -1) { - return VRTD_RET_INTERNAL_ERROR; - } - bytes_read += br; - } +#if SLASH_QDMA_TIMING + { + uint64_t total_ns = vrtd_now_ns() - sync_start_ns; + double mb = (double) size / (1024.0 * 1024.0); + double sec = (double) total_ns / 1e9; + syslog(LOG_INFO, + "libvrtd: timing C2H sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)", + (unsigned long long) offset, (unsigned long long) size, + (unsigned long long) aligned_offset, (unsigned long long) aligned_size, + (unsigned long long) step, (unsigned long long) total_ns, + sec > 0.0 ? mb / sec : 0.0); } +#endif return VRTD_RET_OK; } diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index b03c863a..d56c2a47 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -56,13 +56,13 @@ #include /** - * vrtd_recv_response() - Receive a response message from the daemon. + * vrtd_recv_response_fds() - Receive a response message from the daemon. * @fd: Connection socket. * @resp_body_buf: Buffer for the response body (may be NULL if no body expected). * @resp_bufsz: Size of @resp_body_buf. - * @resp_fd: If non-NULL, receives an out-of-band file descriptor - * sent by the daemon via SCM_RIGHTS (e.g. a BAR fd or - * QDMA qpair fd). Set to -1 if no fd was received. + * @resp_fds: Optional array receiving out-of-band file descriptors. + * @max_resp_fds: Capacity of @resp_fds. + * @resp_fd_count: Optional output count of received fds. * * Uses recvmsg() with scatter-gather I/O: the header and body are read * into separate buffers in a single system call. MSG_CMSG_CLOEXEC @@ -70,11 +70,13 @@ * * Return: VRTD_RET_OK on success, or an error code. 
*/ -static enum vrtd_ret vrtd_recv_response( +static enum vrtd_ret vrtd_recv_response_fds( int fd, void *resp_body_buf, size_t resp_bufsz, - int *resp_fd + int *resp_fds, + uint32_t max_resp_fds, + uint32_t *resp_fd_count ) { struct vrtd_resp_header rh = {0}; @@ -85,16 +87,21 @@ static enum vrtd_ret vrtd_recv_response( riov[1].iov_base = resp_body_buf; riov[1].iov_len = resp_bufsz; - char cbuf[CMSG_SPACE(sizeof(int))]; + char cbuf[CMSG_SPACE(2 * sizeof(int))]; struct msghdr rmsg = { .msg_iov = riov, .msg_iovlen = resp_bufsz ? 2 : 1, - .msg_control = resp_fd ? cbuf : NULL, - .msg_controllen = resp_fd ? sizeof(cbuf) : 0, + .msg_control = resp_fds ? cbuf : NULL, + .msg_controllen = resp_fds ? sizeof(cbuf) : 0, }; - if (resp_fd) { - *resp_fd = -1; + if (resp_fd_count) { + *resp_fd_count = 0; + } + if (resp_fds) { + for (uint32_t i = 0; i < max_resp_fds; ++i) { + resp_fds[i] = -1; + } } ssize_t rn = recvmsg(fd, &rmsg, MSG_CMSG_CLOEXEC); @@ -118,11 +125,19 @@ static enum vrtd_ret vrtd_recv_response( return VRTD_RET_BAD_CONN; } - /* Extract file descriptor from SCM_RIGHTS ancillary data, if any. */ + /* Extract file descriptors from SCM_RIGHTS ancillary data, if any. 
*/ for (struct cmsghdr *c = CMSG_FIRSTHDR(&rmsg); c != NULL; c = CMSG_NXTHDR(&rmsg, c)) { if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS && c->cmsg_len >= CMSG_LEN(sizeof(int))) { - assert(resp_fd != NULL); - memcpy(resp_fd, CMSG_DATA(c), sizeof(int)); + assert(resp_fds != NULL); + size_t payload = c->cmsg_len - CMSG_LEN(0); + uint32_t n = (uint32_t)(payload / sizeof(int)); + if (n > max_resp_fds) { + n = max_resp_fds; + } + memcpy(resp_fds, CMSG_DATA(c), n * sizeof(int)); + if (resp_fd_count) { + *resp_fd_count = n; + } break; } } @@ -130,6 +145,22 @@ static enum vrtd_ret vrtd_recv_response( return (enum vrtd_ret) rh.ret; } +static enum vrtd_ret vrtd_recv_response( + int fd, + void *resp_body_buf, + size_t resp_bufsz, + int *resp_fd +) +{ + uint32_t count = 0; + enum vrtd_ret ret = vrtd_recv_response_fds( + fd, resp_body_buf, resp_bufsz, resp_fd, resp_fd ? 1u : 0u, &count); + if (resp_fd && count == 0) { + *resp_fd = -1; + } + return ret; +} + int vrtd_connect(const char *path) { if (path == NULL) { @@ -232,6 +263,60 @@ enum vrtd_ret vrtd_raw_request( return vrtd_recv_response(fd, resp_body_buf, resp_bufsz, resp_fd); } +static enum vrtd_ret vrtd_raw_request_fds( + int fd, + uint16_t opcode, + const void *req_body, uint16_t req_size, + void *resp_body_buf, size_t resp_bufsz, + int *resp_fds, uint32_t max_resp_fds, uint32_t *resp_fd_count, + const int *req_fd +) +{ + if (req_size > VRTD_MSG_MAX_SIZE - sizeof(struct vrtd_req_header)) { errno = EMSGSIZE; return -1; } + + struct vrtd_req_header h = { + .size = req_size, + .opcode= opcode, + .seqno = 1, + }; + + struct iovec siov[2]; + siov[0].iov_base = &h; + siov[0].iov_len = sizeof(h); + siov[1].iov_base = (void*) req_body; + siov[1].iov_len = req_size; + + char cbuf[CMSG_SPACE(sizeof(int))]; + struct msghdr smsg = { + .msg_iov = siov, + .msg_iovlen = req_size ? 
2 : 1, + .msg_control = NULL, + .msg_controllen = 0, + }; + + if (req_fd && *req_fd >= 0) { + smsg.msg_control = cbuf; + smsg.msg_controllen = sizeof(cbuf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&smsg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), req_fd, sizeof(int)); + } + + ssize_t sn = sendmsg(fd, &smsg, MSG_NOSIGNAL); + if (sn == -1) { + return VRTD_RET_BAD_CONN; + } + if ((size_t) sn != sizeof(h) + req_size) { + return VRTD_RET_BAD_CONN; + } + + return vrtd_recv_response_fds(fd, resp_body_buf, resp_bufsz, + resp_fds, max_resp_fds, resp_fd_count); +} + enum vrtd_ret vrtd_get_num_devices(int fd, uint32_t *out) { @@ -468,6 +553,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_mm_channel mm_channel, struct vrtd_buffer **buffer_out ) { @@ -480,21 +566,28 @@ enum vrtd_ret vrtd_buffer_open( .dev_number = dev, .alloc_type = alloc_type, .alloc_dir = alloc_dir, + .mm_channel = mm_channel, .alloc_arg = alloc_arg, .size = size_in, }; struct vrtd_resp_buffer_open resp = {0}; + /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. 
*/ int qpair_fd = -1; - int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN, - &req, sizeof(req), - &resp, sizeof(resp), - &qpair_fd, NULL); + uint32_t fd_count = 0; + int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN, + &req, sizeof(req), + &resp, sizeof(resp), + &qpair_fd, 1, &fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd < 0) { + if (fd_count != 1 || qpair_fd < 0 || + resp.qpair_count == 0 || resp.qpair_count > 2) { + if (qpair_fd >= 0) { + (void) close(qpair_fd); + } return VRTD_RET_INTERNAL_ERROR; } @@ -507,6 +600,7 @@ enum vrtd_ret vrtd_buffer_open( resp.size, resp.phys_addr, qpair_fd, + resp.qpair_count, buffer_out ); if (ret != VRTD_RET_OK) { @@ -523,6 +617,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_mm_channel mm_channel, struct vrtd_buffer **buffer_out ) { @@ -534,21 +629,28 @@ enum vrtd_ret vrtd_buffer_open_raw( struct vrtd_req_buffer_open_raw req = { .dev_number = dev, .alloc_dir = alloc_dir, + .mm_channel = mm_channel, .phys_addr = phys_addr, .size = size, }; struct vrtd_resp_buffer_open_raw resp = {0}; + /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. 
*/ int qpair_fd = -1; - int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN_RAW, - &req, sizeof(req), - &resp, sizeof(resp), - &qpair_fd, NULL); + uint32_t fd_count = 0; + int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN_RAW, + &req, sizeof(req), + &resp, sizeof(resp), + &qpair_fd, 1, &fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd < 0) { + if (fd_count != 1 || qpair_fd < 0 || + resp.qpair_count == 0 || resp.qpair_count > 2) { + if (qpair_fd >= 0) { + (void) close(qpair_fd); + } return VRTD_RET_INTERNAL_ERROR; } @@ -561,6 +663,7 @@ enum vrtd_ret vrtd_buffer_open_raw( size, phys_addr, qpair_fd, + resp.qpair_count, buffer_out ); if (ret != VRTD_RET_OK) { diff --git a/vrt/vrtd/libvrtd/src/v80_policy.h b/vrt/vrtd/libvrtd/src/v80_policy.h new file mode 100644 index 00000000..2205aacf --- /dev/null +++ b/vrt/vrtd/libvrtd/src/v80_policy.h @@ -0,0 +1,144 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @file v80_policy.h + * @brief Client-side V80 placement-aware channel policy for QDMA transfers. + * + * The kernel returns the opaque SLASH_QDMA_TRANSFER_HINT_V80 marker on buffer + * registration; the actual decision of how to spread a transfer across the + * available QDMA queues lives here, where the buffer's device address is known. + * + * On the V80 a transfer takes two independent NoC paths: the host-side ingress + * master (NMU) is chosen by the queue's mm-channel, while the memory-side + * egress endpoint (NSU / HBM pseudo-channel) is chosen by the device address. + * Sustaining both NMUs requires also spreading across two NSUs. The policy: + * + * - DDR (single NSU): split the range in half so both NMUs stay busy. + * - HBM below the 16 GiB half-boundary: channel 0 only. + * - HBM at/above the half-boundary: channel 1 only. + * - HBM spanning the boundary: split there (below -> ch0, above -> ch1). + * + * The qpair-to-channel mapping is the wire contract from vrtd: qpair_index 0 is + * pinned to channel 0 and qpair_index 1 to channel 1 (see vrtd_resp_buffer_open). + */ + +#ifndef VRTD_V80_POLICY_H +#define VRTD_V80_POLICY_H + +#include +#include + +/* + * V80 device-memory geometry (mirrors vrt/vrtd/src/allocator.h and the + * memory-model docs). HBM and DDR are each 64 x 512 MiB = 32 GiB; the HBM + * half-boundary at +16 GiB separates the two NoC slave-unit (NSU) regions. 
+ */ +#define VRTD_V80_HBM_BASE 0x4000000000ULL +#define VRTD_V80_HBM_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL) +#define VRTD_V80_HBM_HALF (VRTD_V80_HBM_SIZE / 2ULL) +#define VRTD_V80_DDR_BASE 0x60000000000ULL +#define VRTD_V80_DDR_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL) + +/** @brief Maximum segments a transfer is split into (one per mm-channel). */ +#define VRTD_V80_MAX_SEGS 2u + +/** @brief One contiguous sub-transfer routed to a specific qpair. */ +struct vrtd_xfer_seg { + uint32_t qpair_index; /**< Index into the fd's bound qpairs (== channel). */ + uint64_t offset; /**< Buffer-relative byte offset. */ + uint64_t size; /**< Byte count. */ +}; + +/** + * @brief Compute the V80 transfer plan for a buffer range. + * + * Plans the transfer of [@p offset, @p offset + @p size) within a buffer based + * at device address @p phys_addr across @p qpair_count available queue pairs + * (qpair_index 0 == channel 0, qpair_index 1 == channel 1). Split points are + * aligned down to @p step so every emitted segment stays page-aligned. With + * fewer than two queue pairs (or a zero step) the whole range is assigned to + * qpair_index 0. + * + * @param phys_addr Device base address of the buffer. + * @param offset Buffer-relative start of the transfer. + * @param size Transfer length in bytes (assumed a multiple of step). + * @param step Transfer/page granule used to align split points. + * @param qpair_count Number of available queue pairs (1 or 2). + * @param segs [out] Receives up to VRTD_V80_MAX_SEGS segments. + * @return Number of segments written to @p segs (1 or 2). 
+ */ +static inline uint32_t vrtd_plan_v80(uint64_t phys_addr, + uint64_t offset, + uint64_t size, + uint64_t step, + uint32_t qpair_count, + struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]) +{ + if (qpair_count < 2u || step == 0u) { + segs[0].qpair_index = 0u; + segs[0].offset = offset; + segs[0].size = size; + return 1u; + } + + uint64_t start = phys_addr + offset; + uint64_t end = start + size; + bool is_ddr = (phys_addr >= VRTD_V80_DDR_BASE && + phys_addr < VRTD_V80_DDR_BASE + VRTD_V80_DDR_SIZE); + + uint64_t lo_len; + if (is_ddr) { + /* Single DDR NSU: just split the range to drive both NMUs. */ + lo_len = size / 2u; + } else { + /* HBM: route by the 16 GiB half-memory boundary (NSU split). */ + uint64_t boundary = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF; + if (end <= boundary) { + lo_len = size; /* entirely in the lower half -> ch0 */ + } else if (start >= boundary) { + segs[0].qpair_index = 1u; /* entirely in the upper half -> ch1 */ + segs[0].offset = offset; + segs[0].size = size; + return 1u; + } else { + lo_len = boundary - start; /* spans the boundary */ + } + } + + lo_len -= lo_len % step; /* keep both segments page-aligned */ + + if (lo_len == 0u || lo_len >= size) { + segs[0].qpair_index = 0u; + segs[0].offset = offset; + segs[0].size = size; + return 1u; + } + + segs[0].qpair_index = 0u; + segs[0].offset = offset; + segs[0].size = lo_len; + segs[1].qpair_index = 1u; + segs[1].offset = offset + lo_len; + segs[1].size = size - lo_len; + return 2u; +} + +#endif /* VRTD_V80_POLICY_H */ diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp index 8748d379..b2569d91 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp @@ -48,6 +48,18 @@ enum class BufferAllocDir : uint32_t { DeviceToHost = VRTD_ALLOC_DIR_DEVICE_TO_HOST, }; +/** + * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. + * + * Mirrors @c vrtd_mm_channel (values must stay in sync). 
@c Auto stripes across + * channels by (qid & 1); @c Ch0 / @c Ch1 pin to a single channel. + */ +enum class MmChannel : uint32_t { + Auto = 0, ///< Stripe across channels by (qid & 1). + Ch0 = 1, ///< Pin to AXI-MM/NoC channel 0. + Ch1 = 2, ///< Pin to AXI-MM/NoC channel 1. +}; + /** * @brief RAII wrapper for a vrtd buffer allocation. * diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp index 8123a7f1..5a220075 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp @@ -158,19 +158,22 @@ class Device { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ Buffer openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg = 0, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const; + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto) const; /** * @brief Convenience helper for DDR allocations. 
*/ - Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::Ddr, size, 0, allocDir); + Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, mmChannel); } /** @@ -178,16 +181,18 @@ class Device { */ Buffer openHbmBuffer(uint32_t region, uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::Hbm, size, region, allocDir); + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::Hbm, size, region, allocDir, mmChannel); } /** * @brief Convenience helper for HBM VNOC allocations. */ Buffer openHbmVnocBuffer(uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir); + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, mmChannel); } /** @@ -199,12 +204,14 @@ class Device { * @param phys_addr Device physical address. * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ Buffer openRawBuffer(uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const; + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto) const; /** * @brief Perform a PCIe hotplug operation for this device. 
@@ -351,8 +358,8 @@ class Device { uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -370,8 +377,8 @@ class Device { std::function fGetBar; std::function fCreateQdmaQpair; - std::function fOpenBuffer; - std::function fOpenBufferRaw; + std::function fOpenBuffer; + std::function fOpenBufferRaw; std::function fHotplugOp; std::function fDesignWrite; std::function fDesignWriteFile; diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp index e2dbbac2..422160c9 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp @@ -190,6 +190,7 @@ class Session { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection for the queue pair. * @return An owning @c Buffer. * @throws vrtd::Error on error. */ @@ -198,7 +199,8 @@ class Session { BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir + BufferAllocDir allocDir, + MmChannel mmChannel ) const; /** @@ -208,6 +210,7 @@ class Session { * @param phys_addr Caller-specified device physical address (bypasses allocator). * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection for the queue pair. * @return An owning @c Buffer. * @throws vrtd::Error on error. 
*/ @@ -215,7 +218,8 @@ class Session { const Device& device, uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir + BufferAllocDir allocDir, + MmChannel mmChannel ) const; /** diff --git a/vrt/vrtd/libvrtdpp/src/buffer.cpp b/vrt/vrtd/libvrtdpp/src/buffer.cpp index 0c68ff22..756170a6 100644 --- a/vrt/vrtd/libvrtdpp/src/buffer.cpp +++ b/vrt/vrtd/libvrtdpp/src/buffer.cpp @@ -151,24 +151,12 @@ bool Buffer::isClosed() const noexcept std::fstream Buffer::fstream(std::ios_base::openmode mode) const { + (void)mode; if (isClosed()) { throw std::runtime_error("Buffer is closed"); } - int fd = getFd(); - if (fd < 0) { - throw std::runtime_error("Buffer FD is invalid"); - } - - std::string path = "/proc/self/fd/" + std::to_string(fd); - - std::fstream stream; - stream.open(path, mode); - if (!stream.is_open()) { - throw std::runtime_error("Failed to open fstream for buffer"); - } - - return stream; + throw std::runtime_error("Buffer qpair fds are ioctl-only; use syncToDevice/syncFromDevice"); } void Buffer::syncToDevice(uint64_t offset, uint64_t size) diff --git a/vrt/vrtd/libvrtdpp/src/device.cpp b/vrt/vrtd/libvrtdpp/src/device.cpp index f45cda24..6fa00791 100644 --- a/vrt/vrtd/libvrtdpp/src/device.cpp +++ b/vrt/vrtd/libvrtdpp/src/device.cpp @@ -31,8 +31,8 @@ Device::Device(uint32_t num, uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -97,14 +97,16 @@ QdmaQpair Device::createQdmaQpair(const struct slash_qdma_qpair_add& cfg) const Buffer Device::openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir) const { - return fOpenBuffer(*this, allocType, size, allocArg, allocDir); + BufferAllocDir allocDir, + MmChannel mmChannel) const { + return fOpenBuffer(*this, allocType, size, allocArg, 
allocDir, mmChannel); } Buffer Device::openRawBuffer(uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir) const { - return fOpenBufferRaw(*this, phys_addr, size, allocDir); + BufferAllocDir allocDir, + MmChannel mmChannel) const { + return fOpenBufferRaw(*this, phys_addr, size, allocDir, mmChannel); } void Device::hotplugOp(HotplugOp op, uint8_t function) const { diff --git a/vrt/vrtd/libvrtdpp/src/session.cpp b/vrt/vrtd/libvrtdpp/src/session.cpp index d2e69fd9..7bbda0bc 100644 --- a/vrt/vrtd/libvrtdpp/src/session.cpp +++ b/vrt/vrtd/libvrtdpp/src/session.cpp @@ -132,11 +132,11 @@ Device Session::getDevice(size_t i) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir) { - return openBuffer(device, type, size, arg, dir); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) { + return openBuffer(device, type, size, arg, dir, mm); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) { - return openBufferRaw(device, phys_addr, size, dir); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) { + return openBufferRaw(device, phys_addr, size, dir, mm); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -197,11 +197,11 @@ Device Session::getDeviceByBdf(std::string_view bdf) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t 
size, uint64_t arg, BufferAllocDir dir) { - return openBuffer(device, type, size, arg, dir); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) { + return openBuffer(device, type, size, arg, dir, mm); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) { - return openBufferRaw(device, phys_addr, size, dir); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) { + return openBufferRaw(device, phys_addr, size, dir, mm); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -289,7 +289,8 @@ Buffer Session::openBuffer( BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir + BufferAllocDir allocDir, + MmChannel mmChannel ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -304,6 +305,7 @@ Buffer Session::openBuffer( static_cast(allocDir), allocArg, size, + static_cast(static_cast(mmChannel)), &raw ); if (ret != VRTD_RET_OK) { @@ -321,7 +323,8 @@ Buffer Session::openBufferRaw( const Device& device, uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir + BufferAllocDir allocDir, + MmChannel mmChannel ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -335,6 +338,7 @@ Buffer Session::openBufferRaw( phys_addr, size, static_cast(allocDir), + static_cast(static_cast(mmChannel)), &raw ); if (ret != VRTD_RET_OK) { diff --git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c index 5a30076e..a5194fc7 100644 --- a/vrt/vrtd/src/buffer.c +++ b/vrt/vrtd/src/buffer.c @@ -52,7 +52,38 @@ #define VRTD_QDMA_Q_MODE_MM 0u /* Memory-mapped (MM) mode */ #define VRTD_QDMA_DIR_H2C (1u << 0) /* Host-to-Card direction */ #define VRTD_QDMA_DIR_C2H (1u << 1) /* Card-to-Host direction */ -#define VRTD_QDMA_RING_SZ_IDX 0u /* Default ring size index */ +/* + * 
TODO: make this a vrtd.conf setting. Index 15 is the largest QDMA descriptor + * ring and gives the best sustained transfer speed, but it consumes more + * host-side DMA-coherent memory per queue. + */ +#define VRTD_QDMA_RING_SZ_IDX 15u /* Default ring size index */ + +/** + * Decide how many qpairs back a buffer and which AXI-MM channel each one uses. + * + * A request of SLASH_QDMA_MM_CHANNEL_AUTO is expanded into two qpairs -- + * @c fds[0] pinned to channel 0 and @c fds[1] to channel 1 -- so the client can + * apply the V80 placement policy with a deterministic fd-to-channel mapping. + * An explicit channel request pins a single qpair to that channel (no split). + * + * @param mm_channel Requested channel (enum slash_qdma_mm_channel). + * @param channels [out] Per-qpair channel value, indexed by qpair number. + * @return Number of qpairs to create (1 or VRTD_BUFFER_MAX_QPAIR_FDS). + */ +static uint32_t buffer_plan_qpair_channels( + uint32_t mm_channel, + uint32_t channels[VRTD_BUFFER_MAX_QPAIR_FDS]) +{ + if (mm_channel == SLASH_QDMA_MM_CHANNEL_AUTO) { + channels[0] = SLASH_QDMA_MM_CHANNEL_0; + channels[1] = SLASH_QDMA_MM_CHANNEL_1; + return VRTD_BUFFER_MAX_QPAIR_FDS; + } + + channels[0] = mm_channel; + return 1u; +} /** * Initialise a buffer: allocate device memory, create a QDMA queue pair, @@ -81,6 +112,7 @@ static int buffer_init(struct buffer *buf, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params) { if (buf == NULL) { @@ -100,7 +132,8 @@ static int buffer_init(struct buffer *buf, .client_id = client_id, .addr = 0, .size = 0, - .qid = 0, + .qpair_count = 0, + .qids = {0}, .fd = -1, .allocation_valid = false, .qpair_created = false, @@ -171,46 +204,57 @@ static int buffer_init(struct buffer *buf, buf->size = alloc_size; buf->allocation_valid = true; - /* Step 2: Configure and create a QDMA queue pair. If the caller - * supplied custom qpair parameters (e.g. 
streaming mode), use those; - * otherwise default to memory-mapped mode with the smallest ring size. */ - struct slash_qdma_qpair_add qpair = {0}; - if (qpair_params != NULL) { - qpair = *qpair_params; - } else { - qpair.mode = VRTD_QDMA_Q_MODE_MM; - qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; - } - qpair.dir_mask = dir_mask; - qpair.size = sizeof(qpair); + /* Steps 2-4: create/start queue pairs and obtain their fds. An AUTO + * request yields two qpairs -- fds[0] on channel 0, fds[1] on channel 1 -- + * so the client's V80 placement policy has a deterministic fd-to-channel + * mapping; an explicit channel pins a single qpair. */ + uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS]; + uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels); + + for (uint32_t i = 0; i < num_qpairs; ++i) { + struct slash_qdma_qpair_add qpair = {0}; + + if (qpair_params != NULL) { + qpair = *qpair_params; + } else { + qpair.mode = VRTD_QDMA_Q_MODE_MM; + qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; + } + qpair.dir_mask = dir_mask; + qpair.mm_channel = qpair_channels[i]; + qpair.size = sizeof(qpair); - if (slash_qdma_qpair_add(qdma, &qpair) != 0) { - LOG(LOG_ERR, "Failed to add buffer qpair: %m"); - goto fail; - } + if (slash_qdma_qpair_add(qdma, &qpair) != 0) { + LOG(LOG_ERR, "Failed to add buffer qpair %u: %m", (unsigned int)i); + goto fail; + } - buf->qid = qpair.qid; - buf->qpair_created = true; + buf->qids[i] = qpair.qid; + buf->qpair_count = i + 1; + buf->qpair_created = true; - /* Step 3: Start the queue pair so DMA transfers can be issued. 
*/ - if (slash_qdma_qpair_start(qdma, buf->qid) != 0) { - LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", buf->qid); - goto fail; + if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) { + LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", qpair.qid); + goto fail; + } } - /* Step 4: Obtain a file descriptor for the queue. The client will use - * this fd (passed over the Unix socket via SCM_RIGHTS) to perform - * read/write/mmap against the QDMA queue. */ - int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "Failed to get fd for buffer qpair %u: %m", buf->qid); + /* Step 5: bind every started qpair into a single transfer fd so one + * transfer ioctl can fan across both channels. The qids array index + * becomes the qpair_index used by the client's sub-transfers. */ + buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count, + O_CLOEXEC); + if (buf->fd < 0) { + LOG(LOG_ERR, "Failed to get combined fd for %u buffer qpairs: %m", + (unsigned int)buf->qpair_count); goto fail; } - buf->fd = fd; - LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid); + LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)buf->addr, (unsigned long long)buf->size, + (unsigned int)buf->qpair_count); return 0; fail: @@ -235,6 +279,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params) { struct buffer *buf = calloc(1, sizeof(*buf)); @@ -243,7 +288,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, return NULL; } - if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, qpair_params) != 0) { + if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, mm_channel, qpair_params) != 0) { LOG(LOG_ERR, "Failed to initialize buffer: 
%m"); return NULL; } @@ -263,7 +308,9 @@ struct buffer *buffer_create(struct slash_qdma *qdma, struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, - enum vrtd_alloc_dir alloc_dir) + enum vrtd_alloc_dir alloc_dir, + uint64_t client_id, + uint32_t mm_channel) { if (qdma == NULL || size == 0) { errno = EINVAL; @@ -299,48 +346,60 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, .alloc_type = 0, .alloc_arg = 0, .alloc_dir = alloc_dir, - .client_id = 0, + .client_id = client_id, .addr = phys_addr, .size = size, - .qid = 0, + .qpair_count = 0, + .qids = {0}, .fd = -1, .allocation_valid = false, /* no allocator reservation to free */ .qpair_created = false, }; - struct slash_qdma_qpair_add qpair = {0}; - qpair.mode = VRTD_QDMA_Q_MODE_MM; - qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.dir_mask = dir_mask; - qpair.size = sizeof(qpair); - - if (slash_qdma_qpair_add(qdma, &qpair) != 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to add qpair: %m"); - free(buf); - return NULL; - } + uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS]; + uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels); - buf->qid = qpair.qid; - buf->qpair_created = true; + for (uint32_t i = 0; i < num_qpairs; ++i) { + struct slash_qdma_qpair_add qpair = {0}; - if (slash_qdma_qpair_start(qdma, buf->qid) != 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", buf->qid); - cleanup_buffer(buf); - return NULL; + qpair.mode = VRTD_QDMA_Q_MODE_MM; + qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.dir_mask = dir_mask; + qpair.mm_channel = qpair_channels[i]; + qpair.size = sizeof(qpair); + + if (slash_qdma_qpair_add(qdma, &qpair) != 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to add qpair %u: %m", (unsigned int)i); + 
cleanup_buffer(buf); + return NULL; + } + + buf->qids[i] = qpair.qid; + buf->qpair_count = i + 1; + buf->qpair_created = true; + + if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", qpair.qid); + cleanup_buffer(buf); + return NULL; + } } - int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to get fd for qpair %u: %m", buf->qid); + /* Bind every started qpair into a single transfer fd. */ + buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count, + O_CLOEXEC); + if (buf->fd < 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to get combined fd for %u qpairs: %m", + (unsigned int)buf->qpair_count); cleanup_buffer(buf); return NULL; } - buf->fd = fd; - LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qid=%u", - (unsigned long long)phys_addr, (unsigned long long)size, buf->qid); + LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)phys_addr, (unsigned long long)size, + (unsigned int)buf->qpair_count); return buf; } @@ -348,12 +407,12 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, * Tear down a buffer and release all associated resources. * * Resources are released in reverse acquisition order: - * 1. Close the file descriptor (if open). - * 2. Stop and delete the QDMA queue pair (if created). + * 1. Close the combined transfer file descriptor (if open). + * 2. Stop and delete the QDMA queue pairs (if created). * 3. Free the device memory allocation (if valid). * 4. Zero all fields and free the struct. * - * Each step is guarded by its corresponding flag (fd >= 0, + * Each step is guarded by its corresponding flag (fd >= 0, * qpair_created, allocation_valid) so this function is safe to call * after partial initialisation. NULL-safe.
*/ @@ -363,31 +422,31 @@ void cleanup_buffer(struct buffer *buf) return; } - LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid); + LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)buf->addr, (unsigned long long)buf->size, + (unsigned int)buf->qpair_count); - /* Close the QDMA queue fd first, before stopping the queue. */ + /* Close the combined QDMA transfer fd first, before stopping the queues. */ if (buf->fd >= 0) { (void) close(buf->fd); buf->fd = -1; } - /* Stop and delete the QDMA queue pair. Errors are logged but + /* Stop and delete the QDMA queue pairs. Errors are logged but * otherwise ignored -- we are on the teardown path and must continue * releasing remaining resources. */ if (buf->qpair_created && buf->qdma != NULL) { - if (slash_qdma_qpair_stop(buf->qdma, buf->qid) != 0) { - LOG( - LOG_WARNING, - "Error stopping buffer qpair %u: %m (ignored)", - buf->qid - ); - } - if (slash_qdma_qpair_del(buf->qdma, buf->qid) != 0) { - LOG( - LOG_WARNING, - "Error deleting buffer qpair %u: %m (ignored)", - buf->qid - ); + for (uint32_t i = 0; i < buf->qpair_count; ++i) { + if (slash_qdma_qpair_stop(buf->qdma, buf->qids[i]) != 0) { + LOG(LOG_WARNING, + "Error stopping buffer qpair %u: %m (ignored)", + buf->qids[i]); + } + if (slash_qdma_qpair_del(buf->qdma, buf->qids[i]) != 0) { + LOG(LOG_WARNING, + "Error deleting buffer qpair %u: %m (ignored)", + buf->qids[i]); + } } } @@ -417,7 +476,8 @@ void cleanup_buffer(struct buffer *buf) buf->allocation_valid = false; buf->addr = 0; buf->size = 0; - buf->qid = 0; + buf->qpair_count = 0; + memset(buf->qids, 0, sizeof(buf->qids)); buf->fd = -1; free(buf); diff --git a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h index 6834222b..167bf98a 100644 --- a/vrt/vrtd/src/buffer.h +++ b/vrt/vrtd/src/buffer.h @@ -48,6 +48,8 @@ #include "array.h" #include "vrtd/wire.h" +#define VRTD_BUFFER_MAX_QPAIR_FDS 2u + /** * 
@brief A single DMA buffer allocated on a SLASH FPGA device. * @@ -72,10 +74,13 @@ struct buffer { uint64_t addr; /** @brief Size of the allocated memory region in bytes (rounded up to subregion granularity). */ uint64_t size; - /** @brief QDMA queue ID assigned to this buffer's queue pair. */ - uint32_t qid; - /** @brief File descriptor for the QDMA queue pair character device. - * Passed to the client via SCM_RIGHTS for direct data transfer. */ + /** @brief Number of QDMA queue pairs created for this buffer (1 or 2). */ + uint32_t qpair_count; + /** @brief QDMA queue IDs assigned to this buffer's queue pairs. */ + uint32_t qids[VRTD_BUFFER_MAX_QPAIR_FDS]; + /** @brief Single transfer fd that owns all @qpair_count queue pairs. + * Passed to the client via SCM_RIGHTS for direct data transfer; the client + * selects a channel per sub-transfer by qpair_index. -1 when not created. */ int fd; /** @brief True if the address-space allocation in the memory map is valid and must be freed. */ bool allocation_valid; @@ -96,6 +101,7 @@ struct buffer { * @param size Requested buffer size in bytes (may be rounded up). * @param alloc_arg Type-specific argument (HBM region index for non-VNOC HBM). * @param client_id Connection ID of the owning client. + * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). * @param qpair_params QDMA queue pair configuration parameters. * @return Heap-allocated buffer on success, NULL on failure. */ @@ -106,6 +112,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params); /** @@ -120,12 +127,17 @@ struct buffer *buffer_create(struct slash_qdma *qdma, * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. * @param alloc_dir DMA transfer direction. 
+ * @param client_id Connection ID of the owning client (for ownership checks + * and automatic cleanup on disconnect; must be non-zero). + * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). * @return Heap-allocated buffer on success, NULL on failure (errno set). */ struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, - enum vrtd_alloc_dir alloc_dir); + enum vrtd_alloc_dir alloc_dir, + uint64_t client_id, + uint32_t mm_channel); /** * @brief Release all resources owned by a buffer. diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c index c11c4d32..4559f82a 100644 --- a/vrt/vrtd/src/serve.c +++ b/vrt/vrtd/src/serve.c @@ -761,7 +761,7 @@ static int client_handle_in(struct client *client) * Allocate a cmsg buffer large enough for one fd. * CMSG_SPACE includes alignment padding required by the kernel. */ - char cbuf[CMSG_SPACE(sizeof(int))]; + char cbuf[CMSG_SPACE(2 * sizeof(int))]; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -895,17 +895,24 @@ static int client_handle_out(struct client *client) * The cbuf is zeroed to satisfy kernel expectations about padding. */ if (client->have_out_fd) { + uint32_t fd_count = client->out_fd_count ? 
client->out_fd_count : 1; + + if (fd_count > 2) { + LOG(LOG_ERR, "Invalid outbound fd count %u", (unsigned int)fd_count); + return -1; + } + memset(cbuf, 0, sizeof cbuf); msg.msg_control = cbuf; - msg.msg_controllen = sizeof cbuf; + msg.msg_controllen = CMSG_SPACE(fd_count * sizeof(int)); struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_len = CMSG_LEN(fd_count * sizeof(int)); - memcpy(CMSG_DATA(cmsg), &client->out_fd, sizeof(int)); + memcpy(CMSG_DATA(cmsg), client->out_fds, fd_count * sizeof(int)); } ssize_t n; @@ -937,6 +944,7 @@ static int client_handle_out(struct client *client) /* Response sent -- clear state so the client can send a new request. */ client->have_response = false; client->have_out_fd = false; + client->out_fd_count = 0; return 0; } @@ -1040,7 +1048,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_get_bar_fd), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1082,7 +1090,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_qdma_qpair_get_fd), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1094,7 +1102,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1106,7 +1114,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open_raw), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1966,6 +1974,12 @@ static uint16_t client_handle_request_buffer_open( return VRTD_RET_INVALID_ARGUMENT; } + if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) { + LOG(LOG_WARNING, "Received buffer open request 
with invalid mm_channel %u", + (unsigned int)req_body->mm_channel); + return VRTD_RET_INVALID_ARGUMENT; + } + struct device *d = client->state->devices.d[req_body->dev_number]; if (d == NULL || d->qdma == NULL || d->memory_map == NULL) { LOG(LOG_WARNING, "Received buffer open request for non-existent or non-functional device"); @@ -1992,6 +2006,7 @@ static uint16_t client_handle_request_buffer_open( req_body->size, req_body->alloc_arg, client_id, + req_body->mm_channel, NULL ); if (buf == NULL) { @@ -2009,14 +2024,16 @@ static uint16_t client_handle_request_buffer_open( return VRTD_RET_INTERNAL_ERROR; } - if (buf->fd < 0) { - LOG(LOG_ERR, "Buffer created without valid fd"); + if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || + buf->fd < 0) { + LOG(LOG_ERR, "Buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } uint64_t real_size = buf->size; - int fd = buf->fd; uint64_t phys_addr = buf->addr; + uint32_t qpair_count = buf->qpair_count; + int fd = buf->fd; /* * Transfer ownership of the buffer into the device's buffer list. @@ -2030,6 +2047,11 @@ static uint16_t client_handle_request_buffer_open( resp_body->size = real_size; resp_body->phys_addr = phys_addr; + resp_body->qpair_count = qpair_count; + /* A single transfer fd owns all qpairs; the client selects channels by + * qpair_index per sub-transfer. 
*/ + client->out_fds[0] = fd; + client->out_fd_count = 1; *out_fd = fd; *have_out_fd = true; *resp_size = sizeof(*resp_body); @@ -2104,18 +2126,32 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_INVALID_ARGUMENT; } + if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) { + LOG(LOG_WARNING, "Received raw buffer open request with invalid mm_channel %u", + (unsigned int)req_body->mm_channel); + return VRTD_RET_INVALID_ARGUMENT; + } + struct device *d = client->state->devices.d[req_body->dev_number]; if (d == NULL || d->qdma == NULL) { LOG(LOG_WARNING, "Received raw buffer open request for non-existent or non-functional device"); return VRTD_RET_NOEXIST; } + uint64_t client_id = client->conn_id; + if (client_id == 0) { + LOG(LOG_ERR, "Invalid client connection id"); + return VRTD_RET_INTERNAL_ERROR; + } + _cleanup_(cleanup_bufferp) struct buffer *buf = buffer_create_raw( d->qdma, req_body->phys_addr, req_body->size, - (enum vrtd_alloc_dir) req_body->alloc_dir + (enum vrtd_alloc_dir) req_body->alloc_dir, + client_id, + req_body->mm_channel ); if (buf == NULL) { if (errno == EINVAL) { @@ -2126,11 +2162,13 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_INTERNAL_ERROR; } - if (buf->fd < 0) { - LOG(LOG_ERR, "Raw buffer created without valid fd"); + if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || + buf->fd < 0) { + LOG(LOG_ERR, "Raw buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } + uint32_t qpair_count = buf->qpair_count; int fd = buf->fd; if (buffer_ptr_array_push_move(&d->buffers, &buf) != 0) { @@ -2138,7 +2176,9 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_INTERNAL_ERROR; } - resp_body->zero = 0; + resp_body->qpair_count = qpair_count; + client->out_fds[0] = fd; + client->out_fd_count = 1; *out_fd = fd; *have_out_fd = true; *resp_size = sizeof(*resp_body); @@ -2211,8 +2251,15 @@ static uint16_t client_handle_request_buffer_close( return 
VRTD_RET_NOEXIST; } - /* Search for the buffer by physical address. */ + /* + * Search for the caller's buffer by physical address. Raw buffers bypass + * the allocator and use caller-specified addresses, so distinct clients can + * hold buffers at the same address; scan all matches and pick the one owned + * by this connection rather than rejecting on the first address match. + */ struct buffer *found = NULL; + bool addr_size_match_foreign = false; /* same addr+size, owned by another conn */ + bool addr_match_size_mismatch = false; /* same addr, different size */ for (size_t i = 0; i < d->buffers.len; ++i) { struct buffer *buf = d->buffers.d[i]; if (buf == NULL) { @@ -2221,15 +2268,20 @@ static uint16_t client_handle_request_buffer_close( if (buf->addr != req_body->phys_addr) { continue; } - /* Found a buffer at the right address -- verify size. */ if (buf->size != req_body->size) { - LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (expected %llu, got %llu)", - (unsigned long long)req_body->phys_addr, - (unsigned long long)buf->size, (unsigned long long)req_body->size); - return VRTD_RET_INVALID_ARGUMENT; + addr_match_size_mismatch = true; + continue; } - /* Verify ownership: only the client that opened the buffer may close it. 
*/ if (buf->client_id != client->conn_id) { + addr_size_match_foreign = true; + continue; + } + found = buf; + break; + } + + if (found == NULL) { + if (addr_size_match_foreign) { char pwbuf[1024]; LOG( LOG_WARNING, @@ -2239,11 +2291,12 @@ static uint16_t client_handle_request_buffer_close( ); return VRTD_RET_AUTH_ERROR; } - found = buf; - break; - } - - if (found == NULL) { + if (addr_match_size_mismatch) { + LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (got %llu)", + (unsigned long long)req_body->phys_addr, + (unsigned long long)req_body->size); + return VRTD_RET_INVALID_ARGUMENT; + } LOG(LOG_NOTICE, "buffer_close: no buffer at addr=0x%llx on device %u", (unsigned long long)req_body->phys_addr, (unsigned int)req_body->dev_number); return VRTD_RET_NOEXIST; diff --git a/vrt/vrtd/src/serve.h b/vrt/vrtd/src/serve.h index 55cdd9ba..8d4e728c 100644 --- a/vrt/vrtd/src/serve.h +++ b/vrt/vrtd/src/serve.h @@ -73,9 +73,11 @@ struct client { /** @brief True when @c in_fd contains a valid received file descriptor. */ bool have_in_fd; - /** @brief File descriptor to send back to the client via SCM_RIGHTS ancillary data. */ - int out_fd; - /** @brief True when @c out_fd contains a valid file descriptor to transmit. */ + /** @brief File descriptors to send back to the client via SCM_RIGHTS ancillary data. */ + int out_fds[2]; + /** @brief Number of valid descriptors in @c out_fds. */ + uint32_t out_fd_count; + /** @brief True when @c out_fds contains at least one valid file descriptor to transmit. */ bool have_out_fd; /** @brief True when a complete request has been read into @c inb and is awaiting dispatch. 
*/ diff --git a/vrt/vrtd/tests/CMakeLists.txt b/vrt/vrtd/tests/CMakeLists.txt index f5197f45..241f01d4 100644 --- a/vrt/vrtd/tests/CMakeLists.txt +++ b/vrt/vrtd/tests/CMakeLists.txt @@ -29,5 +29,6 @@ add_vrtd_test(hotplug_test hotplug_test.cpp) add_vrtd_test(config_test config_test.cpp) add_vrtd_test(auth_test auth_test.cpp) add_vrtd_test(buffer_test buffer_test.cpp) +add_vrtd_test(v80_policy_test v80_policy_test.cpp) add_vrtd_test(design_writer_test design_writer_test.cpp) add_vrtd_test(device_test device_test.cpp) diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp index 078f5819..1038678b 100644 --- a/vrt/vrtd/tests/buffer_test.cpp +++ b/vrt/vrtd/tests/buffer_test.cpp @@ -36,6 +36,27 @@ static constexpr const char *REAL_QDMA_PATH = "/dev/slash_qdma_ctl0"; static constexpr uint64_t XFER_SIZE = 4096; static constexpr uint64_t CLIENT_ID = 42; +static void qpair_fd_round_trip(int fd, uint64_t addr, const uint8_t *src, uint8_t *dst) +{ + struct slash_qdma_buffer write_buf{}; + struct slash_qdma_buffer read_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &write_buf), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &read_buf), 0); + std::memcpy(write_buf.addr, src, XFER_SIZE); + + ssize_t written = slash_qdma_qpair_transfer( + fd, write_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_H2C); + EXPECT_EQ(written, static_cast(XFER_SIZE)); + + ssize_t read_bytes = slash_qdma_qpair_transfer( + fd, read_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_C2H); + EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + std::memcpy(dst, read_buf.addr, XFER_SIZE); + + EXPECT_EQ(slash_qdma_buffer_destroy(&write_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&read_buf), 0); +} + // ─── Null / argument validation (no hardware needed, always run) ────────────── TEST(BufferNullTest, NullQdma) { @@ -43,7 +64,7 @@ TEST(BufferNullTest, NullQdma) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(nullptr, map, ALLOCATION_TYPE_DDR, 
VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); } @@ -53,7 +74,7 @@ TEST(BufferNullTest, NullMap) { ASSERT_NE(qdma, nullptr); struct buffer *buf = buffer_create(qdma, nullptr, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); slash_qdma_close(qdma); } @@ -65,7 +86,7 @@ TEST(BufferNullTest, ZeroSize) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - 0, 0, CLIENT_ID, nullptr); + 0, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); @@ -78,7 +99,7 @@ TEST(BufferNullTest, ZeroClientId) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, 0, nullptr); + XFER_SIZE, 0, 0, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); @@ -91,7 +112,7 @@ TEST(BufferNullTest, InvalidDirection) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, static_cast(99), - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); @@ -103,7 +124,8 @@ TEST(BufferNullTest, CleanupNull) { TEST(BufferNullTest, RawNullQdma) { struct buffer *buf = buffer_create_raw(nullptr, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); } @@ -112,7 +134,8 @@ TEST(BufferNullTest, RawZeroSize) { struct slash_qdma *qdma = 
slash_qdma_open("@mock"); ASSERT_NE(qdma, nullptr); struct buffer *buf = buffer_create_raw(qdma, DDR_START_ADDRESS, 0, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); slash_qdma_close(qdma); @@ -154,20 +177,17 @@ class BufferTest : public ::testing::TestWithParam { TEST_P(BufferTest, LifecycleBidirectional) { struct buffer *buf = buffer_create(qdma_, map_, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_BIDIRECTIONAL, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); ASSERT_NE(buf, nullptr); + ASSERT_GE(buf->qpair_count, 1u); EXPECT_GE(buf->fd, 0); uint8_t src[XFER_SIZE]; for (size_t i = 0; i < XFER_SIZE; ++i) src[i] = static_cast(i & 0xFF); - ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast(buf->addr)); - EXPECT_EQ(written, static_cast(XFER_SIZE)); - uint8_t dst[XFER_SIZE]{}; - ssize_t read_bytes = pread(buf->fd, dst, XFER_SIZE, static_cast(buf->addr)); - EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + qpair_fd_round_trip(buf->fd, buf->addr, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); @@ -175,20 +195,19 @@ TEST_P(BufferTest, LifecycleBidirectional) { TEST_P(BufferTest, RawCreateAndIO) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_BIDIRECTIONAL); + VRTD_ALLOC_DIR_BIDIRECTIONAL, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); + ASSERT_GE(buf->qpair_count, 1u); EXPECT_GE(buf->fd, 0); EXPECT_EQ(buf->addr, DDR_START_ADDRESS); EXPECT_FALSE(buf->allocation_valid); + EXPECT_EQ(buf->client_id, CLIENT_ID); uint8_t src[XFER_SIZE]; std::memset(src, 0xCD, sizeof(src)); - ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast(DDR_START_ADDRESS)); - EXPECT_EQ(written, static_cast(XFER_SIZE)); - uint8_t dst[XFER_SIZE]{}; - ssize_t n = pread(buf->fd, dst, XFER_SIZE, static_cast(DDR_START_ADDRESS)); - 
EXPECT_EQ(n, static_cast(XFER_SIZE)); + qpair_fd_round_trip(buf->fd, DDR_START_ADDRESS, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); @@ -202,20 +221,22 @@ TEST_P(BufferTest, QueueExhaustion) { GTEST_SKIP() << "Queue exhaustion test is mock-only"; } - static constexpr int MAX_QUEUES = 64; + static constexpr int MAX_BUFFERS = 32; /* two mock queues per buffer */ std::vector bufs; - bufs.reserve(MAX_QUEUES); + bufs.reserve(MAX_BUFFERS); - for (int i = 0; i < MAX_QUEUES; ++i) { + for (int i = 0; i < MAX_BUFFERS; ++i) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS + i * XFER_SIZE, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE); - ASSERT_NE(buf, nullptr) << "Expected success for queue " << i; + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); + ASSERT_NE(buf, nullptr) << "Expected success for buffer " << i; bufs.push_back(buf); } - /* 65th allocation must fail */ + /* 33rd allocation needs queues 65/66 and must fail. */ struct buffer *overflow = buffer_create_raw(qdma_, DDR_START_ADDRESS, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE); + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(overflow, nullptr); EXPECT_EQ(errno, ENOSPC); diff --git a/vrt/vrtd/tests/device_test.cpp b/vrt/vrtd/tests/device_test.cpp index 36518c10..c66fd9f6 100644 --- a/vrt/vrtd/tests/device_test.cpp +++ b/vrt/vrtd/tests/device_test.cpp @@ -149,7 +149,8 @@ TEST(DeviceCleanupTest, CleanupWithBuffers) { /* Allocate a raw buffer on the mock QDMA and hand ownership to d->buffers. 
*/ struct buffer *buf = buffer_create_raw(d->qdma, DDR_START_ADDRESS, 4096, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, /*client_id=*/1, + SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); int ret = buffer_ptr_array_push_move(&d->buffers, &buf); diff --git a/vrt/vrtd/tests/v80_policy_test.cpp b/vrt/vrtd/tests/v80_policy_test.cpp new file mode 100644 index 00000000..068a724b --- /dev/null +++ b/vrt/vrtd/tests/v80_policy_test.cpp @@ -0,0 +1,124 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +#include + +#include "../libvrtd/src/v80_policy.h" + +namespace { + +constexpr uint64_t STEP = 4096; +constexpr uint64_t MiB = 1024ULL * 1024ULL; +constexpr uint64_t GiB = 1024ULL * MiB; + +// A single available queue always carries the whole transfer on fds[0]. 
+TEST(V80Plan, SingleQueueIsWhole) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 1, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// DDR has a single NSU, so the range is split in half across both channels. +TEST(V80Plan, DdrSplitsInHalf) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 256 * MiB); + EXPECT_EQ(segs[1].qpair_index, 1u); + EXPECT_EQ(segs[1].offset, 256 * MiB); + EXPECT_EQ(segs[1].size, 256 * MiB); +} + +// A DDR transfer too small to halve along the step boundary stays on fds[0]. +TEST(V80Plan, DdrTinyTransferStaysOnPrimary) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, STEP, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].size, STEP); +} + +// An HBM buffer entirely below the half-boundary uses channel 0 only. +TEST(V80Plan, HbmLowerHalfChannel0) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// An HBM buffer entirely at/above the half-boundary uses channel 1 only. +TEST(V80Plan, HbmUpperHalfChannel1) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF + 4 * GiB; + uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].qpair_index, 1u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// A buffer sitting exactly on the boundary belongs to the upper half. 
+TEST(V80Plan, HbmOnBoundaryIsUpperHalf) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF; + uint32_t n = vrtd_plan_v80(base, 0, 256 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].qpair_index, 1u); +} + +// An HBM range straddling the boundary splits exactly at it. +TEST(V80Plan, HbmSpanningSplitsAtBoundary) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF - 256 * MiB; + uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 256 * MiB); + EXPECT_EQ(segs[1].qpair_index, 1u); + EXPECT_EQ(segs[1].offset, 256 * MiB); + EXPECT_EQ(segs[1].size, 256 * MiB); +} + +// The split point is computed from the absolute device address, so a non-zero +// buffer offset that crosses the boundary is honoured. +TEST(V80Plan, HbmSpanningWithOffset) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t offset = VRTD_V80_HBM_HALF - STEP; // crosses boundary STEP into the range + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, offset, 2 * STEP, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].qpair_index, 0u); + EXPECT_EQ(segs[0].offset, offset); + EXPECT_EQ(segs[0].size, STEP); + EXPECT_EQ(segs[1].qpair_index, 1u); + EXPECT_EQ(segs[1].offset, offset + STEP); + EXPECT_EQ(segs[1].size, STEP); +} + +} // namespace