From a91ea20b41fb09ce856bbf5f9935626e3f0f5a1f Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 8 Jun 2026 14:34:30 +0100 Subject: [PATCH 01/23] Added slash_compat.h force include and fix from_timer backport issue for RHEL 9.8 Signed-off-by: Vlad-Gabriel Serbu --- driver/Makefile | 5 +++++ driver/slash_compat.h | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/driver/Makefile b/driver/Makefile index 98a56815..281455ee 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -82,6 +82,11 @@ ifeq ($(SLASH_HAVE_MODULE_IMPORT_NS_TOKEN),y) ccflags-y += -DSLASH_HAVE_MODULE_IMPORT_NS_TOKEN endif +# Force-include the compat header into every TU (including the pinned libqdma +# submodule sources we don't modify) so kernel-API shims such as from_timer() +# reach third-party code too. Safe on all kernels: the shims are guarded. +ccflags-y += -include $(src)/slash_compat.h + LIBQDMA_OBJS := \ $(LIBQDMA_PATH)/qdma_mbox.o \ diff --git a/driver/slash_compat.h b/driver/slash_compat.h index 5b3a50c2..2352458c 100644 --- a/driver/slash_compat.h +++ b/driver/slash_compat.h @@ -17,6 +17,7 @@ #include #include +#include /* * Compat shims selected by the kcompat probes in driver/kcompat/. @@ -53,4 +54,27 @@ static inline void slash_vm_flags_set(struct vm_area_struct *vma, vm_flags_t fla #define SLASH_MODULE_IMPORT_NS(ns) MODULE_IMPORT_NS(#ns) #endif +/* + * from_timer() was renamed to timer_container_of() upstream in v6.16 + * (commit 41cb08555c41) and backported by RHEL/CentOS 9.8 (kernel + * 5.14.0-687; 9.7 / 5.14.0-611 and earlier still ship from_timer) into the + * 5.14 baseline, so a LINUX_VERSION_CODE / RHEL_RELEASE_CODE check is + * unreliable across the 9.x rebuilds. Both names are typeof()-based macros, + * so detect them directly and prefer the kernel's own API: + * 1. kernel still defines from_timer() -> use it as-is (no redefine) + * 2. kernel renamed it to timer_container_of() -> delegate to that + * 3. 
neither exists -> hand-roll the historical body + * <linux/timer.h> is included above so the guard sees whichever name the + * kernel defines, regardless of -include ordering. + */ +#ifndef from_timer +# ifdef timer_container_of +# define from_timer(var, callback_timer, timer_fieldname) \ + timer_container_of(var, callback_timer, timer_fieldname) +# else +# define from_timer(var, callback_timer, timer_fieldname) \ + container_of(callback_timer, typeof(*var), timer_fieldname) +# endif +#endif + #endif /* SLASH_COMPAT_H */ From ed0f69d159b1e0caf0e8f2d7ceebd68c4070cdc0 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 8 Jun 2026 10:12:23 +0100 Subject: [PATCH 02/23] driver: large-page qdma transfers, NoC channel steering, libqdma patches Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 10 +- driver/Makefile | 83 ++- driver/README.md | 47 ++ driver/patches/0001-libqdma-sg-mapping.patch | 172 +++++ .../patches/0002-libqdma-versal-channel.patch | 32 + driver/slash_config.h | 8 +- driver/slash_qdma.c | 644 +++++++++++++++--- driver/tests/test_slash_qdma.c | 134 +++- 8 files changed, 1016 insertions(+), 114 deletions(-) create mode 100644 driver/patches/0001-libqdma-sg-mapping.patch create mode 100644 driver/patches/0002-libqdma-versal-channel.patch diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index 6fbc1faa..dd4e6a1d 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -407,6 +407,14 @@ All transfers are synchronous and block until the transfer completes or times ou **10 seconds**; after expiry the call returns ``-ETIME``. Partial transfers are possible; the return value is the number of bytes transferred, and the file position is advanced accordingly. +The userspace buffer address and ``count`` must be page-aligned: the address +must be 4 KiB-aligned and ``count`` must be a non-zero multiple of 4 KiB. 
A +2 MiB-aligned, 2 MiB-multiple transfer backed by 2 MiB hugetlb pages uses one +descriptor per hugepage; all other accepted transfers use one descriptor per +4 KiB base page. Transparent hugepages are not accepted on the 4 KiB path, so +callers using anonymous mappings should apply ``MADV_NOHUGEPAGE`` before +faulting pages when they need deterministic base-page transfers. + Multiple fds can be obtained for the same qpair via multiple ``QPAIR_GET_FD`` calls, including from different processes. Concurrent ``read()``/``write()`` calls on the same qpair (from any fd or thread) are serialized by the kernel and execute one at a time; for parallel I/O, allocate @@ -425,7 +433,7 @@ The following errno values can be returned by ``read()`` and ``write()`` on the * - ``-ENODEV`` - Device shutting down, or the required direction is not enabled for this qpair * - ``-EINVAL`` - - Zero-length transfer (``count`` results in 0 pages) + - Zero-length, unaligned, or non-page-multiple transfer * - ``-ENOMEM`` - SGL allocation failure * - ``-EFAULT`` diff --git a/driver/Makefile b/driver/Makefile index 281455ee..ab48039b 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -42,8 +42,21 @@ else LIBQDMA_PATH := $(LIBQDMA_FALLBACK) endif +# SLASH carries a few local modifications to the pinned QDMA submodule's +# libqdma sources (see $(LIBQDMA_PATCH_DIR)/). The submodule itself stays +# pristine; the patches are applied to whichever libqdma tree is being built +# (the DKMS-local ./libqdma or the in-tree submodule) by the libqdma-patches +# target before the module is compiled. See that target for details. +LIBQDMA_PATCH_DIR := patches + SLASH_QDMA_OP_DEBUG ?= 0 +# Per-transfer timing instrumentation. Set to 1 to emit one dmesg line per +# DMA transfer breaking down the kernel phases (SLASH_QDMA_TIMING) and the +# libqdma submit sub-phases (QDMA_TIMING). Default off (zero overhead). +SLASH_QDMA_TIMING ?= 0 +QDMA_TIMING ?= 0 + # Kcompat feature flags. 
Defaults are "n"; the all: recipe runs # driver/kcompat/probe.sh against $(KDIR) to detect the actual values # and passes them into the kbuild recursion. Each pair (modern API + @@ -72,6 +85,8 @@ ccflags-y += \ \ -DTANDEM_BOOT_SUPPORTED=1 \ -DSLASH_QDMA_OP_DEBUG=$(SLASH_QDMA_OP_DEBUG) \ + -DSLASH_QDMA_TIMING=$(SLASH_QDMA_TIMING) \ + -DQDMA_TIMING=$(QDMA_TIMING) \ -DSLASH_VERSION_STR=\"$(SLASH_VERSION)\" ifeq ($(SLASH_HAVE_VM_FLAGS_SET),y) @@ -125,18 +140,80 @@ $(MODULE)-objs += $(LIBQDMA_OBJS) $(QDMA_ACCESS_OBJS) KCOMPAT := "$(SHELL)" "$(PWD)/kcompat/probe.sh" -all: +all: libqdma-patches @flags="$$($(KCOMPAT) "$(KDIR)" | tr '\n' ' ')"; \ echo "slash: kcompat: $$flags"; \ $(MAKE) -C "$(KDIR)" M="$(PWD)" $$flags modules +# Apply SLASH's local libqdma patches ($(LIBQDMA_PATCH_DIR)/*.patch) to the +# libqdma source tree in use, in filename order, right before building. +# +# The pinned submodule is not edited directly by commits: patches live in-tree +# and are stamped onto the working copy here. Application is idempotent — each patch is first tested +# for being already applied (reverse dry-run) and skipped if so — so repeated +# `make` runs, incremental builds, and DKMS rebuilds are all safe. A patch that +# neither applies cleanly nor is already present aborts the build. +# +# $(PWD) is the driver dir for both `make` (in-tree) and DKMS (MAKE[0] runs +# `make -C driver ...`); ./libqdma is the DKMS-packaged copy, otherwise fall +# back to the in-tree submodule path. Uses patch(1) so it is independent of +# whether the libqdma tree lives inside a git checkout. +libqdma-patches: + @set -e; \ + patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \ + set -- "$$patch_dir"/*.patch; \ + if [ ! -e "$$1" ]; then exit 0; fi; \ + if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \ + else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \ + if [ ! 
-d "$$lq" ]; then \ + echo "slash: ERROR libqdma sources not found at $$lq" >&2; \ + echo "slash: run 'git submodule update --init --recursive' first" >&2; \ + exit 1; \ + fi; \ + command -v patch >/dev/null 2>&1 || { \ + echo "slash: ERROR patch(1) not found; it is required to apply libqdma patches" >&2; \ + exit 1; }; \ + for p in "$$@"; do \ + name="$$(basename "$$p")"; \ + if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: libqdma patch already applied, skipping: $$name"; \ + elif patch -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: applying libqdma patch: $$name"; \ + patch -p1 -d "$$lq" -f -s -i "$$p"; \ + else \ + echo "slash: ERROR libqdma patch does not apply cleanly: $$name" >&2; \ + echo "slash: (libqdma tree at $$lq is neither pristine nor already patched)" >&2; \ + exit 1; \ + fi; \ + done + +# Best-effort revert of the libqdma patches, restoring the submodule working +# copy to pristine. Useful when editing the patches themselves. Never fails the +# build: patches that are not currently applied are simply skipped. +unpatch-libqdma: + @set -e; \ + patch_dir="$(PWD)/$(LIBQDMA_PATCH_DIR)"; \ + set -- "$$patch_dir"/*.patch; \ + if [ ! 
-e "$$1" ]; then exit 0; fi; \ + if [ -d "$(PWD)/libqdma" ]; then lq="$(PWD)/libqdma"; \ + else lq="$(PWD)/$(LIBQDMA_FALLBACK)"; fi; \ + [ -d "$$lq" ] || exit 0; \ + for p in $$(printf '%s\n' "$$@" | tac); do \ + name="$$(basename "$$p")"; \ + if patch -R -p1 -d "$$lq" --dry-run -f -s -i "$$p" >/dev/null 2>&1; then \ + echo "slash: reverting libqdma patch: $$name"; \ + patch -R -p1 -d "$$lq" -f -s -i "$$p"; \ + fi; \ + done + clean: - $(MAKE) -C "$(KDIR)" M="$(PWD)" clean + -$(MAKE) -C "$(KDIR)" M="$(PWD)" clean rm -rf "$(PWD)/kcompat/.scratch" + $(MAKE) unpatch-libqdma install: all sudo install -d -m 755 /lib/modules/$(shell uname -r)/extra sudo install -m 644 $(MODULE).ko /lib/modules/$(shell uname -r)/extra sudo depmod -a -.PHONY: all clean install +.PHONY: all clean install libqdma-patches unpatch-libqdma diff --git a/driver/README.md b/driver/README.md index 65cd911a..c04776d7 100644 --- a/driver/README.md +++ b/driver/README.md @@ -1,10 +1,57 @@ # SLASH kernel module +## Module parameters + +Exposed under `/sys/module/slash/parameters/` (all writable at runtime; see +`modinfo slash.ko`): + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `qdma_num_threads` | uint | 8 | Number of libqdma worker threads. | +| `qdma_debugfs_path` | charp | disabled | debugfs mount path for libqdma. | +| `qdma_force_mm_channel` | int | -1 | Force the QDMA AXI-MM / NoC channel for newly-added queues: `<0` = auto (stripe by `qid & 1`), `0` or `1` = pin every new queue to that channel. | + +### A/B testing NoC channel bandwidth + +`qdma_force_mm_channel` is read when each queue pair is added, so it can be +changed between test runs to check whether both PCIe NMUs (NoC channels) +actually contribute bandwidth. 
Each value pins all new queues to one NoC +channel; the default (`-1`) splits them across both: + +```sh +# All queues on NoC channel 0 (NMU S00) +echo 0 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel +sudo v80-smi validate -d --raw-transfer-test --no-reset + +# All queues on NoC channel 1 (NMU S01) +echo 1 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel +sudo v80-smi validate -d --raw-transfer-test --no-reset + +# Default: split across both channels (qid & 1) +echo -1 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel +sudo v80-smi validate -d --raw-transfer-test --no-reset +``` + +Debug builds with `SLASH_QDMA_OP_DEBUG=1` log each queue's selected +`mm_channel` when it is added. If the split run is no faster than a single +forced channel, traffic is not being spread across both NMUs. The parameter +affects every queue created through this driver (both the VRTD buffer path and +`--raw-transfer-test`), but not the off-the-shelf Xilinx QDMA driver path +(`--use-qdma-driver`). + ## Testing The test suite requires a physical V80 to be present and the module to be loaded into a running kernel. +## Local libqdma patches + +SLASH carries small patches for the pinned `libqdma` submodule under +`driver/patches/`. The driver `Makefile` applies them before building, and +`make clean` attempts to revert them so the submodule working copy returns to +its pristine pinned state. DKMS packages include the same patch directory and +depend on `patch(1)`. + ### Prerequisites - A kernel built with `CONFIG_GCOV_KERNEL=y` (only needed for coverage runs). diff --git a/driver/patches/0001-libqdma-sg-mapping.patch b/driver/patches/0001-libqdma-sg-mapping.patch new file mode 100644 index 00000000..c94866ef --- /dev/null +++ b/driver/patches/0001-libqdma-sg-mapping.patch @@ -0,0 +1,172 @@ +SLASH local modification to the pinned QDMA submodule (libqdma). 
+ +libqdma: length/offset-aware SG mapping + optional submit timing + +The stock libqdma sgl_map()/sgl_unmap() always DMA-map a fixed PAGE_SIZE per +scatter-gather entry and ignore sg->len / sg->offset. The SLASH driver +(driver/slash_qdma.c) builds SG lists with variable-length entries (a single +base page or one 2 MiB hugetlb page per entry), so the mapping must honour +sg->offset and sg->len. This also documents sg->len in libqdma_export.h and +adds the optional, compile-time gated (QDMA_TIMING) per-request submit timing +that pairs with SLASH_QDMA_TIMING in the SLASH driver. + +Generated against qdma_drv @ 03ac7f3 (pinned submodule commit). +Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). +diff --git a/libqdma_export.c b/libqdma_export.c +index f0524d8..bff6161 100755 +--- a/libqdma_export.c ++++ b/libqdma_export.c +@@ -39,6 +39,8 @@ + #include "qdma_mbox.h" + #include "qdma_platform.h" + ++#include ++ + #ifdef DEBUGFS + #include "qdma_debugfs_queue.h" + +@@ -50,6 +52,27 @@ static bool qdma_debufs_cleanup = true; + + #define QDMA_Q_PEND_LIST_COMPLETION_TIMEOUT 1000 /* msec */ + ++/* ++ * Per-request timing instrumentation for the synchronous MM submit path. ++ * ++ * When QDMA_TIMING is non-zero (compile-time flag, e.g. built with ++ * -DQDMA_TIMING=1), qdma_request_submit() emits one line per request that ++ * splits the submit cost into: ++ * ++ * - sgl_map: DMA-mapping the scatter-gather list (dma_map_page per entry; ++ * this is where IOMMU programming/IOTLB flushes show up). ++ * - proc: qdma_descq_proc_sgt_request() -- descriptor-ring fill plus ++ * the PIDX doorbell MMIO write that kicks the hardware. ++ * - wait: qdma_request_wait_for_cmpl() -- the blocking wait covering ++ * the actual HW data movement and poll-mode completion spin. ++ * ++ * Pairs with SLASH_QDMA_TIMING in the SLASH driver, whose "submit" phase is ++ * exactly the sum of these three. 
++ */ ++#ifndef QDMA_TIMING ++#define QDMA_TIMING 0 ++#endif ++ + struct drv_mode_name mode_name_list[] = { + { AUTO_MODE, "auto"}, + { POLL_MODE, "poll"}, +@@ -2324,8 +2347,8 @@ void sgl_unmap(struct pci_dev *pdev, struct qdma_sw_sg *sg, unsigned int sgcnt, + if (!sg->pg) + break; + if (sg->dma_addr) { +- dma_unmap_page(&pdev->dev, sg->dma_addr - sg->offset, +- PAGE_SIZE, dir); ++ dma_unmap_page(&pdev->dev, sg->dma_addr, sg->len, ++ dir); + sg->dma_addr = 0UL; + } + } +@@ -2351,20 +2374,21 @@ int sgl_map(struct pci_dev *pdev, struct qdma_sw_sg *sgl, unsigned int sgcnt, + int i; + struct qdma_sw_sg *sg = sgl; + +- /** Map the sg list onto a dma pages where +- * each page has max of PAGE_SIZE i.e 4K +- */ + for (i = 0; i < sgcnt; i++, sg++) { +- /* !! TODO page size !! */ +- sg->dma_addr = dma_map_page(&pdev->dev, sg->pg, 0, PAGE_SIZE, +- dir); ++ if (!sg->len) { ++ pr_err("map sgl failed, sg %d has zero length.\n", i); ++ if (i) ++ sgl_unmap(pdev, sgl, i, dir); ++ return -EINVAL; ++ } ++ sg->dma_addr = dma_map_page(&pdev->dev, sg->pg, sg->offset, ++ sg->len, dir); + if (unlikely(dma_mapping_error(&pdev->dev, sg->dma_addr))) { + pr_err("map sgl failed, sg %d, %u.\n", i, sg->len); + if (i) + sgl_unmap(pdev, sgl, i, dir); + return -EIO; + } +- sg->dma_addr += sg->offset; + } + + return 0; +@@ -2393,6 +2417,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, + enum dma_data_direction dir; + int wait = 0; + int rv = 0; ++#if QDMA_TIMING ++ ktime_t t_start, t_mapped, t_proc, t_wait; ++#endif + + /** make sure that the dev_hndl passed is Valid */ + if (!xdev) { +@@ -2459,6 +2486,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, + if (descq->conf.st && (descq->conf.q_type == Q_C2H)) + return qdma_request_submit_st_c2h(xdev, descq, req); + ++#if QDMA_TIMING ++ t_start = ktime_get(); ++#endif + if (!req->dma_mapped) { + rv = sgl_map(xdev->conf.pdev, req->sgl, req->sgcnt, dir); + if (rv < 0) { +@@ -2468,6 +2498,9 @@ ssize_t 
qdma_request_submit(unsigned long dev_hndl, unsigned long id, + } + cb->unmap_needed = 1; + } ++#if QDMA_TIMING ++ t_mapped = ktime_get(); ++#endif + + lock_descq(descq); + /** if the descq is already in online state*/ +@@ -2484,6 +2517,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, + pr_debug("%s: cb 0x%p submitted.\n", descq->conf.name, cb); + + qdma_descq_proc_sgt_request(descq); ++#if QDMA_TIMING ++ t_proc = ktime_get(); ++#endif + + if (!wait) + return 0; +@@ -2492,6 +2528,18 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, + if (rv < 0) + goto unmap_sgl; + ++#if QDMA_TIMING ++ t_wait = ktime_get(); ++ pr_info("qdma: timing %s %s count=%u sgcnt=%u ep=0x%llx off=%u | sgl_map=%lld proc=%lld wait=%lld total=%lld ns\n", ++ descq->conf.name, req->write ? "H2C" : "C2H", ++ req->count, req->sgcnt, ++ (unsigned long long)req->ep_addr, cb->offset, ++ ktime_to_ns(ktime_sub(t_mapped, t_start)), ++ ktime_to_ns(ktime_sub(t_proc, t_mapped)), ++ ktime_to_ns(ktime_sub(t_wait, t_proc)), ++ ktime_to_ns(ktime_sub(t_wait, t_start))); ++#endif ++ + return cb->offset; + + unmap_sgl: +diff --git a/libqdma_export.h b/libqdma_export.h +index baeb78e..9bd60ee 100755 +--- a/libqdma_export.h ++++ b/libqdma_export.h +@@ -558,7 +558,12 @@ struct qdma_sw_sg { + struct page *pg; + /** offset in current page */ + unsigned int offset; +- /** length of the page */ ++ /** ++ * Length of this scatter-gather entry. The DMA mapping helpers map ++ * exactly this many bytes starting at @offset, so callers must set this ++ * to the full backing granule they intend to expose (for example 4 KiB ++ * for base pages or 2 MiB for huge pages). 
++ */ + unsigned int len; + /** dma address of the allocated page */ + dma_addr_t dma_addr; diff --git a/driver/patches/0002-libqdma-versal-channel.patch b/driver/patches/0002-libqdma-versal-channel.patch new file mode 100644 index 00000000..89bddd43 --- /dev/null +++ b/driver/patches/0002-libqdma-versal-channel.patch @@ -0,0 +1,32 @@ +SLASH local modification to the pinned QDMA submodule (libqdma). + +libqdma: set descq->channel on the initial queue-add path (Versal) + +qdma_descq_config() only mirrored qconf->mm_channel into descq->channel on the +reconfig path. qdma_queue_add() calls it with reconfig=0, so on Versal hard IP +(QDMA_VERSAL_HARD_IP) the SW-context mm_chn/host_id stayed 0. Mirror mm_channel +into descq->channel on the initial add path too. + +Generated against qdma_drv @ 03ac7f3 (pinned submodule commit). +Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). +diff --git a/qdma_descq.c b/qdma_descq.c +index c2f19d1..b432737 100755 +--- a/qdma_descq.c ++++ b/qdma_descq.c +@@ -1261,6 +1261,16 @@ void qdma_descq_config(struct qdma_descq *descq, struct qdma_queue_conf *qconf, + descq->conf.st = qconf->st; + descq->conf.q_type = qconf->q_type; + ++ /* Below check is applicable only for Versal family. ++ * Mirror mm_channel into descq->channel on the initial add path ++ * too; qdma_queue_add() only calls this with reconfig=0, so ++ * without this the SW-context mm_chn/host_id would always be 0 ++ * (the reconfig-only assignment below is reached solely via ++ * qdma_queue_config()). 
++ */ ++ if (descq->xdev->version_info.ip_type == QDMA_VERSAL_HARD_IP) ++ descq->channel = qconf->mm_channel; ++ + } else { + descq->conf.desc_rng_sz_idx = qconf->desc_rng_sz_idx; + descq->conf.cmpl_rng_sz_idx = qconf->cmpl_rng_sz_idx; diff --git a/driver/slash_config.h b/driver/slash_config.h index c06c5962..acebe253 100644 --- a/driver/slash_config.h +++ b/driver/slash_config.h @@ -23,9 +23,9 @@ * * The SLASH design exposes two PCI physical functions per card: * - * - **PF1** (device 0x50B5) — QDMA function. Hosts the Xilinx QDMA - * IP used for high-throughput DMA transfers between host memory and - * the FPGA fabric. + * - **PF1** (device 0x50B5, or 0x50BD on AVED/V80P designs) — QDMA + * function. Hosts the Xilinx QDMA IP used for high-throughput DMA + * transfers between host memory and the FPGA fabric. * * - **PF2** (device 0x50B6) — Control function. Exposes PCI BARs * that the host can mmap for register-level MMIO access to the @@ -52,6 +52,8 @@ #define SLASH_QDMA_PCI_VENDOR_ID 0x10EE /** PCI device ID for the V80 SLASH QDMA function. */ #define SLASH_QDMA_PCI_DEVICE_ID 0x50B5 +/** PCI device ID for the V80P/AVED QDMA function. */ +#define SLASH_AVED_QDMA_PCI_DEVICE_ID 0x50BD /** Physical function number for the QDMA DMA engine. */ #define SLASH_QDMA_PF 1 diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index 6c64272b..18c78206 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -23,8 +23,9 @@ * to provide queue-pair-based DMA transfers between host memory and the * FPGA fabric. * - * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5), while the - * control device (slash_ctldev) binds to PF2 (device ID 0x50B6). + * The QDMA subsystem binds to PF1 (PCI device ID 0x50B5, or 0x50BD on + * AVED/V80P designs), while the control device (slash_ctldev) binds to + * PF2 (device ID 0x50B6). 
* * Queue pair lifecycle: * add -> start -> I/O (via anon_inode fd) -> stop -> del @@ -50,10 +51,15 @@ #include #include +#include #include #include #include +#include +#include #include +#include +#include #include #include #include @@ -101,6 +107,80 @@ #define SLASH_QDMA_QPAIR_GET_FD_MIN_SIZE \ offsetofend(struct slash_qdma_qpair_fd_request, flags) +/* + * CPM5 Host Profile indirect-context programming. + * + * The Host Profile context tells the CPM5 QDMA how to route AXI4-MM + * traffic onto the Versal NoC. It is programmed via the same indirect + * context command interface libqdma uses for queue contexts, but with + * the host-profile selector (0xA). Register offsets and the command + * word layout mirror eqdma_cpm5_reg.h: + * + * IND_CTXT_DATA base 0x804 (8 x u32 context words) + * IND_CTXT_MASK base 0x824 (8 x u32 write masks) + * IND_CTXT_CMD 0x844 (busy[0], sel[4:1], op[6:5], qid[18:7]) + * + * We program two profiles so the per-queue SW-context host_id selects + * the NoC channel: Host ID 0 -> NoC Channel 0, Host ID 1 -> NoC Channel 1. + */ +#define SLASH_QDMA_HP_DATA_ADDR 0x804u +#define SLASH_QDMA_HP_MASK_ADDR 0x824u +#define SLASH_QDMA_HP_CMD_ADDR 0x844u +#define SLASH_QDMA_HP_CMD_BUSY BIT(0) +#define SLASH_QDMA_HP_NUM_WORDS 8 +#define SLASH_QDMA_HP_SEL 0xAu /* QDMA_CTXT_SELC_HOST_PROFILE */ +#define SLASH_QDMA_HP_OP_WR 0x1u /* indirect context WR opcode */ +#define SLASH_QDMA_HP_SMID_BASE 0x100u /* bit 8 set; base AXI-MM master ID */ +#define SLASH_QDMA_HP_POLL_US 1000 /* busy-wait budget in microseconds */ + +/* + * qdma_force_mm_channel - Debug/experiment override for AXI-MM/NoC channel + * assignment of newly-added queue pairs. 
+ * + * < 0 : automatic - stripe across channels by (qid & 1) [default] + * 0 : pin every new queue to MM channel 0 (Host Profile 0 / NoC Channel 0) + * 1 : pin every new queue to MM channel 1 (Host Profile 1 / NoC Channel 1) + * + * The value is read when a queue pair is added, so it can be changed at + * runtime via /sys/module/slash/parameters/qdma_force_mm_channel to A/B test + * whether both PCIe NMUs (NoC channels) actually contribute bandwidth: + * + * echo 0 > .../qdma_force_mm_channel # all traffic on NoC channel 0 (S00) + * echo 1 > .../qdma_force_mm_channel # all traffic on NoC channel 1 (S01) + * echo -1 > .../qdma_force_mm_channel # default split (qid & 1) + * + * Affects both the VRTD buffer path and the raw-transfer path (any queue + * created through this driver). It does not affect the off-the-shelf Xilinx + * QDMA driver path. + */ +static int qdma_force_mm_channel = -1; + +static int slash_qdma_force_mm_channel_set(const char *val, + const struct kernel_param *kp) +{ + int parsed; + int err; + + err = kstrtoint(val, 0, &parsed); + if (err) + return err; + + if (parsed < -1 || parsed > 1) + return -EINVAL; + + return param_set_int(val, kp); +} + +static const struct kernel_param_ops slash_qdma_force_mm_channel_ops = { + .set = slash_qdma_force_mm_channel_set, + .get = param_get_int, +}; + +module_param_cb(qdma_force_mm_channel, &slash_qdma_force_mm_channel_ops, + &qdma_force_mm_channel, 0644); +MODULE_PARM_DESC(qdma_force_mm_channel, + "Force QDMA AXI-MM/NoC channel for new queues: <0=auto(qid&1), 0 or 1 to pin (default -1)"); + /** * SLASH_QDMA_QTYPE_COUNT - Number of queue types tracked per queue pair. * @@ -118,6 +198,25 @@ */ #define SLASH_QDMA_MAX_QPAIRS 256 +/* + * The qpair fd data path accepts either a span of 4 KiB base pages or a span + * of 2 MiB hugetlb pages. 
Every scatter-gather entry within one request uses + * the same granule, which keeps the DMA mapping semantics unambiguous; the two + * granules are never mixed in a single request. A whole transfer (of either + * granule) is submitted to libqdma as a single multi-descriptor request, and + * libqdma refills the descriptor ring as needed -- so the transfer size is not + * bounded by the ring depth. + */ +#define SLASH_QDMA_HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) + +/* + * Upper bound on the number of pages pinned per get_user_pages_fast() call when + * mapping a multi-page base-page transfer. Bounds the work done in a single + * GUP call (and keeps the per-call page count within int range) while still + * pinning large buffers in only a handful of iterations. + */ +#define SLASH_QDMA_GUP_BATCH 8192u + /** * SLASH_QDMA_QPAIR_ID_RANGE - XArray allocation range for qpair IDs. * @@ -152,6 +251,30 @@ } while (0) #endif +/* + * Per-transfer timing instrumentation. + * + * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with + * -DSLASH_QDMA_TIMING=1), slash_qdma_qpair_read_write() emits one dev_info + * line per transfer breaking down the wall-clock cost of the kernel-side + * phases: + * + * - map: pin user pages, validate page shape, build the SGL + * (slash_qdma_map_user_buf_to_sgl()). + * - submit: the whole libqdma qdma_request_submit() call, which covers + * SGL DMA-mapping (IOMMU), descriptor-ring fill, the PIDX + * doorbell, and the synchronous completion wait (HW transfer + + * poll-mode spin). libqdma can be built with QDMA_TIMING=1 for + * a finer breakdown of this phase. + * - unmap: unpin pages (mark dirty for C2H) and free the SGL. + * + * Timestamps use ktime_get() (CLOCK_MONOTONIC); the reads are cheap, but + * the whole block compiles out entirely when the flag is 0. + */ +#ifndef SLASH_QDMA_TIMING +#define SLASH_QDMA_TIMING 0 +#endif + /* Forward declaration; full definition follows. 
*/ struct slash_qdma_dev; @@ -625,10 +748,12 @@ static void slash_qdma_ioctl_info(struct miscdevice *misc, struct slash_qdma_dev /** * slash_qdma_ids - PCI device ID table for the QDMA PF. * - * Matches only PF1 (device ID 0x50B5) on AMD/Xilinx V80 cards. + * Matches PF1 QDMA functions on AMD/Xilinx V80 cards, including the + * AVED/V80P device ID. */ static const struct pci_device_id slash_qdma_ids[] = { {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_QDMA_PCI_DEVICE_ID)}, + {PCI_DEVICE(SLASH_QDMA_PCI_VENDOR_ID, SLASH_AVED_QDMA_PCI_DEVICE_ID)}, {0,} }; MODULE_DEVICE_TABLE(pci, slash_qdma_ids); @@ -850,6 +975,157 @@ void slash_qdma_exit(void) SLASH_QDMA_OP_LOG("libqdma_exit done\n"); } +/* ───────────────────────────────────────────────────────────────────── + * CPM5 Host Profile context programming + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_hp_set_field() - Set a bit field in the host profile context. + * @words: Array of SLASH_QDMA_HP_NUM_WORDS u32s holding the 256-bit context + * (word i covers bits [32*i+31 : 32*i]). + * @hi: Most-significant bit index of the field (inclusive). + * @lo: Least-significant bit index of the field (inclusive). + * @val: Value to place in [hi:lo]; bits outside the field width are masked. + * + * Handles fields that straddle a 32-bit word boundary (e.g. the C2H + * AXI4-MM steering field at bits [97:94], which spans words 2 and 3). + */ +static void slash_qdma_hp_set_field(u32 *words, unsigned int hi, + unsigned int lo, u32 val) +{ + unsigned int width = hi - lo + 1; + u32 fmask = (width >= 32) ? 
~0u : ((1u << width) - 1u); + unsigned int word = lo >> 5; + unsigned int off = lo & 31; + u64 wmask = (u64)fmask << off; + u64 wval = (u64)(val & fmask) << off; + + words[word] = (words[word] & ~(u32)(wmask & 0xFFFFFFFFu)) | + (u32)(wval & 0xFFFFFFFFu); + + if ((off + width) > 32 && (word + 1) < SLASH_QDMA_HP_NUM_WORDS) + words[word + 1] = (words[word + 1] & ~(u32)(wmask >> 32)) | + (u32)(wval >> 32); +} + +/** + * slash_qdma_write_host_profile() - Program one CPM5 Host Profile entry. + * @device: QDMA device (provides the libqdma handle for register access). + * @host_id: Host Profile index to program (also the AXI4-MM steering value, + * i.e. the target NoC channel). + * + * Builds the 256-bit host profile context with the SMID and H2C/C2H + * AXI4-MM steering fields, writes it through the indirect-context + * registers via the libqdma-exported config register accessors, and + * polls the command BUSY bit until the controller completes the write. + * + * Only the SMID and the two steering fields are non-zero; the AXI + * prot/cache attributes are left at 0. + * + * Return: 0 on success, negative errno on register-access error or + * -ETIMEDOUT if the BUSY bit never clears. + */ +static int slash_qdma_write_host_profile(struct slash_qdma_dev *device, + u32 host_id) +{ + u32 data[SLASH_QDMA_HP_NUM_WORDS] = {0}; + unsigned int waited_us = 0; + u32 smid = SLASH_QDMA_HP_SMID_BASE + host_id; + u32 cmd; + u32 val = 0; + int err; + int i; + + /* SMID [201:192]; H2C steering [181:178]; C2H steering [97:94]. */ + slash_qdma_hp_set_field(data, 201, 192, smid); + slash_qdma_hp_set_field(data, 181, 178, host_id); + slash_qdma_hp_set_field(data, 97, 94, host_id); + + /* Context data words. */ + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_DATA_ADDR + (i * sizeof(u32)), data[i]); + if (err) + goto err_reg; + } + + /* Context masks: write every bit. 
*/ + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_MASK_ADDR + (i * sizeof(u32)), 0xFFFFFFFFu); + if (err) + goto err_reg; + } + + /* Command: qid=host_id, op=WR, sel=HOST_PROFILE (0x34 for id 0, 0xB4 for id 1). */ + cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_WR << 5) | (SLASH_QDMA_HP_SEL << 1); + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, cmd); + if (err) + goto err_reg; + + /* Wait for the controller to consume the command. */ + do { + err = qdma_device_read_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, &val); + if (err) + goto err_reg; + if (!(val & SLASH_QDMA_HP_CMD_BUSY)) + break; + udelay(1); + } while (++waited_us < SLASH_QDMA_HP_POLL_US); + + if (val & SLASH_QDMA_HP_CMD_BUSY) { + dev_err(&device->pdev->dev, + "qdma: host profile %u programming timed out (cmd=0x%x)\n", + host_id, val); + return -ETIMEDOUT; + } + + dev_info(&device->pdev->dev, + "slash: qdma: host profile %u applied: H2C/C2H AXI-MM steering=%u (NoC channel %u), smid=0x%03x (cmd=0x%02x)\n", + host_id, host_id, host_id, smid, cmd); + return 0; + +err_reg: + dev_err(&device->pdev->dev, + "qdma: host profile %u register access failed: %d\n", + host_id, err); + return err; +} + +/** + * slash_qdma_program_host_profiles() - Program the CPM5 Host Profiles. + * @device: QDMA device. + * + * Programs Host Profile 0 (steer to NoC Channel 0) and Host Profile 1 + * (steer to NoC Channel 1). Must run after qdma_device_open() (which + * clears all contexts) and before any queue context is programmed, per + * the CPM5 requirement that the host profile exist before AXI4-MM + * queues are set up. + * + * Return: 0 on success, negative errno on failure. 
+ */ +static int slash_qdma_program_host_profiles(struct slash_qdma_dev *device) +{ + u32 host_id; + int err; + + dev_info(&device->pdev->dev, + "slash: qdma: programming CPM5 host profiles (host_id 0 -> NoC channel 0, host_id 1 -> NoC channel 1)\n"); + + for (host_id = 0; host_id <= 1; host_id++) { + err = slash_qdma_write_host_profile(device, host_id); + if (err) + return err; + } + + dev_info(&device->pdev->dev, + "slash: qdma: CPM5 host profiles programmed\n"); + + return 0; +} + /* ───────────────────────────────────────────────────────────────────── * PCI probe / remove * ───────────────────────────────────────────────────────────────────── */ @@ -912,6 +1188,20 @@ static int slash_qdma_probe(struct pci_dev *pdev, const struct pci_device_id *id device->qdma_handle); device->have_qdma_handle = true; + /* + * Program the CPM5 Host Profiles before exposing the miscdevice, so + * they exist before userspace can add any queue. Host ID 0 steers + * AXI4-MM traffic to NoC Channel 0 and Host ID 1 to NoC Channel 1; + * the per-queue SW-context host_id (mirrored from mm_channel = qid & 1) + * selects between them. + */ + err = slash_qdma_program_host_profiles(device); + if (err) { + dev_err(&pdev->dev, + "slash: qdma: could not program host profiles: %d\n", err); + goto err_free; + } + /* Register the management miscdevice so userspace can issue ioctls. */ err = misc_register(&device->misc); if (err) { @@ -1619,8 +1909,9 @@ static int slash_qdma_ioctl_qpair_add(struct miscdevice *misc, * (required for poll-mode operation per the reference driver). * - qconf.cmpl_stat_en = 1: enable completion status generation * (required for poll-mode operation per the reference driver). - * - qconf.aperture_size = 4096: page-granularity (4 KB) for descriptor - * addressing. Each descriptor addresses one page-sized chunk. + * - qconf.aperture_size = 0: disables libqdma keyhole mode so MM + * transfers advance linearly through endpoint memory. 
Non-zero + * values are keyhole apertures and wrap addresses within that window. * - qconf.desc_rng_sz_idx: CSR table index (0-15) selecting the * descriptor ring depth. Not a raw descriptor count — the actual * count is looked up from the global CSR ring-size table. @@ -1664,7 +1955,21 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, qconf.cmpl_status_pend_chk = 1; /* Check pending completions (poll-mode req) */ qconf.cmpl_stat_en = 1; /* Enable completion status generation */ - qconf.aperture_size = 4096; /* Page-granularity descriptor addressing */ + qconf.aperture_size = 0; /* Linear MM addressing; non-zero enables keyhole mode */ + /* + * CPM5 exposes two MM channels; by default stripe queue pairs across + * them via (qid & 1). libqdma also mirrors mm_channel into the SW-context + * host_id, so this selects the programmed Host Profile too: even queues -> + * Host Profile 0 (NoC Channel 0), odd queues -> Host Profile 1 (NoC + * Channel 1). See slash_qdma_program_host_profiles(). + * + * The qdma_force_mm_channel module parameter overrides the split and pins + * every new queue to a single channel, for NoC-bandwidth A/B testing. + */ + if (qdma_force_mm_channel >= 0) + qconf.mm_channel = (u32)qdma_force_mm_channel; + else + qconf.mm_channel = req->qid & 1; /* --- Per-direction ring configuration --- */ switch (qtype) { @@ -1688,8 +1993,9 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, } SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_queue_add start: qid=%u type=%u mode=%u\n", - req->qid, qtype, req->mode); + "queue add qid=%u type=%u mode=%u mm_channel=%u%s\n", + req->qid, qtype, req->mode, qconf.mm_channel, + qdma_force_mm_channel >= 0 ? 
" (forced)" : ""); err = qdma_queue_add(qdma_dev->qdma_handle, &qconf, &qhndl, errbuf, sizeof(errbuf)); if (err) { @@ -1989,7 +2295,7 @@ static inline void slash_qdma_iocb_release(struct slash_qdma_io_cb *iocb) if (iocb->pages) iocb->pages = NULL; - kfree(iocb->sgl); + kvfree(iocb->sgl); iocb->sgl = NULL; iocb->buf = NULL; } @@ -2033,124 +2339,242 @@ static void slash_qdma_unmap_user_buf(struct slash_qdma_io_cb *iocb, bool write) iocb->pages_nr = 0; } -/** - * slash_qdma_map_user_buf_to_sgl() - Pin user pages and build a scatter-gather list. - * @iocb: I/O control block. @iocb->buf and @iocb->len must be set - * before calling. On success, @iocb->sgl, @iocb->pages, and - * @iocb->pages_nr are populated. - * @write: Transfer direction (true = H2C write, false = C2H read). - * - * Steps: - * 1. Compute the number of pages spanned by the user buffer (accounting - * for the offset within the first page). - * 2. Allocate a single contiguous block for the SGL entries and the - * page pointer array (avoids two allocations). - * 3. Pin user pages via get_user_pages_fast() with write=1 (even for - * H2C, because libqdma may write status back). - * 4. Build the qdma_sw_sg linked list: one entry per page, with the - * first entry's offset reflecting the sub-page position of the - * user buffer, and the last entry's length truncated to the - * remaining byte count. - * 5. Flush the data cache for each page to ensure coherency between - * the CPU cache and the DMA engine's view of memory. - * - * Return: 0 on success, negative errno on failure (pages are unpinned - * and the SGL is freed on error). 
- */ -static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, - bool write) +static int slash_qdma_iocb_alloc_sgl(struct slash_qdma_io_cb *iocb, + unsigned int entries) { - unsigned long len = iocb->len; - char *buf = (char *)iocb->buf; + size_t entry_size = sizeof(struct qdma_sw_sg) + sizeof(struct page *); struct qdma_sw_sg *sg; - unsigned int pg_off = offset_in_page(buf); - unsigned int pages_nr = (len + pg_off + PAGE_SIZE - 1) >> PAGE_SHIFT; - int i; - int rv; - if (len == 0) - pages_nr = 1; - if (pages_nr == 0) + if (!entries || entries > SIZE_MAX / entry_size) return -EINVAL; - iocb->pages_nr = 0; - /* - * Single allocation for both the SGL array and the page pointer - * array. The page pointers are placed immediately after the SGL - * entries in memory. + * A large base-page transfer needs one entry per 4 KiB page (e.g. ~5 MiB + * of SGL for a 512 MiB transfer), which exceeds kmalloc's limit, so use + * kvcalloc(). The SGL is only ever touched by the CPU (libqdma DMA-maps + * the pages it references), so a vmalloc-backed allocation is fine. */ - sg = kmalloc(pages_nr * (sizeof(struct qdma_sw_sg) + - sizeof(struct page *)), GFP_KERNEL); + sg = kvcalloc(entries, entry_size, GFP_KERNEL); if (!sg) { - pr_err("slash: qdma: sgl allocation failed for %u pages\n", - pages_nr); + pr_err("slash: qdma: sgl allocation failed for %u entries\n", + entries); return -ENOMEM; } - memset(sg, 0, pages_nr * (sizeof(struct qdma_sw_sg) + - sizeof(struct page *))); + iocb->sgl = sg; + iocb->pages = (struct page **)(sg + entries); + return 0; +} + +static bool slash_qdma_page_is_base_page(struct page *page) +{ + return !PageCompound(page); +} - /* Page pointer array lives right after the SGL entries. 
*/ - iocb->pages = (struct page **)(sg + pages_nr); +static bool slash_qdma_page_is_2m_hugetlb_head(struct page *page) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct page *head = compound_head(page); + + return page == head && + PageHuge(head) && + compound_order(head) == get_order(SLASH_QDMA_HUGEPAGE_SIZE); +#else + return false; +#endif +} + +static int slash_qdma_map_user_base_pages_to_sgl(struct slash_qdma_io_cb *iocb, + bool write) +{ + unsigned long addr = (unsigned long)iocb->buf; + size_t entries = iocb->len / PAGE_SIZE; + unsigned int pinned = 0; + unsigned int i; + int rv; + + if ((iocb->len % PAGE_SIZE) != 0 || entries == 0 || entries > UINT_MAX) + return -EINVAL; + + rv = slash_qdma_iocb_alloc_sgl(iocb, (unsigned int)entries); + if (rv) + return rv; /* - * Pin the user pages into physical memory. The write=1 flag tells - * the kernel these pages may be written to (needed for C2H, but we - * always request write permission for simplicity). + * Pin every base page in the span. get_user_pages_fast() may return + * fewer pages than requested, so loop (in bounded batches) until the + * whole buffer is pinned. */ - rv = get_user_pages_fast((unsigned long)buf, pages_nr, - 1 /* write */, iocb->pages); - if (rv < 0) { - pr_err("slash: qdma: unable to pin down %u user pages, %d\n", - pages_nr, rv); - goto err_out; - } - if (rv != pages_nr) { - pr_err("slash: qdma: unable to pin down all %u user pages, %d\n", - pages_nr, rv); - iocb->pages_nr = rv; - rv = -EFAULT; - goto err_out; + while (pinned < entries) { + unsigned int want = min_t(unsigned int, + (unsigned int)entries - pinned, + SLASH_QDMA_GUP_BATCH); + int got = get_user_pages_fast(addr + (size_t)pinned * PAGE_SIZE, + (int)want, 1 /* write */, + iocb->pages + pinned); + + if (got <= 0) { + pr_err("slash: qdma: unable to pin 4 KiB user pages %u/%zu, %d\n", + pinned, entries, got); + rv = (got < 0) ? 
got : -EFAULT; + goto err_out; + } + + pinned += (unsigned int)got; + iocb->pages_nr = pinned; } - /* - * Build the scatter-gather list. Each entry describes one page's - * worth of data. The first page may have a non-zero offset, and - * the last page may have fewer than PAGE_SIZE bytes. - */ - sg = iocb->sgl; - for (i = 0; i < pages_nr; i++, sg++) { - unsigned int offset = offset_in_page(buf); - unsigned int nbytes = min_t(unsigned int, - PAGE_SIZE - offset, len); - struct page *pg = iocb->pages[i]; - - /* Ensure CPU cache is flushed so the DMA engine sees fresh data. */ - flush_dcache_page(pg); - - sg->next = sg + 1; - sg->pg = pg; - sg->offset = offset; - sg->len = nbytes; - sg->dma_addr = 0UL; + for (i = 0; i < entries; i++) { + struct qdma_sw_sg *sg = &iocb->sgl[i]; - buf += nbytes; - len -= nbytes; + if (!slash_qdma_page_is_base_page(iocb->pages[i])) { + pr_err("slash: qdma: 4 KiB transfer page %u/%zu is not backed by a base page\n", + i, entries); + rv = -EINVAL; + goto err_out; + } + + flush_dcache_page(iocb->pages[i]); + + sg->next = (i + 1 < entries) ? &iocb->sgl[i + 1] : NULL; + sg->pg = iocb->pages[i]; + sg->offset = 0; + sg->len = PAGE_SIZE; + sg->dma_addr = 0UL; } - /* Terminate the linked list. 
*/ - iocb->sgl[pages_nr - 1].next = NULL; - iocb->pages_nr = pages_nr; + SLASH_QDMA_OP_LOG("user transfer path=base-4k addr=0x%lx len=%zu pages=%zu write=%d\n", + addr, iocb->len, entries, write); + return 0; err_out: slash_qdma_unmap_user_buf(iocb, write); slash_qdma_iocb_release(iocb); + return rv; +} + +static int slash_qdma_map_user_huge_page_to_sgl(struct slash_qdma_io_cb *iocb, + bool write) +{ + unsigned long addr = (unsigned long)iocb->buf; + size_t entries = iocb->len / SLASH_QDMA_HUGEPAGE_SIZE; + unsigned int i; + int rv; + + if ((iocb->len % SLASH_QDMA_HUGEPAGE_SIZE) != 0 || + entries == 0 || entries > UINT_MAX) + return -EINVAL; + + rv = slash_qdma_iocb_alloc_sgl(iocb, (unsigned int)entries); + if (rv) + return rv; + + for (i = 0; i < entries; i++) { + unsigned long curr_addr = addr + (i * SLASH_QDMA_HUGEPAGE_SIZE); + struct qdma_sw_sg *sg = &iocb->sgl[i]; + + rv = get_user_pages_fast(curr_addr, 1, 1 /* write */, &iocb->pages[i]); + if (rv != 1) { + pr_err("slash: qdma: unable to pin 2 MiB user page %u/%zu, %d\n", + i, entries, rv); + rv = rv < 0 ? rv : -EFAULT; + goto err_out; + } + iocb->pages_nr = i + 1; + if (!slash_qdma_page_is_2m_hugetlb_head(iocb->pages[i])) { + pr_err("slash: qdma: 2 MiB transfer page %u/%zu is not backed by a 2 MiB hugetlb head page\n", + i, entries); + rv = -EINVAL; + goto err_out; + } + + flush_dcache_page(iocb->pages[i]); + + sg->next = (i + 1 < entries) ? &iocb->sgl[i + 1] : NULL; + sg->pg = iocb->pages[i]; + sg->offset = 0; + sg->len = SLASH_QDMA_HUGEPAGE_SIZE; + sg->dma_addr = 0UL; + } + + SLASH_QDMA_OP_LOG("user transfer path=hugetlb-2m addr=0x%lx len=%zu pages=%zu write=%d\n", + addr, iocb->len, entries, write); + + return 0; + +err_out: + slash_qdma_unmap_user_buf(iocb, write); + slash_qdma_iocb_release(iocb); return rv; } +/** + * slash_qdma_map_user_buf_to_sgl() - Pin a user buffer and build its SGL. + * @iocb: I/O control block. @iocb->buf and @iocb->len must be set. 
+ * @write: Transfer direction (true = H2C write, false = C2H read). + * + * The buffer must be page-aligned and a whole number of 4 KiB pages. It is + * mapped as either: + * - a span of 2 MiB hugetlb pages (when it is 2 MiB-aligned, a multiple of + * 2 MiB, and actually backed by hugetlb pages), or + * - a span of 4 KiB base pages (every other accepted case). + * + * Each page becomes one SGL entry / one DMA descriptor, and the whole span is + * submitted to libqdma as a single request. + * + * The hugetlb-vs-base decision is made by probing the first page rather than by + * length/alignment alone: a large anonymous (base-page) mapping can happen to + * be 2 MiB-aligned, and must not be mistaken for a hugetlb buffer. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, + bool write) +{ + unsigned long addr = (unsigned long)iocb->buf; + size_t len = iocb->len; + bool huge = false; + + iocb->pages_nr = 0; + + if (!addr || !len || addr > ULONG_MAX - len) + return -EINVAL; + + if (!IS_ALIGNED(addr, PAGE_SIZE) || (len % PAGE_SIZE) != 0) { + pr_err("slash: qdma: unsupported user transfer addr=0x%lx len=%zu (must be page-aligned and a multiple of 4 KiB)\n", + addr, len); + return -EINVAL; + } + + /* + * Only a 2 MiB-aligned, 2 MiB-multiple span can be hugetlb-backed. Probe + * the first page to confirm it actually is a hugetlb page before committing + * to the huge path; otherwise fall through to the base-page path. 
+ */ + if (IS_ALIGNED(addr, SLASH_QDMA_HUGEPAGE_SIZE) && + (len % SLASH_QDMA_HUGEPAGE_SIZE) == 0) { + struct page *probe = NULL; + int probe_ret; + + probe_ret = get_user_pages_fast(addr, 1, 1 /* write */, &probe); + if (probe_ret < 0) + return probe_ret; + if (probe_ret == 0) + return -EFAULT; + if (probe_ret == 1) { + huge = slash_qdma_page_is_2m_hugetlb_head(probe); + put_page(probe); + } + } + + if (huge) + return slash_qdma_map_user_huge_page_to_sgl(iocb, write); + + return slash_qdma_map_user_base_pages_to_sgl(iocb, write); +} + /** * slash_qdma_qpair_read_write() - Perform a DMA transfer via a qpair fd. * @file: The anon_inode file for this queue pair. @@ -2193,6 +2617,9 @@ static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, unsigned long qhndl; ssize_t res; int rv; +#if SLASH_QDMA_TIMING + ktime_t t_start, t_mapped, t_submitted, t_done; +#endif if (!ctx) return -EINVAL; @@ -2230,12 +2657,18 @@ static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, mutex_unlock(&qdma_dev->lock); /* Pin user pages and build the scatter-gather list. */ +#if SLASH_QDMA_TIMING + t_start = ktime_get(); +#endif memset(&iocb, 0, sizeof(iocb)); iocb.buf = buf; iocb.len = count; rv = slash_qdma_map_user_buf_to_sgl(&iocb, write); if (rv < 0) return rv; +#if SLASH_QDMA_TIMING + t_mapped = ktime_get(); +#endif /* Populate the libqdma request structure. */ req = &iocb.req; @@ -2258,6 +2691,9 @@ static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, "qdma_request_submit done: qid=%u qhndl=%lu res=%zd\n", ctx->qid, qhndl, res); +#if SLASH_QDMA_TIMING + t_submitted = ktime_get(); +#endif /* Advance the file position by the number of bytes transferred. 
*/ if (res > 0) @@ -2267,6 +2703,18 @@ static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, slash_qdma_unmap_user_buf(&iocb, write); slash_qdma_iocb_release(&iocb); +#if SLASH_QDMA_TIMING + t_done = ktime_get(); + dev_info(&qdma_dev->pdev->dev, + "slash: qdma: timing qid=%u %s count=%zu sgcnt=%u ep=0x%llx res=%zd | map=%lld submit=%lld unmap=%lld total=%lld ns\n", + ctx->qid, write ? "H2C" : "C2H", count, req->sgcnt, + (unsigned long long)req->ep_addr, res, + ktime_to_ns(ktime_sub(t_mapped, t_start)), + ktime_to_ns(ktime_sub(t_submitted, t_mapped)), + ktime_to_ns(ktime_sub(t_done, t_submitted)), + ktime_to_ns(ktime_sub(t_done, t_start))); +#endif + return res; } diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index 07784dc0..bf706cf9 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -15,6 +15,16 @@ #include #define TRANSFER_SIZE 4096 +#define HUGE_PAGE_SIZE (2 * 1024 * 1024) +#define HUGE_TRANSFER_SIZE (2 * HUGE_PAGE_SIZE) + +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#endif /* ---------- helpers ---------- */ @@ -303,17 +313,8 @@ TEST_F(qdma, io_write_on_c2h_only_returns_enodev) free(buf); } -/* - * TODO: spec at docs/reference/kernel-abi/index.rst:417 documents zero-length - * transfers as returning -EINVAL, but the kernel's map_user_buf_to_sgl path - * (slash_qdma.c:2033-2034) explicitly patches around the len==0 case - * (`if (len == 0) pages_nr = 1;`), making the -EINVAL branch unreachable. - * The observed behaviour is ret == 0. Desired behaviour is under - * investigation — keep this test as-is so the discrepancy is visible. 
- */ TEST_F(qdma, io_zero_length_returns_einval) { - SKIP(return, "Test is disabled since the desired behavior is under investigation"); uint8_t *buf; ssize_t ret; @@ -707,4 +708,119 @@ TEST_F(qdma, qpair_get_fd_oversized_struct_zeros_tail) free(buf); } +TEST_F(qdma, reject_unaligned_4k_transfer) +{ + uint8_t *write_buf; + uint64_t dma_addr = get_dma_addr(); + ssize_t ret; + + bring_up_qpair(_metadata, self, 0x3); + + write_buf = aligned_alloc(4096, TRANSFER_SIZE * 2); + ASSERT_NE(NULL, write_buf); + fill_pattern(write_buf, TRANSFER_SIZE * 2); + + errno = 0; + ret = pwrite(self->io_fd, write_buf + 1, TRANSFER_SIZE, (off_t)dma_addr); + ASSERT_EQ(-1, ret); + ASSERT_EQ(EINVAL, errno); + + free(write_buf); +} + +TEST_F(qdma, reject_partial_4k_transfer) +{ + uint8_t *write_buf; + uint64_t dma_addr = get_dma_addr(); + ssize_t ret; + + bring_up_qpair(_metadata, self, 0x3); + + write_buf = aligned_alloc(4096, TRANSFER_SIZE); + ASSERT_NE(NULL, write_buf); + fill_pattern(write_buf, TRANSFER_SIZE); + + errno = 0; + ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE / 2, (off_t)dma_addr); + ASSERT_EQ(-1, ret); + ASSERT_EQ(EINVAL, errno); + + free(write_buf); +} + +TEST_F(qdma, multipage_4k_write_read_verify) +{ + const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages, one request */ + uint8_t *write_buf, *read_buf; + uint64_t dma_addr = get_dma_addr(); + ssize_t ret; + + bring_up_qpair(_metadata, self, 0x3); + + /* + * A multi-page base-page buffer is mapped as one SGL entry (one DMA + * descriptor) per 4 KiB page and submitted as a single libqdma request. + * The size is deliberately not a 2 MiB multiple, so the driver always takes + * its base-page path. NOTE(review): this assumes the mapping is backed by + * 4 KiB base pages; multi-size THP, where enabled, could break that. 
+ */ + write_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, read_buf); + + fill_pattern(write_buf, xfer_size); + memset(read_buf, 0, xfer_size); + + ret = pwrite(self->io_fd, write_buf, xfer_size, (off_t)dma_addr); + ASSERT_EQ((ssize_t)xfer_size, ret); + + ret = pread(self->io_fd, read_buf, xfer_size, (off_t)dma_addr); + ASSERT_EQ((ssize_t)xfer_size, ret); + + EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); + + munmap(write_buf, xfer_size); + munmap(read_buf, xfer_size); +} + +TEST_F(qdma, hugepage_write_read_verify) +{ + uint8_t *write_buf, *read_buf; + uint64_t dma_addr = get_dma_addr(); + ssize_t ret; + + bring_up_qpair(_metadata, self, 0x3); + + write_buf = mmap(NULL, HUGE_TRANSFER_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB, + -1, 0); + if (write_buf == MAP_FAILED) + SKIP(return, "2 MiB hugepage write mmap failed (errno=%d)", errno); + + read_buf = mmap(NULL, HUGE_TRANSFER_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB, + -1, 0); + if (read_buf == MAP_FAILED) { + munmap(write_buf, HUGE_TRANSFER_SIZE); + SKIP(return, "2 MiB hugepage read mmap failed (errno=%d)", errno); + } + + fill_pattern(write_buf, HUGE_TRANSFER_SIZE); + memset(read_buf, 0, HUGE_TRANSFER_SIZE); + + ret = pwrite(self->io_fd, write_buf, HUGE_TRANSFER_SIZE, (off_t)dma_addr); + ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); + + ret = pread(self->io_fd, read_buf, HUGE_TRANSFER_SIZE, (off_t)dma_addr); + ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); + + EXPECT_EQ(0, memcmp(write_buf, read_buf, HUGE_TRANSFER_SIZE)); + + munmap(write_buf, HUGE_TRANSFER_SIZE); + munmap(read_buf, HUGE_TRANSFER_SIZE); +} + TEST_HARNESS_MAIN From 80745c82aafa083f0e45cb09d9b1808dd09afc5e Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 8 Jun 
2026 10:12:31 +0100 Subject: [PATCH 03/23] libvrtd: hugepage host buffers and granule-aware partial sync Signed-off-by: Vlad-Gabriel Serbu --- vrt/vrtd/libvrtd/include/vrtd/vrtd.h | 30 ++- vrt/vrtd/libvrtd/src/buffer.c | 389 +++++++++++++++++++++++---- 2 files changed, 362 insertions(+), 57 deletions(-) diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index 76cf2541..d7721d49 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -516,6 +516,8 @@ struct vrtd_buffer { uint64_t phys_addr; int qpair_fd; void *buf; + /* Internal DMA granule for the local host mapping: 4096 or 2 MiB. */ + uint64_t transfer_step_size; }; enum vrtd_ret vrtd_buffer_create_raw( @@ -530,6 +532,21 @@ enum vrtd_ret vrtd_buffer_create_raw( struct vrtd_buffer **buffer_out ); +/** + * @brief Synchronize a range from the local host buffer to the device. + * + * The requested range may be smaller than the QDMA transfer granule. libvrtd + * handles any required internal alignment. Bidirectional buffers preserve + * device bytes outside the requested range with an internal read-modify-write; + * host-to-device-only buffers keep the historical behavior of expanding the + * transfer to the backing DMA granule. + */ +enum vrtd_ret vrtd_buffer_sync_to_device( + struct vrtd_buffer *buffer, + uint64_t offset, + uint64_t size +); + /** * @brief Destroy a local buffer handle. * @@ -540,12 +557,13 @@ enum vrtd_ret vrtd_buffer_destroy( struct vrtd_buffer *buffer ); -enum vrtd_ret vrtd_buffer_sync_to_device( - struct vrtd_buffer *buffer, - uint64_t offset, - uint64_t size -); - +/** + * @brief Synchronize a range from the device into the local host buffer. + * + * The requested range may be smaller than the QDMA transfer granule. libvrtd + * handles any required internal alignment and preserves bytes outside the + * requested host range. 
+ */ enum vrtd_ret vrtd_buffer_sync_from_device( struct vrtd_buffer *buffer, uint64_t offset, diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index b810de2c..b85ef588 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -29,9 +29,10 @@ * regular pages) and associated with a QDMA queue pair fd for * performing the actual H2C / C2H transfers. * - * Sync operations (sync_to_device / sync_from_device) transfer data - * between the host buffer and FPGA memory in TRANSFER_STEP_SIZE (4 KB) - * chunks using positional I/O on the QDMA qpair fd. + * Sync operations (sync_to_device / sync_from_device) accept arbitrary + * in-buffer ranges. Internally, the QDMA fd requires page-aligned transfer + * ranges, so libvrtd expands partial requests to the mapping granule and uses + * a staging buffer when needed to preserve host-side partial-range semantics. * * Buffer lifecycle: * 1. vrtd_buffer_open() -- daemon allocates, returns qpair fd @@ -46,9 +47,14 @@ #include #include +#include +#include #include #include +#include #include +#include +#include #include @@ -62,7 +68,127 @@ #define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT) #endif -#define TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K +#define BASE_TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K +#define HUGE_TRANSFER_STEP_SIZE (2ULL * 1024ULL * 1024ULL) // 2M + +/* + * Per-sync timing instrumentation. + * + * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with + * -DSLASH_QDMA_TIMING=1), the sync_to/from_device paths log the wall-clock + * cost of each pwrite/pread syscall plus the aggregate per-sync time and + * effective bandwidth. This is the userspace counterpart to the kernel's + * SLASH_QDMA_TIMING and libqdma's QDMA_TIMING breakdowns. 
+ */ +#ifndef SLASH_QDMA_TIMING +#define SLASH_QDMA_TIMING 0 +#endif + +#if SLASH_QDMA_TIMING +static inline uint64_t vrtd_now_ns(void) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec; +} +#endif + +static void vrtd_prefault_mapping(void *addr, uint64_t size) { + volatile uint8_t *touch = (volatile uint8_t *) addr; + + for (uint64_t off = 0; off < size; off += BASE_TRANSFER_STEP_SIZE) { + touch[off] = 0; + } +} + +static int vrtd_mmap_regular_base_pages(uint64_t size, void **addr_out) { + void *addr; + + if (addr_out == NULL || size == 0) { + return -EINVAL; + } + + addr = mmap( + NULL, + size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0 + ); + if (addr == MAP_FAILED) { + return -errno; + } + + if (madvise(addr, size, MADV_NOHUGEPAGE) != 0) { + int saved_errno = errno; + (void) munmap(addr, size); + return -saved_errno; + } + + vrtd_prefault_mapping(addr, size); + *addr_out = addr; + return 0; +} + +static int vrtd_transfer_pages( + int fd, + void *buf, + uint64_t phys_addr, + uint64_t offset, + uint64_t size, + uint64_t step, + bool to_device +) { + uint64_t max_chunk; + uint64_t transferred = 0; + + if (size == 0) { + return 0; + } + + if (step == 0 || (offset % step) != 0 || (size % step) != 0) { + return -EINVAL; + } + + max_chunk = 0x7ffff000ULL - (0x7ffff000ULL % step); /* Linux caps one read/write call at 0x7ffff000 bytes; stay a multiple of step below it */ + if (max_chunk == 0) { + return -EINVAL; + } + + while (transferred < size) { + uint64_t chunk = size - transferred; + uint64_t done = 0; + + if (chunk > max_chunk) { + chunk = max_chunk; + } + + while (done < chunk) { + size_t remaining = (size_t)(chunk - done); + off_t dev_offset = (off_t)(phys_addr + offset + transferred + done); + uint8_t *ptr = (uint8_t *)buf + offset + transferred + done; + ssize_t ret; + + if (to_device) { + ret = pwrite(fd, ptr, remaining, dev_offset); + } else { + ret = pread(fd, ptr, remaining, dev_offset); + } + + if (ret < 0 && errno == EINTR) { 
+ continue; + } + if (ret <= 0) { + return -EIO; + } + done += (uint64_t)ret; + } + + transferred += chunk; + } + + return 0; +} enum vrtd_ret vrtd_buffer_create_raw( int sock_fd, @@ -84,29 +210,57 @@ enum vrtd_ret vrtd_buffer_create_raw( return VRTD_RET_INTERNAL_ERROR; } - buffer->buf = mmap( - NULL, /* address (let the kernel choose) */ - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, - -1, /* fd */ - 0 /* offset */ - ); - if (buffer->buf == MAP_FAILED) { - // Huge pages are an optimization, not a hard requirement. - // Fall back to normal anonymous mapping when hugepage mmap fails. + buffer->buf = MAP_FAILED; + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; + + if ((size % HUGE_TRANSFER_STEP_SIZE) == 0 && + (phys_addr % HUGE_TRANSFER_STEP_SIZE) == 0) { buffer->buf = mmap( NULL, /* address (let the kernel choose) */ size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, -1, /* fd */ 0 /* offset */ ); - if (buffer->buf == MAP_FAILED) { + if (buffer->buf != MAP_FAILED) { + buffer->transfer_step_size = HUGE_TRANSFER_STEP_SIZE; + } + } + + if (buffer->buf == MAP_FAILED) { + int huge_errno = errno; + // Huge pages are an optimization, not a hard requirement. + // Fall back to normal anonymous mapping when hugepage mmap fails. Do + // not use MAP_POPULATE before MADV_NOHUGEPAGE: THP=always can fault + // compound pages before the advice takes effect, and the kernel QDMA + // base-page path intentionally rejects those pages. 
+ int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); + if (mmap_ret != 0) { free(buffer); return VRTD_RET_INTERNAL_ERROR; } + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; +#if SLASH_QDMA_TIMING + syslog( + LOG_INFO, + "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu huge_errno=%d", + (unsigned long long)size, + (unsigned long long)phys_addr, + (unsigned long long)buffer->transfer_step_size, + huge_errno + ); +#endif + } else { +#if SLASH_QDMA_TIMING + syslog( + LOG_INFO, + "libvrtd: buffer host mapping path=hugetlb-2m size=%llu phys_addr=0x%llx step=%llu", + (unsigned long long)size, + (unsigned long long)phys_addr, + (unsigned long long)buffer->transfer_step_size + ); +#endif } buffer->sock_fd = sock_fd; @@ -123,6 +277,61 @@ enum vrtd_ret vrtd_buffer_create_raw( return VRTD_RET_OK; } +static enum vrtd_ret vrtd_buffer_prepare_sync_range( + const struct vrtd_buffer *buffer, + uint64_t offset, + uint64_t size, + uint64_t *aligned_offset_out, + uint64_t *aligned_size_out, + bool *needs_bounce_out +) { + uint64_t step; + uint64_t end; + uint64_t aligned_offset; + uint64_t aligned_end; + + if (buffer == NULL || aligned_offset_out == NULL || + aligned_size_out == NULL || needs_bounce_out == NULL) { + return VRTD_RET_BAD_LIB_CALL; + } + + step = buffer->transfer_step_size; + if (step == 0) { + return VRTD_RET_INVALID_ARGUMENT; + } + + if (offset > buffer->size || size > buffer->size - offset) { + return VRTD_RET_INVALID_ARGUMENT; + } + + if (size == 0) { + *aligned_offset_out = offset; + *aligned_size_out = 0; + *needs_bounce_out = false; + return VRTD_RET_OK; + } + + if ((buffer->size % step) != 0 || (buffer->phys_addr % step) != 0) { + return VRTD_RET_INVALID_ARGUMENT; + } + + end = offset + size; + aligned_offset = offset - (offset % step); + if (end > UINT64_MAX - (step - 1)) { + return VRTD_RET_INVALID_ARGUMENT; + } + aligned_end = ((end + step - 1) / step) * step; + if (aligned_end > buffer->size) { + return 
VRTD_RET_INVALID_ARGUMENT; + } + + *aligned_offset_out = aligned_offset; + *aligned_size_out = aligned_end - aligned_offset; + *needs_bounce_out = (aligned_offset != offset || aligned_end != end); + + return VRTD_RET_OK; +} + enum vrtd_ret vrtd_buffer_destroy( struct vrtd_buffer *buffer ) { @@ -191,29 +400,72 @@ enum vrtd_ret vrtd_buffer_sync_to_device( assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); - assert(buffer->size % TRANSFER_STEP_SIZE == 0); - assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0); + uint64_t aligned_offset = 0; + uint64_t aligned_size = 0; + bool needs_bounce = false; + enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range( + buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce); + if (range_ret != VRTD_RET_OK) { + return range_ret; + } + if (aligned_size == 0) { + return VRTD_RET_OK; + } - uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE); - uint64_t end_offset = offset + size; + uint64_t step = buffer->transfer_step_size; +#if SLASH_QDMA_TIMING + uint64_t sync_start_ns = vrtd_now_ns(); +#endif - off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET); - if (ret == -1) { + int transfer_ret; + if (needs_bounce && buffer->alloc_dir == VRTD_ALLOC_DIR_BIDIRECTIONAL) { + void *bounce = NULL; + int mmap_ret = vrtd_mmap_regular_base_pages(aligned_size, &bounce); + if (mmap_ret != 0) { + return VRTD_RET_INTERNAL_ERROR; + } + + transfer_ret = vrtd_transfer_pages( + buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, + 0, aligned_size, BASE_TRANSFER_STEP_SIZE, false); + if (transfer_ret == 0) { + memcpy( + (uint8_t *)bounce + (offset - aligned_offset), + (uint8_t *)buffer->buf + offset, + size + ); + transfer_ret = vrtd_transfer_pages( + buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, + 0, aligned_size, BASE_TRANSFER_STEP_SIZE, true); + } + (void) munmap(bounce, aligned_size); + } else { + /* + * Host-to-device-only buffers cannot read the surrounding 
device + * granule for a read-modify-write, so keep the historical behavior: + * expand partial syncs to the backing DMA granule. + */ + transfer_ret = vrtd_transfer_pages( + buffer->qpair_fd, buffer->buf, buffer->phys_addr, + aligned_offset, aligned_size, step, true); + } + if (transfer_ret != 0) { return VRTD_RET_INTERNAL_ERROR; } - for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) { - ssize_t bytes_written = 0; - while (bytes_written < TRANSFER_STEP_SIZE) { - ssize_t bw = write(buffer->qpair_fd, - (uint8_t *) buffer->buf + curr_offset + bytes_written, - TRANSFER_STEP_SIZE - bytes_written); - if (bw == -1) { - return VRTD_RET_INTERNAL_ERROR; - } - bytes_written += bw; - } +#if SLASH_QDMA_TIMING + { + uint64_t total_ns = vrtd_now_ns() - sync_start_ns; + double mb = (double) size / (1024.0 * 1024.0); + double sec = (double) total_ns / 1e9; + syslog(LOG_INFO, + "libvrtd: timing H2C sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)", + (unsigned long long) offset, (unsigned long long) size, + (unsigned long long) aligned_offset, (unsigned long long) aligned_size, + (unsigned long long) step, (unsigned long long) total_ns, + sec > 0.0 ? 
mb / sec : 0.0); } +#endif return VRTD_RET_OK; } @@ -233,29 +485,64 @@ enum vrtd_ret vrtd_buffer_sync_from_device( assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); - assert(buffer->size % TRANSFER_STEP_SIZE == 0); - assert(buffer->phys_addr % TRANSFER_STEP_SIZE == 0); + uint64_t aligned_offset = 0; + uint64_t aligned_size = 0; + bool needs_bounce = false; + enum vrtd_ret range_ret = vrtd_buffer_prepare_sync_range( + buffer, offset, size, &aligned_offset, &aligned_size, &needs_bounce); + if (range_ret != VRTD_RET_OK) { + return range_ret; + } + if (aligned_size == 0) { + return VRTD_RET_OK; + } + + uint64_t step = buffer->transfer_step_size; +#if SLASH_QDMA_TIMING + uint64_t sync_start_ns = vrtd_now_ns(); +#endif - uint64_t effective_offset = offset - (offset % TRANSFER_STEP_SIZE); - uint64_t end_offset = offset + size; + int transfer_ret; + if (needs_bounce) { + void *bounce = NULL; + int mmap_ret = vrtd_mmap_regular_base_pages(aligned_size, &bounce); + if (mmap_ret != 0) { + return VRTD_RET_INTERNAL_ERROR; + } - off_t ret = lseek(buffer->qpair_fd, buffer->phys_addr + effective_offset, SEEK_SET); - if (ret == -1) { + transfer_ret = vrtd_transfer_pages( + buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, + 0, aligned_size, BASE_TRANSFER_STEP_SIZE, false); + if (transfer_ret == 0) { + memcpy( + (uint8_t *)buffer->buf + offset, + (uint8_t *)bounce + (offset - aligned_offset), + size + ); + } + (void) munmap(bounce, aligned_size); + } else { + transfer_ret = vrtd_transfer_pages( + buffer->qpair_fd, buffer->buf, buffer->phys_addr, + aligned_offset, aligned_size, step, false); + } + if (transfer_ret != 0) { return VRTD_RET_INTERNAL_ERROR; } - for (uint64_t curr_offset = effective_offset; curr_offset < end_offset; curr_offset += TRANSFER_STEP_SIZE) { - ssize_t bytes_read = 0; - while (bytes_read < TRANSFER_STEP_SIZE) { - ssize_t br = read(buffer->qpair_fd, - (uint8_t *) buffer->buf + curr_offset + bytes_read, - TRANSFER_STEP_SIZE - bytes_read); - if 
(br == -1) { - return VRTD_RET_INTERNAL_ERROR; - } - bytes_read += br; - } +#if SLASH_QDMA_TIMING + { + uint64_t total_ns = vrtd_now_ns() - sync_start_ns; + double mb = (double) size / (1024.0 * 1024.0); + double sec = (double) total_ns / 1e9; + syslog(LOG_INFO, + "libvrtd: timing C2H sync offset=%llu size=%llu aligned_offset=%llu aligned_size=%llu step=%llu total=%llu ns (%.1f MB/s)", + (unsigned long long) offset, (unsigned long long) size, + (unsigned long long) aligned_offset, (unsigned long long) aligned_size, + (unsigned long long) step, (unsigned long long) total_ns, + sec > 0.0 ? mb / sec : 0.0); } +#endif return VRTD_RET_OK; } From 91cc4f39ba18c53b71eebe1a15cf7c602ea5b379 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 8 Jun 2026 10:12:38 +0100 Subject: [PATCH 04/23] smi: validate bandwidth modes with raw SLASH and stock qdma backends Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/smi/commands.rst | 60 ++- smi/CMakeLists.txt | 5 + smi/README.md | 92 +++- smi/src/CMakeLists.txt | 21 + smi/src/qdma_driver_backend.cpp | 542 +++++++++++++++++++ smi/src/qdma_driver_backend.hpp | 152 ++++++ smi/src/raw_transfer.hpp | 266 +++++++++ smi/src/smi.cpp | 14 + smi/src/validate.cpp | 926 +++++++++++++++++++++++++++++--- smi/src/validate.hpp | 11 +- 10 files changed, 1993 insertions(+), 96 deletions(-) create mode 100644 smi/src/qdma_driver_backend.cpp create mode 100644 smi/src/qdma_driver_backend.hpp create mode 100644 smi/src/raw_transfer.hpp diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index 563a81b5..989803b3 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -151,11 +151,27 @@ validate -------- Run memory integrity and bandwidth tests against a board's HBM and DDR -subsystems. +subsystems. For each memory path, bandwidth is reported as single-direction +C2H read, single-direction H2C write, and simultaneous bidirectional +throughput (read, write, and total). 
After the per-memory phases, a final +parallel phase drives HBM and DDR simultaneously with ``2 * N`` buffers for +single-direction tests and ``4 * N`` threads for bidirectional tests; this +phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. code-block:: text - v80-smi validate -d [-j|--threads ] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] + +Requirements by mode: + +* Default mode uses VRTD buffers, requires a running VRTD daemon, and resets + the board unless ``--no-reset`` is given. +* ``--raw-transfer-test`` bypasses VRTD for transfers and requires the SLASH + QDMA driver device node for the board. It skips reset. +* ``--use-qdma-driver`` bypasses both VRTD and SLASH for transfers and requires + the stock ``qdma-pf`` driver to be bound to the board's QDMA PF. This backend + is built only when ``SMI_ENABLE_QDMA_DRIVER_BACKEND`` is enabled at CMake + configure time. .. option:: -d, --device @@ -164,6 +180,46 @@ subsystems. .. option:: -j, --threads Number of parallel buffers/threads for the validation test (1–64, default 8). + Each buffer is 512 MB (one HBM/DDR allocator region). The bidirectional HBM + phase uses ``2 * N`` HBM regions, so values above 32 require ``--ddr-only``. + The largest phase maps up to ``4 * N * 512 MB`` of host buffers when both + HBM and DDR are enabled, or ``2 * N * 512 MB`` with ``--ddr-only`` or + ``--hbm-only``; the command fails early if that exceeds currently available + host memory. + +.. option:: -R, --no-reset + + Skip the device reset step before running memory tests. + +.. option:: --raw-transfer-test + + Use libslash raw QDMA transfers instead of VRTD buffers. This mode implies + ``--no-reset`` and requires the SLASH QDMA driver device to be present. + +.. option:: --use-qdma-driver + + Run the raw transfer test over the off-the-shelf Xilinx QDMA driver + (``/dev/qdma-MM-``) instead of SLASH. 
smi provisions the queues + itself: it raises the function's ``qmax`` via sysfs if needed, creates and + starts bidirectional AXI-MM queue pairs over generic netlink (the same + ``xnl_pf`` interface ``dma-ctl`` uses), then transfers over the per-queue + char devices. Queue pairs are spread round-robin across the function's MM + engine channels (``channel = qid % mm_channel_max``); the CPM5 QDMA on the + V80 exposes two, so the test exercises both. This mode implies + ``--no-reset`` and is mutually exclusive with ``--raw-transfer-test``. It + requires the stock ``qdma-pf`` driver to be bound to the board's PF (it + cannot be bound at the same time as the SLASH driver), and typically + requires root to raise ``qmax`` and open the queue devices. + +.. option:: --ddr-only + + Run only the DDR memory tests and skip the HBM phase. Mutually exclusive + with ``--hbm-only``. + +.. option:: --hbm-only + + Run only the HBM memory tests and skip the DDR phase. Mutually exclusive + with ``--ddr-only``. debug ----- diff --git a/smi/CMakeLists.txt b/smi/CMakeLists.txt index 58cf9771..46ae8c16 100644 --- a/smi/CMakeLists.txt +++ b/smi/CMakeLists.txt @@ -40,6 +40,7 @@ project( ) option(SMI_INCLUDE_VRT "Include vrtd as subdirectory instead of building from system" OFF) +option(SMI_ENABLE_QDMA_DRIVER_BACKEND "Build validate --use-qdma-driver backend" ON) include(GNUInstallDirs) @@ -55,6 +56,10 @@ if(NOT TARGET vrt::vrt) "Build and install vrt first (cmake --install), then configure smi again.") endif() +if(NOT TARGET slash::slash) + find_package(slash REQUIRED CONFIG) +endif() + find_package(CLI11 CONFIG REQUIRED) configure_file( diff --git a/smi/README.md b/smi/README.md index d528ed92..16fb6beb 100644 --- a/smi/README.md +++ b/smi/README.md @@ -178,43 +178,91 @@ programmed with the static SLASH design. ### validate -Reset a board, then test HBM and DDR memory for data integrity and -bandwidth. 
+Optionally reset a board, then test HBM and DDR memory for data integrity and +bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer +path for data movement. ``` -v80-smi validate -d [-j ] +v80-smi validate -d [-j ] [-R] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] ``` | Flag | Description | |-------------------|------------------------------------------------------| | `-d,--device` | Board address (required), e.g. `03:00` or `0000:03:00` | -| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8) | - -Each buffer is 64 MB. The integrity test writes a pattern, syncs to -device, clears host memory, syncs back, and verifies. The bandwidth -test runs parallel H2C writes and C2H reads. +| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional HBM needs `2 * threads` HBM regions, so values above 32 require `--ddr-only`. | +| `-R,--no-reset` | Skip the device reset step before running memory tests | +| `--raw-transfer-test` | Use libslash raw QDMA transfers instead of VRTD buffers; implies `--no-reset` | +| `--use-qdma-driver` | Run the raw transfer test over the off-the-shelf Xilinx QDMA driver instead of SLASH; implies `--no-reset`; mutually exclusive with `--raw-transfer-test` | +| `--ddr-only` | Run only DDR memory tests (skip HBM); mutually exclusive with `--hbm-only` | +| `--hbm-only` | Run only HBM memory tests (skip DDR); mutually exclusive with `--ddr-only` | + +Each buffer is 512 MB (one HBM/DDR allocator region). The integrity test +writes a pattern, syncs to device, clears host memory, syncs back, and +verifies. Each bandwidth +phase reports single-direction C2H reads, single-direction H2C writes, +and simultaneous bidirectional throughput (read, write, and total). 
After +the per-memory phases, a final parallel phase drives HBM and DDR together +using `2 x ` buffers for single-direction tests and `4 x ` +threads for bidirectional tests; it is skipped when `--ddr-only` or +`--hbm-only` is given. With `--raw-transfer-test`, the command bypasses +VRTD for transfers and opens the board's SLASH QDMA device directly, so +the SLASH QDMA driver node must be present. + +Each buffer is 512 MB. The largest phase maps up to +`4 x x 512 MB` of host buffers when HBM and DDR are both enabled, +or `2 x x 512 MB` with `--ddr-only` or `--hbm-only`; `validate` +fails early if that footprint exceeds currently available host memory. + +With `--use-qdma-driver`, the command runs the same raw test over the +off-the-shelf Xilinx QDMA driver (`submodules/qdma_drv`) instead of SLASH. +smi provisions the queues itself: it raises the function's `qmax` via sysfs +if needed, creates and starts bidirectional AXI-MM queue pairs over generic +netlink (the same `xnl_pf` interface `dma-ctl` uses), then transfers over the +per-queue char devices `/dev/qdma-MM-`. This requires the stock +`qdma-pf` driver to be bound to the board's PF (it cannot be bound at the same +time as the SLASH driver), and typically needs root to raise `qmax` and open +the queue devices. The device memory addresses tested (HBM/DDR) are the same +AXI addresses used by the SLASH path. + +Requirements depend on the selected mode: the default path needs VRTD and root +for reset unless `--no-reset` is used; `--raw-transfer-test` needs the SLASH +QDMA driver node; `--use-qdma-driver` needs a build with +`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON` and the stock QDMA driver bound to the +board. ```console $ v80-smi validate -d 03:00 Resetting device 0000:03:00... Testing HBM data integrity (8 regions)... - HBM0: OK - HBM1: OK - ... -Testing HBM bandwidth (8 threads)... + 8/8 OK +Testing HBM read bandwidth (8 threads)... + Read: 9547.22 MB/s +Testing HBM write bandwidth (8 threads)... 
Write: 9832.10 MB/s - Read: 9547.22 MB/s +Testing HBM bidirectional bandwidth (16 threads)... + Read: 9210.15 MB/s + Write: 9475.81 MB/s + Total: 18685.96 MB/s Testing DDR data integrity (8 buffers)... - DDR0: OK - DDR1: OK - ... -Testing DDR bandwidth (8 threads)... + 8/8 OK +Testing DDR read bandwidth (8 threads)... + Read: 4980.33 MB/s +Testing DDR write bandwidth (8 threads)... Write: 5120.45 MB/s - Read: 4980.33 MB/s +Testing DDR bidirectional bandwidth (16 threads)... + Read: 4860.12 MB/s + Write: 5012.34 MB/s + Total: 9872.46 MB/s +Testing HBM+DDR read bandwidth (16 threads)... + Read: 11890.55 MB/s +Testing HBM+DDR write bandwidth (16 threads)... + Write: 12450.78 MB/s +Testing HBM+DDR bidirectional bandwidth (32 threads)... + Read: 11340.12 MB/s + Write: 12020.34 MB/s + Total: 23360.46 MB/s ``` -Requires root access and a running VRTD daemon. - ### debug bar-poke Perform low-level BAR reads or writes for troubleshooting. @@ -364,6 +412,8 @@ since v80-smi always operates at board granularity. 
|------------|--------------------------------------------------| | libvrt | VRT runtime library (device, kernel, vrtbin APIs) | | vrtd | Runtime daemon (sensors, reset, validate, query) | +| libslash | Raw SLASH QDMA backend for `validate --raw-transfer-test` | +| qdma_nl.h | Optional stock QDMA-driver backend (`SMI_ENABLE_QDMA_DRIVER_BACKEND=ON`) | ## Project layout @@ -376,6 +426,8 @@ smi/ program.cpp/hpp Device programming reset.cpp/hpp Hardware reset via VRTD validate.cpp/hpp Memory integrity and bandwidth testing + raw_transfer.hpp Shared raw QDMA host mapping and transfer helpers + qdma_driver_backend.cpp/hpp Optional stock QDMA-driver validate backend debug/bar_poke.cpp/hpp BAR read/write debug command debug/mem_poke.cpp/hpp Raw device memory read/write command debug/clockwiz.cpp/hpp Clock read/set debug command diff --git a/smi/src/CMakeLists.txt b/smi/src/CMakeLists.txt index 30e509aa..ad0c721a 100644 --- a/smi/src/CMakeLists.txt +++ b/smi/src/CMakeLists.txt @@ -32,6 +32,21 @@ add_executable( smi.cpp ) +if(SMI_ENABLE_QDMA_DRIVER_BACKEND) + target_sources(v80-smi PRIVATE qdma_driver_backend.cpp) + + # Off-the-shelf Xilinx QDMA driver netlink UAPI header (qdma_nl.h), used by + # the --use-qdma-driver validate backend. + set(QDMA_DRV_APPS_INCLUDE + "${CMAKE_CURRENT_SOURCE_DIR}/../../submodules/qdma_drv/QDMA/linux-kernel/apps/include") + if(NOT EXISTS "${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h") + message(FATAL_ERROR + "Missing ${QDMA_DRV_APPS_INCLUDE}/qdma_nl.h. 
" + "Initialize submodules (git submodule update --init submodules/qdma_drv), " + "or configure with -DSMI_ENABLE_QDMA_DRIVER_BACKEND=OFF.") + endif() +endif() + target_compile_features(v80-smi PRIVATE cxx_std_20) target_include_directories( @@ -43,11 +58,17 @@ target_include_directories( ${CMAKE_CURRENT_BINARY_DIR}/../generated # For version.hpp ) +if(SMI_ENABLE_QDMA_DRIVER_BACKEND) + target_include_directories(v80-smi PRIVATE ${QDMA_DRV_APPS_INCLUDE}) + target_compile_definitions(v80-smi PRIVATE SMI_ENABLE_QDMA_DRIVER_BACKEND=1) +endif() + target_link_libraries( v80-smi PRIVATE vrt::vrt + slash::slash CLI11::CLI11 ) diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp new file mode 100644 index 00000000..1c8c5c27 --- /dev/null +++ b/smi/src/qdma_driver_backend.cpp @@ -0,0 +1,542 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/// @file qdma_driver_backend.cpp +/// @brief Implementation of the off-the-shelf QDMA-driver raw-transfer backend. + +#include "qdma_driver_backend.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// qdma_nl.h defines unused file-scope static lookup arrays (xnl_attr_str / +// xnl_op_str); silence the resulting -Wunused warnings without touching the +// vendored upstream header. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-const-variable" +extern "C" { +#include +} +#pragma GCC diagnostic pop + +#include "bdf.hpp" + +namespace smi::qdma_driver { + +namespace { + +/// Generous receive buffer: the device list dump grows with the number of +/// queues/functions, so keep this comfortably larger than XNL_RESP_BUFLEN_MAX. +constexpr size_t RESP_BUF_LEN = 256 * 1024; + +[[noreturn]] void throwSystemError(const std::string& message) { + throw std::runtime_error(message + ": " + std::strerror(errno)); +} + +} // namespace + +/// Minimal generic-netlink client for the QDMA driver's "xnl_pf" family. +/// +/// This is a focused port of the netlink plumbing in the upstream `dma-ctl` +/// utility (QDMA/linux-kernel/apps/dma-utils/dmactl.c): resolve the family id, +/// send a command carrying a handful of u32 attributes, and parse the reply's +/// attributes / generic message text. 
+class XnlClient { +public: + XnlClient() { + fd_ = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + if (fd_ < 0) { + throwSystemError("Failed to open QDMA netlink socket"); + } + + struct sockaddr_nl addr{}; + addr.nl_family = AF_NETLINK; + if (bind(fd_, reinterpret_cast(&addr), sizeof(addr)) < 0) { + const int err = errno; + close(fd_); + fd_ = -1; + errno = err; + throwSystemError("Failed to bind QDMA netlink socket"); + } + + // Don't block forever if the driver isn't present / doesn't answer. + struct timeval tv{}; + tv.tv_sec = 5; + tv.tv_usec = 0; + (void)setsockopt(fd_, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + + family_ = resolveFamily(XNL_NAME_PF); + } + + ~XnlClient() { + if (fd_ >= 0) { + close(fd_); + } + } + + XnlClient(const XnlClient&) = delete; + XnlClient& operator=(const XnlClient&) = delete; + + /// Parsed netlink response: scalar attributes plus any generic message text. + struct Response { + std::array attrs{}; + std::array present{}; + std::string genmsg; + }; + + /// Send command @p op for device index @p devIndex with the given u32 + /// attributes (DEV_IDX and a response-buffer-length hint are added + /// automatically) and return the parsed response. + Response sendCmd(uint8_t op, uint32_t devIndex, + const std::vector>& attrs) { + std::vector buf(RESP_BUF_LEN, 0); + auto* n = reinterpret_cast(buf.data()); + + n->nlmsg_type = family_; + n->nlmsg_flags = NLM_F_REQUEST; + n->nlmsg_pid = getpid(); + n->nlmsg_seq = seq_++; + n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + + auto* g = reinterpret_cast(NLMSG_DATA(n)); + g->cmd = op; + g->version = XNL_VERSION; + + addIntAttr(n, XNL_ATTR_DEV_IDX, devIndex); + for (const auto& [type, val] : attrs) { + addIntAttr(n, type, val); + } + // Tell the kernel how large a response we can accept. 
+ addIntAttr(n, XNL_ATTR_RSP_BUF_LEN, static_cast(buf.size())); + + sendMsg(n); + return recvMsg(buf); + } + +private: + static uint16_t alignedAttrLen(uint16_t payload) { + return static_cast(NLA_HDRLEN + payload); + } + + static void addIntAttr(struct nlmsghdr* n, uint16_t type, uint32_t value) { + auto* attr = reinterpret_cast(reinterpret_cast(n) + n->nlmsg_len); + attr->nla_type = type; + attr->nla_len = alignedAttrLen(sizeof(uint32_t)); + std::memcpy(reinterpret_cast(attr) + NLA_HDRLEN, &value, sizeof(value)); + n->nlmsg_len += NLMSG_ALIGN(attr->nla_len); + } + + static void addStrAttr(struct nlmsghdr* n, uint16_t type, const char* s) { + auto* attr = reinterpret_cast(reinterpret_cast(n) + n->nlmsg_len); + const size_t len = std::strlen(s) + 1; + attr->nla_type = type; + attr->nla_len = alignedAttrLen(static_cast(len)); + std::memcpy(reinterpret_cast(attr) + NLA_HDRLEN, s, len); + n->nlmsg_len += NLMSG_ALIGN(attr->nla_len); + } + + void sendMsg(struct nlmsghdr* n) { + struct sockaddr_nl addr{}; + addr.nl_family = AF_NETLINK; + ssize_t rv = sendto(fd_, n, n->nlmsg_len, 0, + reinterpret_cast(&addr), sizeof(addr)); + if (rv < 0 || static_cast(rv) != n->nlmsg_len) { + throwSystemError("QDMA netlink send failed"); + } + } + + Response recvMsg(std::vector& buf) { + std::memset(buf.data(), 0, buf.size()); + ssize_t rv = recv(fd_, buf.data(), buf.size(), 0); + if (rv < 0) { + throwSystemError("QDMA netlink receive failed"); + } + + auto* n = reinterpret_cast(buf.data()); + if (n->nlmsg_type == NLMSG_ERROR) { + int err = 0; + if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + auto* nlerr = reinterpret_cast(NLMSG_DATA(n)); + err = nlerr->error; + } + throw std::runtime_error("QDMA netlink returned an error response (" + + std::to_string(err) + ")"); + } + + Response resp; + auto* p = reinterpret_cast(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN); + int maxlen = static_cast(n->nlmsg_len) - static_cast(NLMSG_LENGTH(GENL_HDRLEN)); + while (maxlen > 0) { + auto* na = 
reinterpret_cast(p); + if (na->nla_len < NLA_HDRLEN) { + break; + } + const int len = NLA_ALIGN(na->nla_len); + const char* payload = reinterpret_cast(na) + NLA_HDRLEN; + + if (na->nla_type == XNL_ATTR_GENMSG) { + resp.genmsg.assign(payload); + } else if (na->nla_type < XNL_ATTR_MAX) { + uint32_t v = 0; + std::memcpy(&v, payload, sizeof(v)); + resp.attrs[na->nla_type] = v; + resp.present[na->nla_type] = true; + } + + p += len; + maxlen -= len; + } + return resp; + } + + uint16_t resolveFamily(const char* name) { + std::vector buf(RESP_BUF_LEN, 0); + auto* n = reinterpret_cast(buf.data()); + + n->nlmsg_type = GENL_ID_CTRL; + n->nlmsg_flags = NLM_F_REQUEST; + n->nlmsg_pid = getpid(); + n->nlmsg_seq = seq_++; + n->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + + auto* g = reinterpret_cast(NLMSG_DATA(n)); + g->cmd = CTRL_CMD_GETFAMILY; + g->version = XNL_VERSION; + + addStrAttr(n, CTRL_ATTR_FAMILY_NAME, name); + sendMsg(n); + + std::memset(buf.data(), 0, buf.size()); + ssize_t rv = recv(fd_, buf.data(), buf.size(), 0); + if (rv < 0) { + throwSystemError(std::string("Failed to resolve QDMA netlink family '") + name + + "' (is the upstream qdma driver loaded?)"); + } + if (n->nlmsg_type == NLMSG_ERROR) { + int err = 0; + if (n->nlmsg_len >= NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + auto* nlerr = reinterpret_cast(NLMSG_DATA(n)); + err = nlerr->error; + } + throw std::runtime_error(std::string("QDMA netlink family '") + name + + "' not found (netlink error " + std::to_string(err) + + "; is the upstream qdma driver loaded?)"); + } + + auto* p = reinterpret_cast(buf.data()) + NLMSG_LENGTH(GENL_HDRLEN); + int maxlen = static_cast(n->nlmsg_len) - static_cast(NLMSG_LENGTH(GENL_HDRLEN)); + while (maxlen > 0) { + auto* na = reinterpret_cast(p); + if (na->nla_len < NLA_HDRLEN) { + break; + } + if (na->nla_type == CTRL_ATTR_FAMILY_ID) { + uint16_t id = 0; + std::memcpy(&id, reinterpret_cast(na) + NLA_HDRLEN, sizeof(id)); + return id; + } + const int len = NLA_ALIGN(na->nla_len); + p += 
len; + maxlen -= len; + } + throw std::runtime_error(std::string("QDMA netlink family '") + name + + "' id not present in response"); + } + + int fd_ = -1; + uint16_t family_ = 0; + uint32_t seq_ = 0; +}; + +namespace { + +/// Queue flags for a bidirectional AXI-MM queue pair. +constexpr uint32_t QFLAG_MM_BI = XNL_F_QMODE_MM | XNL_F_QDIR_BOTH; + +/// Queue flags for `q start`. In addition to mode/direction, this must enable +/// the descriptor-ring writeback/completion-status reporting and fetch credit, +/// exactly as `dma-ctl q start` does by default (see +/// QDMA/linux-kernel/apps/dma-ctl/cmd_parse.c). Without the writeback bits the +/// poll-mode driver never observes MM completion and every transfer times out. +constexpr uint32_t QFLAG_MM_BI_START = + QFLAG_MM_BI | + XNL_F_CMPL_STATUS_EN | XNL_F_CMPL_STATUS_ACC_EN | + XNL_F_CMPL_STATUS_PEND_CHK | XNL_F_CMPL_STATUS_DESC_EN | + XNL_F_FETCH_CREDIT; + +/// Default descriptor-ring size index for `q start`, matching `dma-ctl`'s +/// default ("ring size set to 2048"). +constexpr uint32_t QRNGSZ_IDX_DEFAULT = 9; + +} // namespace + +QdmaDriverDevice::QdmaDriverDevice(const std::string& boardBdf) + : nl_(std::make_unique()) { + const ParsedBdf board = parseBdf(boardBdf); + + // Enumerate the driver's devices and find the QDMA function on this board. + // Each PF line looks like: "qdma61001\t0000:61:00.1\tmax QP: 512, 0~511". + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {}); + if (resp.genmsg.empty()) { + throw std::runtime_error( + "Upstream QDMA driver reported no devices (dev list empty). 
" + "Ensure the stock qdma driver is bound to the board."); + } + + bool found = false; + std::istringstream lines(resp.genmsg); + std::string line; + while (std::getline(lines, line)) { + std::istringstream tokens(line); + std::string name; + std::string bdfStr; + if (!(tokens >> name >> bdfStr)) { + continue; + } + if (name.rfind("qdma", 0) != 0 || name.rfind("qdmavf", 0) == 0) { + continue; // not a PF entry + } + + ParsedBdf entry; + try { + entry = parseBdf(bdfStr); + } catch (const std::exception&) { + continue; + } + if (entry.base() != board.base()) { + continue; + } + + index_ = static_cast(std::stoul(name.substr(4), nullptr, 16)); + functionBdf_ = bdfStr; + + const auto pos = line.find("max QP:"); + if (pos != std::string::npos) { + qmax_ = static_cast(std::strtoul(line.c_str() + pos + 7, nullptr, 10)); + } + found = true; + if (entry.function.value_or(0) == 1) { + break; // Prefer the QDMA PF used by SLASH/V80. + } + } + + if (!found) { + throw std::runtime_error( + "No upstream QDMA function found for board " + board.base() + + " (is the stock qdma driver bound to this board's PF?)"); + } + + // Ask the driver how many MM (memory-mapped) DMA engine channels this + // function exposes so we can spread queues across them. CPM5 (V80) + // reports 2; older/soft IPs report 1. Best-effort: if the query fails or + // the attribute is absent, fall back to a single channel (channel 0). 
+ try { + XnlClient::Response info = nl_->sendCmd(XNL_CMD_DEV_INFO, index_, {}); + if (info.present[XNL_ATTR_DEV_MM_CHANNEL_MAX] && + info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX] > 0) { + mmChannelMax_ = info.attrs[XNL_ATTR_DEV_MM_CHANNEL_MAX]; + } + } catch (const std::exception&) { + mmChannelMax_ = 1; + } +} + +QdmaDriverDevice::~QdmaDriverDevice() = default; + +void QdmaDriverDevice::refreshQmax() { + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_DEV_LIST, /*devIndex=*/0, {}); + std::istringstream lines(resp.genmsg); + std::string line; + + while (std::getline(lines, line)) { + std::istringstream tokens(line); + std::string name; + std::string bdfStr; + if (!(tokens >> name >> bdfStr) || bdfStr != functionBdf_) { + continue; + } + + const auto pos = line.find("max QP:"); + if (pos == std::string::npos) { + throw std::runtime_error("QDMA device list entry for " + functionBdf_ + + " does not report max QP"); + } + + qmax_ = static_cast(std::strtoul(line.c_str() + pos + 7, nullptr, 10)); + return; + } + + throw std::runtime_error("QDMA function " + functionBdf_ + + " disappeared from driver device list after qmax update"); +} + +void QdmaDriverDevice::ensureQmax(unsigned needed) { + if (qmax_ >= needed) { + return; + } + + const std::string path = "/sys/bus/pci/devices/" + functionBdf_ + "/qdma/qmax"; + std::ofstream qmaxFile(path); + if (!qmaxFile.is_open()) { + throw std::runtime_error( + "Need at least " + std::to_string(needed) + " queues but qmax is " + + std::to_string(qmax_) + " and cannot open " + path + + " to raise it (run as root, or set qmax manually with dma-ctl)"); + } + qmaxFile << needed << std::endl; + qmaxFile.close(); + if (qmaxFile.fail()) { + throw std::runtime_error( + "Failed to write qmax=" + std::to_string(needed) + " to " + path + + " (queues may be active; stop them or reload the driver)"); + } + refreshQmax(); + if (qmax_ < needed) { + throw std::runtime_error( + "QDMA qmax update requested " + std::to_string(needed) + + " queues, but 
driver reports only " + std::to_string(qmax_)); + } +} + +void QdmaDriverDevice::queueAdd(uint32_t qid) { + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_ADD, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) { + throw std::runtime_error("QDMA q add failed for qid " + std::to_string(qid) + ": " + + (resp.genmsg.empty() ? "netlink error" : resp.genmsg)); + } +} + +void QdmaDriverDevice::queueStart(uint32_t qid) { + // Round-robin the queue pair across the function's MM engine channels. + // This has to be carried on `q start`: the driver only reads + // XNL_ATTR_MM_CHANNEL in its start handler (via qdma_queue_config) and + // defaults the queue to channel 0 whenever the attribute is absent. + // mmChannelMax_ is always >= 1, so the modulo is safe. + const uint32_t channel = qid % mmChannelMax_; + XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_START, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI_START}, + {XNL_ATTR_QRNGSZ_IDX, QRNGSZ_IDX_DEFAULT}, {XNL_ATTR_MM_CHANNEL, channel}}); + if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) { + throw std::runtime_error("QDMA q start failed for qid " + std::to_string(qid) + ": " + + (resp.genmsg.empty() ? "netlink error" : resp.genmsg)); + } +} + +void QdmaDriverDevice::queueStop(uint32_t qid) noexcept { + try { + (void)nl_->sendCmd(XNL_CMD_Q_STOP, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + } catch (...) { + // Best-effort teardown. + } +} + +void QdmaDriverDevice::queueDel(uint32_t qid) noexcept { + try { + (void)nl_->sendCmd(XNL_CMD_Q_DEL, index_, + {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI}}); + } catch (...) { + // Best-effort teardown. 
+ } +} + +std::string QdmaDriverDevice::charDevPath(uint32_t qid) const { + char name[64]; + std::snprintf(name, sizeof(name), "/dev/qdma%05x-MM-%u", index_, qid); + return std::string(name); +} + +QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, + uint64_t physAddr, uint64_t size) + : device_(&device), qid_(qid), physAddr_(physAddr) { + try { + mapping_ = raw::createHostMapping(size, physAddr); + + device_->queueAdd(qid_); + queueAdded_ = true; + device_->queueStart(qid_); + queueStarted_ = true; + + const std::string path = device_->charDevPath(qid_); + fd_ = open(path.c_str(), O_RDWR | O_CLOEXEC); + if (fd_ < 0) { + throwSystemError("Failed to open QDMA char device " + path); + } + } catch (...) { + cleanup(); + throw; + } +} + +QdmaDriverBuffer::~QdmaDriverBuffer() { + cleanup(); +} + +void QdmaDriverBuffer::moveFrom(QdmaDriverBuffer& other) noexcept { + device_ = other.device_; + qid_ = other.qid_; + queueAdded_ = other.queueAdded_; + queueStarted_ = other.queueStarted_; + fd_ = other.fd_; + physAddr_ = other.physAddr_; + mapping_ = other.mapping_; + + other.device_ = nullptr; + other.qid_ = 0; + other.queueAdded_ = false; + other.queueStarted_ = false; + other.fd_ = -1; + other.physAddr_ = 0; + other.mapping_ = raw::HostMapping{}; +} + +void QdmaDriverBuffer::cleanup() noexcept { + if (fd_ >= 0) { + (void)close(fd_); + fd_ = -1; + } + if (device_ != nullptr && queueStarted_) { + device_->queueStop(qid_); + queueStarted_ = false; + } + if (device_ != nullptr && queueAdded_) { + device_->queueDel(qid_); + queueAdded_ = false; + } + raw::destroyHostMapping(mapping_); +} + +} // namespace smi::qdma_driver diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp new file mode 100644 index 00000000..f4f5634c --- /dev/null +++ b/smi/src/qdma_driver_backend.hpp @@ -0,0 +1,152 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SMI_QDMA_DRIVER_BACKEND_HPP +#define SMI_QDMA_DRIVER_BACKEND_HPP + +/// @file qdma_driver_backend.hpp +/// @brief Raw-transfer backend for the off-the-shelf Xilinx QDMA driver. +/// +/// This backend mirrors the surface of validate.cpp's SLASH RawTransferBuffer +/// (data()/getSize()/syncToDevice()/syncFromDevice()) so the templated +/// integrity and bandwidth tests work unchanged, but it drives the upstream +/// QDMA driver (submodules/qdma_drv) instead of SLASH/libslash: +/// +/// - Queue lifecycle (add/start/stop/del) is performed over generic netlink +/// (family "xnl_pf"), exactly as the `dma-ctl` utility does. +/// - The function's `qmax` is provisioned via sysfs if it is too small. +/// - Data movement uses the per-queue char device /dev/qdma-MM- +/// with the device address carried as the file offset. 
+/// +/// Unlike SLASH there is no control device or custom ioctl ABI; the stock +/// driver must be bound to the function for any of this to work. + +#include +#include +#include + +#include "raw_transfer.hpp" + +namespace smi::qdma_driver { + +/// Opaque generic-netlink client used to talk to the QDMA driver. +class XnlClient; + +/// Represents a single PCIe function managed by the upstream QDMA driver. +/// +/// Resolves the driver's device index from the board BDF, ensures enough +/// queues are provisioned (qmax), and provides queue lifecycle operations. +class QdmaDriverDevice { +public: + /// @param boardBdf Board-level BDF "DDDD:BB:DD" (function is resolved by + /// enumerating the driver's device list). + explicit QdmaDriverDevice(const std::string& boardBdf); + ~QdmaDriverDevice(); + + QdmaDriverDevice(const QdmaDriverDevice&) = delete; + QdmaDriverDevice& operator=(const QdmaDriverDevice&) = delete; + + /// Ensure the function has at least @p needed queues provisioned, writing + /// the sysfs `qmax` entry (which re-initializes the queue set) if required. + void ensureQmax(unsigned needed); + + /// Add + start a bidirectional AXI-MM queue pair at relative index @p qid. + /// + /// queueStart pins the pair to MM engine channel `qid % mmChannelMax()`, + /// spreading queues across the device's MM channels (the channel only + /// takes effect on `q start`; the driver ignores it on `q add`). + void queueAdd(uint32_t qid); + void queueStart(uint32_t qid); + + /// Stop + delete a queue pair. Best-effort; never throws (safe in dtors). + void queueStop(uint32_t qid) noexcept; + void queueDel(uint32_t qid) noexcept; + + /// Char-device path for queue @p qid, e.g. "/dev/qdma61001-MM-0". + std::string charDevPath(uint32_t qid) const; + + /// Resolved 0000:BB:DD.F PCI address of the QDMA function. + const std::string& functionBdf() const { return functionBdf_; } + + /// Number of MM (memory-mapped) DMA engine channels the function exposes. 
+ /// CPM5 (V80) reports 2; older/soft IPs report 1. Always >= 1. + unsigned mmChannelMax() const { return mmChannelMax_; } + +private: + void refreshQmax(); + + std::unique_ptr nl_; + unsigned index_ = 0; ///< Driver device index (qdma). + std::string functionBdf_; ///< Full BDF including function. + unsigned qmax_ = 0; ///< Currently provisioned queue count. + unsigned mmChannelMax_ = 1; ///< Number of MM engine channels (>= 1). +}; + +/// One host buffer bound to a freshly-created upstream QDMA queue pair. +/// +/// Satisfies the buffer concept used by validate.cpp's testDataIntegrity() / +/// testBandwidth() templates. +class QdmaDriverBuffer { +public: + QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size); + + QdmaDriverBuffer(const QdmaDriverBuffer&) = delete; + QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete; + + QdmaDriverBuffer(QdmaDriverBuffer&& other) noexcept { moveFrom(other); } + QdmaDriverBuffer& operator=(QdmaDriverBuffer&& other) noexcept { + if (this != &other) { + cleanup(); + moveFrom(other); + } + return *this; + } + + ~QdmaDriverBuffer(); + + void* data() { return mapping_.data; } + uint64_t getSize() const { return mapping_.size; } + + void syncToDevice(uint64_t offset, uint64_t size) { + raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step); + raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/true); + } + + void syncFromDevice(uint64_t offset, uint64_t size) { + raw::validateSyncRange(offset, size, mapping_.size, physAddr_, mapping_.step); + raw::rawTransfer(fd_, mapping_.data, physAddr_, offset, size, mapping_.step, /*toDevice=*/false); + } + +private: + void moveFrom(QdmaDriverBuffer& other) noexcept; + void cleanup() noexcept; + + QdmaDriverDevice* device_ = nullptr; + uint32_t qid_ = 0; + bool queueAdded_ = false; + bool queueStarted_ = false; + int fd_ = -1; + uint64_t physAddr_ = 0; + raw::HostMapping mapping_{}; +}; + 
+} // namespace smi::qdma_driver + +#endif // SMI_QDMA_DRIVER_BACKEND_HPP diff --git a/smi/src/raw_transfer.hpp b/smi/src/raw_transfer.hpp new file mode 100644 index 00000000..c067db54 --- /dev/null +++ b/smi/src/raw_transfer.hpp @@ -0,0 +1,266 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SMI_RAW_TRANSFER_HPP +#define SMI_RAW_TRANSFER_HPP + +/// @file raw_transfer.hpp +/// @brief Backend-agnostic helpers for the raw QDMA memory-mapped transfer +/// tests used by `smi validate`. +/// +/// The SLASH backend (libslash queue-pair fds) and the off-the-shelf Xilinx +/// QDMA-driver backend (/dev/qdma-MM- char devices) share the exact +/// same host-side buffer setup and pread/pwrite transfer loop -- only the way +/// the file descriptor and device address get provisioned differs. 
Those +/// shared pieces live here so both backends behave (and time) identically. + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/// Per-transfer timing instrumentation. +/// +/// When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with +/// -DSLASH_QDMA_TIMING=1), the raw-transfer path logs the wall-clock cost of +/// each pwrite/pread syscall plus the aggregate per-transfer time and +/// effective bandwidth. This is the userspace counterpart to the kernel's +/// SLASH_QDMA_TIMING and libqdma's QDMA_TIMING breakdowns. +#ifndef SLASH_QDMA_TIMING +#define SLASH_QDMA_TIMING 0 +#endif + +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_2MB +#define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT) +#endif + +namespace smi::raw { + +/// Host transfer sizes mirror libvrtd's QDMA staging policy. +static constexpr uint64_t BASE_TRANSFER_STEP_SIZE = 4ULL * 1024ULL; +static constexpr uint64_t HUGE_TRANSFER_STEP_SIZE = 2ULL * 1024ULL * 1024ULL; + +[[noreturn]] inline void throwSystemError(const std::string& message) { + throw std::runtime_error(message + ": " + std::strerror(errno)); +} + +/// A host staging buffer plus the DMA granule it is backed by. +/// +/// `step` is HUGE_TRANSFER_STEP_SIZE when a 2 MiB hugetlb mapping succeeded, +/// otherwise BASE_TRANSFER_STEP_SIZE (4 KiB base pages). It is used only for +/// range/alignment validation: either way the whole range is transferred in a +/// single syscall and the kernel builds one DMA descriptor per page. +struct HostMapping { + void* data = nullptr; + uint64_t size = 0; + uint64_t step = 0; +}; + +/// Create a host staging buffer for raw transfers, preferring a 2 MiB hugetlb +/// mapping and falling back to a regular (THP-disabled) mapping with 4 KiB +/// transfers. @p physAddrForWarn is only used to make the fallback warning +/// actionable. 
+inline HostMapping createHostMapping(uint64_t size, uint64_t physAddrForWarn) { + HostMapping mapping; + mapping.size = size; + + mapping.data = mmap(nullptr, + size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, + -1, + 0); + mapping.step = HUGE_TRANSFER_STEP_SIZE; + if (mapping.data != MAP_FAILED) { + return mapping; + } + + const int hugeErrno = errno; + // MAP_POPULATE is deliberately omitted here. It would pre-fault the whole + // buffer during mmap(), i.e. before the MADV_NOHUGEPAGE below can take + // effect. On hosts with transparent hugepages set to "always", those early + // faults hand back 2 MiB THP compound pages, and MADV_NOHUGEPAGE does not + // split pages that are already faulted in. The driver's strict 4 KiB + // base-page path (slash_qdma_map_user_base_page_to_sgl) then rejects every + // transfer with -EINVAL ("4 KiB transfer is not backed by a base page"). + mapping.data = mmap(nullptr, + size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, + 0); + if (mapping.data == MAP_FAILED) { + throwSystemError("Failed to mmap raw transfer host buffer"); + } + + // Disable THP for this region *before* any page is faulted in, so that + // every fault below allocates a genuine 4 KiB base page. + if (madvise(mapping.data, size, MADV_NOHUGEPAGE) != 0) { + const int savedErrno = errno; + (void)munmap(mapping.data, size); + mapping.data = nullptr; + errno = savedErrno; + throwSystemError("Failed to disable transparent hugepages for raw transfer host buffer"); + } + + // Pre-fault the buffer as base pages, replacing the MAP_POPULATE dropped + // above. Touching one byte per page now that VM_NOHUGEPAGE is set forces + // the kernel to back each page with a 4 KiB base page up front (and keeps + // the page-fault cost out of the timed transfer loop). 
+ { + volatile uint8_t* touch = static_cast(mapping.data); + for (uint64_t off = 0; off < size; off += BASE_TRANSFER_STEP_SIZE) { + touch[off] = 0; + } + } + + mapping.step = BASE_TRANSFER_STEP_SIZE; + std::cerr << "Warning: 2 MiB hugetlb mmap failed for raw transfer buffer at 0x" + << std::hex << physAddrForWarn << std::dec + << " (errno=" << hugeErrno << "); using 4 KiB transfers" << std::endl; + return mapping; +} + +/// Release a host mapping created by createHostMapping(). +inline void destroyHostMapping(HostMapping& mapping) noexcept { + if (mapping.data != nullptr && mapping.data != MAP_FAILED) { + (void)munmap(mapping.data, mapping.size); + mapping.data = nullptr; + } +} + +/// Validate that a [offset, offset+size) request is aligned and in range for a +/// buffer of @p bufSize bytes backing device address @p physAddr, given the +/// mapping's @p step. +inline void validateSyncRange(uint64_t offset, uint64_t size, uint64_t bufSize, + uint64_t physAddr, uint64_t step) { + if (step == 0 || size == 0) { + throw std::invalid_argument("Invalid raw transfer size"); + } + if ((offset % step) != 0 || (size % step) != 0 || + (bufSize % step) != 0 || (physAddr % step) != 0) { + throw std::invalid_argument("Raw transfer range is not aligned to the host mapping step"); + } + if (offset > bufSize || size > bufSize - offset) { + throw std::out_of_range("Raw transfer range exceeds buffer size"); + } + // Both granules transfer the whole range in a single pread/pwrite, so the + // size must fit in ssize_t regardless of step. + if (size > static_cast(std::numeric_limits::max())) { + throw std::invalid_argument("Raw transfer size exceeds syscall limit"); + } +} + +/// Perform a raw memory-mapped QDMA transfer over @p fd using pread/pwrite, +/// with the device (endpoint) address encoded as the file offset. +/// +/// @param fd Per-queue char device / queue-pair fd. +/// @param data Host staging buffer base. +/// @param physAddr Device-side base address for this buffer. 
+/// @param offset Byte offset within the buffer (and added to physAddr). +/// @param size Number of bytes to transfer. +/// @param step Mapping step size (see HostMapping::step). +/// @param toDevice true for H2C (pwrite), false for C2H (pread). +inline void rawTransfer(int fd, void* data, uint64_t physAddr, uint64_t offset, + uint64_t size, [[maybe_unused]] uint64_t step, + bool toDevice) { + // Issue the whole range in a single syscall regardless of page granule. + // The kernel pins every page in the range and builds one descriptor per + // page, submitting a single multi-descriptor libqdma request (libqdma + // refills the descriptor ring as needed). This keeps syscall/submit + // overhead independent of the page size -- the 4 KiB path no longer costs + // one syscall (and one single-descriptor DMA) per page. + const uint64_t syscallSize = size; + const uint64_t endOffset = offset + size; +#if SLASH_QDMA_TIMING + const auto xferStart = std::chrono::steady_clock::now(); +#endif + + for (uint64_t currOffset = offset; currOffset < endOffset; currOffset += syscallSize) { + uint64_t transferred = 0; + while (transferred < syscallSize) { + const auto* src = static_cast(data) + currOffset + transferred; + auto* dst = static_cast(data) + currOffset + transferred; + const size_t remaining = static_cast(syscallSize - transferred); + const off_t devOffset = static_cast(physAddr + currOffset + transferred); + +#if SLASH_QDMA_TIMING + const auto callStart = std::chrono::steady_clock::now(); +#endif + ssize_t ret = toDevice + ? pwrite(fd, src, remaining, devOffset) + : pread(fd, dst, remaining, devOffset); + + if (ret < 0 && errno == EINTR) { + continue; + } + if (ret <= 0) { + throwSystemError(toDevice ? 
"Raw QDMA write failed" : "Raw QDMA read failed"); + } +#if SLASH_QDMA_TIMING + const auto callNs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - callStart) + .count(); + std::fprintf(stderr, + "validate: timing %s dev=0x%llx bytes=%zu ret=%zd syscall=%lld ns\n", + toDevice ? "H2C" : "C2H", + static_cast(devOffset), remaining, ret, + static_cast(callNs)); +#endif + transferred += static_cast(ret); + } + } + +#if SLASH_QDMA_TIMING + const auto totalNs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - xferStart) + .count(); + const double mb = static_cast(size) / (1024.0 * 1024.0); + const double sec = static_cast(totalNs) / 1e9; + std::fprintf(stderr, + "validate: timing %s xfer dev=0x%llx size=%llu step=%llu total=%lld ns (%.1f MB/s)\n", + toDevice ? "H2C" : "C2H", + static_cast(physAddr + offset), + static_cast(size), + static_cast(step), static_cast(totalNs), + sec > 0.0 ? mb / sec : 0.0); +#endif +} + +} // namespace smi::raw + +#endif // SMI_RAW_TRANSFER_HPP diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index 063fdb8e..a1d84f19 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -113,6 +113,20 @@ static int smiMain(int argc, char **argv) { "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u)); validateCommand->add_flag("-R,--no-reset", validateOptions.noReset, "Skip the device reset step before running memory tests"); + auto* rawTransferFlag = validateCommand->add_flag("--raw-transfer-test", validateOptions.rawTransferTest, + "Use libslash raw QDMA transfers instead of VRTD buffers (implies --no-reset)"); + auto* useQdmaDriverFlag = validateCommand->add_flag("--use-qdma-driver", validateOptions.useQdmaDriver, + "Run the raw transfer test over the off-the-shelf Xilinx QDMA driver " + "(/dev/qdma-MM-) instead of SLASH; requires the stock qdma driver " + "bound to the board. 
Implies --no-reset; mutually exclusive with --raw-transfer-test"); + rawTransferFlag->excludes(useQdmaDriverFlag); + useQdmaDriverFlag->excludes(rawTransferFlag); + auto* ddrOnlyFlag = validateCommand->add_flag("--ddr-only", validateOptions.ddrOnly, + "Run only DDR memory tests (skip HBM)"); + auto* hbmOnlyFlag = validateCommand->add_flag("--hbm-only", validateOptions.hbmOnly, + "Run only HBM memory tests (skip DDR)"); + ddrOnlyFlag->excludes(hbmOnlyFlag); + hbmOnlyFlag->excludes(ddrOnlyFlag); // -- debug (low-level debug utilities) -- auto* debugCommand = app.add_subcommand("debug", "Low-level debug utilities"); diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 605e9abc..dbc9bd88 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -33,22 +33,346 @@ /// TODO: Decide whether vrt::Device should gain a vrtbin-less constructor so /// that commands like validate can go through the standard vrt:: layer. +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include "validate.hpp" +#include +#include #include +#include #include +#include #include +#include +#include #include #include +#include +#include +#include +#include #include #include +#include +#include +#include + #include #include "bdf.hpp" +#include "raw_transfer.hpp" + +#ifdef SMI_ENABLE_QDMA_DRIVER_BACKEND +#include "qdma_driver_backend.hpp" +#endif + +extern "C" { +#include +} + +namespace { + +using smi::raw::throwSystemError; + +/// Buffer size for each allocation (512 MB — one allocator region). +static constexpr uint64_t BUFFER_SIZE = 512ULL * 1024 * 1024; + +/// Region constants mirror vrt/vrtd/src/allocator.h, which is private. 
+static constexpr uint64_t HBM_BASE = 0x4000000000ULL; +static constexpr uint64_t DDR_BASE = 0x60000000000ULL; +static constexpr uint64_t MEM_REGION_SIZE = 512ULL * 1024 * 1024; + +static constexpr uint32_t QDMA_Q_MODE_MM = 0; +static constexpr uint32_t QDMA_DIR_H2C = 0x1; +static constexpr uint32_t QDMA_DIR_C2H = 0x2; +static constexpr uint32_t QDMA_RING_SZ_IDX = 0; + +static bool checkHostMemoryBudget(const Validate::Options& options) { + const uint64_t maxConcurrentBuffers = (!options.ddrOnly && !options.hbmOnly) + ? 4ULL * options.threads + : 2ULL * options.threads; + const uint64_t requiredBytes = maxConcurrentBuffers * BUFFER_SIZE; + const long pageSize = sysconf(_SC_PAGESIZE); + const long availablePages = sysconf(_SC_AVPHYS_PAGES); + + if (pageSize <= 0 || availablePages <= 0) { + std::cerr << "Warning: unable to estimate available host memory for validate; " + << "peak mapped buffer footprint is " + << (requiredBytes / (1024ULL * 1024ULL)) << " MiB." << std::endl; + return true; + } + + const auto availableBytes = static_cast(pageSize) * + static_cast(availablePages); + if (requiredBytes > availableBytes) { + std::cerr << "validate: requested test can map up to " + << (requiredBytes / (1024ULL * 1024ULL)) << " MiB of host buffers, " + << "but only about " << (availableBytes / (1024ULL * 1024ULL)) + << " MiB is currently available. Reduce --threads or use --ddr-only/--hbm-only." + << std::endl; + return false; + } + + return true; +} + +static void warnIfNotRoot(const char* mode) { + if (geteuid() != 0) { + std::cerr << "Warning: " << mode + << " usually needs root or udev-granted access to QDMA device nodes/sysfs." 
+ << std::endl; + } +} + +std::string readDevNameFromUevent(const std::filesystem::path& miscPath) { + std::ifstream uevent(miscPath / "uevent"); + if (!uevent.is_open()) { + throw std::runtime_error("Failed to open " + (miscPath / "uevent").string()); + } + + std::string line; + while (std::getline(uevent, line)) { + static constexpr std::string_view key{"DEVNAME="}; + if (!line.starts_with(key)) { + continue; + } + + std::string devName = line.substr(key.size()); + while (!devName.empty() && (devName.back() == '\n' || devName.back() == '\r')) { + devName.pop_back(); + } + return "/dev/" + devName; + } + + throw std::runtime_error("No DEVNAME entry found in " + (miscPath / "uevent").string()); +} -/// Buffer size for each allocation (64 MB — one allocator subregion). -static constexpr uint64_t BUFFER_SIZE = 64ULL * 1024 * 1024; +std::string resolveQdmaDevicePath(const std::string& boardBdf) { + static const std::filesystem::path MISC_PATH{"/sys/class/misc"}; + + const std::string exactName = "slash_qdma_ctl_" + boardBdf + ".1"; + const auto exactPath = MISC_PATH / exactName; + if (std::filesystem::exists(exactPath)) { + return readDevNameFromUevent(exactPath); + } + + const std::string prefix = "slash_qdma_ctl_" + boardBdf + "."; + std::vector matches; + for (const auto& entry : std::filesystem::directory_iterator(MISC_PATH)) { + const std::string name = entry.path().filename().string(); + if (name.starts_with(prefix)) { + matches.push_back(entry.path()); + } + } + + if (matches.empty()) { + throw std::runtime_error( + "No QDMA misc device found for board " + boardBdf + + " (looked for /sys/class/misc/" + prefix + "*)"); + } + + std::sort(matches.begin(), matches.end()); + if (matches.size() > 1) { + std::cerr << "Warning: multiple QDMA devices found for " << boardBdf + << "; using " << matches.front().filename().string() << std::endl; + } + + return readDevNameFromUevent(matches.front()); +} + +class RawQdmaDevice { +public: + explicit RawQdmaDevice(const 
std::string& path) : qdma_{slash_qdma_open(path.c_str())} { + if (qdma_ == nullptr) { + throwSystemError("Failed to open QDMA device " + path); + } + } + + RawQdmaDevice(const RawQdmaDevice&) = delete; + RawQdmaDevice& operator=(const RawQdmaDevice&) = delete; + + RawQdmaDevice(RawQdmaDevice&& other) noexcept : qdma_{other.qdma_} { + other.qdma_ = nullptr; + } + + RawQdmaDevice& operator=(RawQdmaDevice&& other) noexcept { + if (this != &other) { + cleanup(); + qdma_ = other.qdma_; + other.qdma_ = nullptr; + } + return *this; + } + + ~RawQdmaDevice() { + cleanup(); + } + + slash_qdma* get() const { + return qdma_; + } + +private: + void cleanup() { + if (qdma_ != nullptr) { + (void)slash_qdma_close(qdma_); + qdma_ = nullptr; + } + } + + slash_qdma* qdma_ = nullptr; +}; + +class RawTransferBuffer { +public: + RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size) + : qdma_{qdma}, physAddr_{physAddr}, size_{size} { + try { + createHostMapping(); + createQpair(); + } catch (...) 
{ + cleanup(); + throw; + } + } + + RawTransferBuffer(const RawTransferBuffer&) = delete; + RawTransferBuffer& operator=(const RawTransferBuffer&) = delete; + + RawTransferBuffer(RawTransferBuffer&& other) noexcept { + moveFrom(other); + } + + RawTransferBuffer& operator=(RawTransferBuffer&& other) noexcept { + if (this != &other) { + cleanup(); + moveFrom(other); + } + return *this; + } + + ~RawTransferBuffer() { + cleanup(); + } + + void* data() { + return data_; + } + + uint64_t getSize() const { + return size_; + } + + void syncToDevice(uint64_t offset, uint64_t size) { + validateSyncRange(offset, size); + transfer(offset, size, /*toDevice=*/true); + } + + void syncFromDevice(uint64_t offset, uint64_t size) { + validateSyncRange(offset, size); + transfer(offset, size, /*toDevice=*/false); + } + +private: + void moveFrom(RawTransferBuffer& other) noexcept { + qdma_ = other.qdma_; + fd_ = other.fd_; + qid_ = other.qid_; + qpairCreated_ = other.qpairCreated_; + qpairStarted_ = other.qpairStarted_; + data_ = other.data_; + physAddr_ = other.physAddr_; + size_ = other.size_; + transferStepSize_ = other.transferStepSize_; + + other.qdma_ = nullptr; + other.fd_ = -1; + other.qid_ = 0; + other.qpairCreated_ = false; + other.qpairStarted_ = false; + other.data_ = nullptr; + other.physAddr_ = 0; + other.size_ = 0; + other.transferStepSize_ = 0; + } + + void createHostMapping() { + smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_); + data_ = mapping.data; + transferStepSize_ = mapping.step; + } + + void createQpair() { + if (qdma_ == nullptr || size_ == 0) { + throw std::invalid_argument("Invalid raw transfer buffer arguments"); + } + + struct slash_qdma_qpair_add req{}; + req.size = sizeof(req); + req.mode = QDMA_Q_MODE_MM; + req.dir_mask = QDMA_DIR_H2C | QDMA_DIR_C2H; + req.h2c_ring_sz = QDMA_RING_SZ_IDX; + req.c2h_ring_sz = QDMA_RING_SZ_IDX; + req.cmpt_ring_sz = QDMA_RING_SZ_IDX; + + if (slash_qdma_qpair_add(qdma_, &req) != 0) { + 
throwSystemError("Failed to add raw transfer QDMA queue pair"); + } + qid_ = req.qid; + qpairCreated_ = true; + + if (slash_qdma_qpair_start(qdma_, qid_) != 0) { + throwSystemError("Failed to start raw transfer QDMA queue pair"); + } + qpairStarted_ = true; + + fd_ = slash_qdma_qpair_get_fd(qdma_, qid_, O_CLOEXEC); + if (fd_ < 0) { + throwSystemError("Failed to get raw transfer QDMA queue fd"); + } + } + + void validateSyncRange(uint64_t offset, uint64_t size) const { + smi::raw::validateSyncRange(offset, size, size_, physAddr_, transferStepSize_); + } + + void transfer(uint64_t offset, uint64_t size, bool toDevice) { + smi::raw::rawTransfer(fd_, data_, physAddr_, offset, size, transferStepSize_, toDevice); + } + + void cleanup() { + if (fd_ >= 0) { + (void)close(fd_); + fd_ = -1; + } + if (qdma_ != nullptr && qpairStarted_) { + (void)slash_qdma_qpair_stop(qdma_, qid_); + qpairStarted_ = false; + } + if (qdma_ != nullptr && qpairCreated_) { + (void)slash_qdma_qpair_del(qdma_, qid_); + qpairCreated_ = false; + } + if (data_ != nullptr && data_ != MAP_FAILED) { + (void)munmap(data_, size_); + data_ = nullptr; + } + } + + slash_qdma* qdma_ = nullptr; + int fd_ = -1; + uint32_t qid_ = 0; + bool qpairCreated_ = false; + bool qpairStarted_ = false; + void* data_ = nullptr; + uint64_t physAddr_ = 0; + uint64_t size_ = 0; + uint64_t transferStepSize_ = 0; +}; /// Fill @p buf with a deterministic pattern seeded by @p seed. static void fillPattern(void* buf, uint64_t size, uint32_t seed) { @@ -74,12 +398,19 @@ static bool verifyPattern(const void* buf, uint64_t size, uint32_t seed) { /// Run data integrity on every buffer: write pattern → sync to device → /// clear host → sync from device → verify. +/// +/// Output policy: per-buffer FAIL lines are printed as failures occur; OK +/// buffers are silent. A single summary line ("N/N OK" or "M/N OK, K +/// FAIL") is printed at the end. +/// /// @return true if all buffers pass. 
-static bool testDataIntegrity(std::vector& buffers, +template +static bool testDataIntegrity(std::vector& buffers, const std::string& label) { - bool allPassed = true; + const size_t total = buffers.size(); + size_t passed = 0; - for (size_t i = 0; i < buffers.size(); ++i) { + for (size_t i = 0; i < total; ++i) { auto& buf = buffers[i]; uint32_t seed = static_cast(i); uint64_t size = buf.getSize(); @@ -90,73 +421,464 @@ static bool testDataIntegrity(std::vector& buffers, std::memset(buf.data(), 0, size); buf.syncFromDevice(0, size); - bool ok = verifyPattern(buf.data(), size, seed); - std::cout << " " << label << i << ": " - << (ok ? "OK" : "FAIL") << std::endl; - - if (!ok) { - allPassed = false; + if (verifyPattern(buf.data(), size, seed)) { + ++passed; + } else { + std::cout << " " << label << i << ": FAIL" << std::endl; } } - return allPassed; + if (passed == total) { + std::cout << " " << total << "/" << total << " OK" << std::endl; + } else { + std::cout << " " << passed << "/" << total << " OK, " + << (total - passed) << " FAIL" << std::endl; + } + + return passed == total; +} + +static double mbPerSecond(uint64_t bytes, std::chrono::duration elapsed) { + const double totalMB = static_cast(bytes) / (1024.0 * 1024.0); + return totalMB / elapsed.count(); +} + +static void printBandwidthMetric(const char* label, double mbps) { + std::cout << " " << label << ": " << std::fixed << std::setprecision(2) + << mbps << " MB/s" << std::endl; } -/// Measure aggregate write and read bandwidth across all buffers in parallel -/// (one std::thread per buffer). 
-static void testBandwidth(std::vector& buffers) { +template +static uint64_t fillBuffers(std::vector& buffers, int value) { uint64_t totalBytes = 0; for (auto& buf : buffers) { - std::memset(buf.data(), 0xAB, buf.getSize()); + std::memset(buf.data(), value, buf.getSize()); totalBytes += buf.getSize(); } + return totalBytes; +} + +template +static void launchTransferThreads(std::vector& buffers, + bool toDevice, + std::vector& threads, + std::vector& errors, + size_t errorOffset) { + for (size_t i = 0; i < buffers.size(); ++i) { + threads.emplace_back([&buffers, &errors, i, errorOffset, toDevice] { + try { + if (toDevice) { + buffers[i].syncToDevice(0, buffers[i].getSize()); + } else { + buffers[i].syncFromDevice(0, buffers[i].getSize()); + } + } catch (...) { + errors[errorOffset + i] = std::current_exception(); + } + }); + } +} + +template +static void runTransfers(std::vector& buffers, bool toDevice) { + std::vector threads; + std::vector errors(buffers.size()); + threads.reserve(buffers.size()); - // -- Write (H2C) bandwidth -- - auto writeStart = std::chrono::steady_clock::now(); - { - std::vector threads; - threads.reserve(buffers.size()); + launchTransferThreads(buffers, toDevice, threads, errors, 0); + + for (auto& t : threads) { + t.join(); + } + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); + } + } +} + +template +static double testSingleDirectionBandwidth(std::vector& buffers, bool toDevice) { + const uint64_t totalBytes = fillBuffers(buffers, toDevice ? 
0xAB : 0xCD); + + if (!toDevice) { + runTransfers(buffers, /*toDevice=*/true); for (auto& buf : buffers) { - threads.emplace_back([&buf] { - buf.syncToDevice(0, buf.getSize()); - }); + std::memset(buf.data(), 0, buf.getSize()); } - for (auto& t : threads) { - t.join(); + } + + const auto start = std::chrono::steady_clock::now(); + runTransfers(buffers, toDevice); + const auto end = std::chrono::steady_clock::now(); + + return mbPerSecond(totalBytes, end - start); +} + +template +static void testBidirectionalBandwidth(std::vector& writeBuffers, + std::vector& readBuffers) { + const uint64_t writeBytes = fillBuffers(writeBuffers, 0xAB); + const uint64_t readBytes = fillBuffers(readBuffers, 0xCD); + + // Prime device memory before timing so the C2H side reads initialized data. + runTransfers(readBuffers, /*toDevice=*/true); + for (auto& buf : readBuffers) { + std::memset(buf.data(), 0, buf.getSize()); + } + + std::vector threads; + std::vector errors(writeBuffers.size() + readBuffers.size()); + threads.reserve(errors.size()); + + const auto start = std::chrono::steady_clock::now(); + launchTransferThreads(writeBuffers, /*toDevice=*/true, threads, errors, 0); + launchTransferThreads(readBuffers, /*toDevice=*/false, threads, errors, writeBuffers.size()); + + for (auto& t : threads) { + t.join(); + } + const auto end = std::chrono::steady_clock::now(); + + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); } } - auto writeEnd = std::chrono::steady_clock::now(); - // -- Read (C2H) bandwidth -- - auto readStart = std::chrono::steady_clock::now(); - { - std::vector threads; - threads.reserve(buffers.size()); - for (auto& buf : buffers) { - threads.emplace_back([&buf] { - buf.syncFromDevice(0, buf.getSize()); - }); + const auto elapsed = end - start; + const double writeMBps = mbPerSecond(writeBytes, elapsed); + const double readMBps = mbPerSecond(readBytes, elapsed); + + printBandwidthMetric("Read", readMBps); + printBandwidthMetric("Write", 
writeMBps); + printBandwidthMetric("Total", readMBps + writeMBps); +} + +template +static void testBandwidthSuite(std::vector& singleDirectionBuffers, + const std::string& label, + const std::string& backendSuffix) { + std::cout << "Testing " << label << " read bandwidth (" + << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." << std::endl; + printBandwidthMetric("Read", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/false)); + + std::cout << "Testing " << label << " write bandwidth (" + << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." << std::endl; + printBandwidthMetric("Write", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/true)); +} + +template +static void testBidirectionalBandwidthSuite(std::vector& bidirectionalWriteBuffers, + std::vector& bidirectionalReadBuffers, + const std::string& label, + const std::string& backendSuffix) { + std::cout << "Testing " << label << " bidirectional bandwidth (" + << (bidirectionalWriteBuffers.size() + bidirectionalReadBuffers.size()) + << " threads" << backendSuffix << ")..." << std::endl; + testBidirectionalBandwidth(bidirectionalWriteBuffers, bidirectionalReadBuffers); +} + +static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { + const unsigned N = options.threads; + + if (!options.noReset) { + std::cout << "Raw transfer mode skips reset; continuing without VRTD reset." << std::endl; + } + warnIfNotRoot("SLASH raw transfer mode"); + + const std::string qdmaPath = resolveQdmaDevicePath(bdf); + std::cout << "Using raw QDMA device " << qdmaPath << "..." << std::endl; + + RawQdmaDevice qdma(qdmaPath); + + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions, raw QDMA)..." 
<< std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.emplace_back(qdma.get(), HBM_BASE + i * MEM_REGION_SIZE, + BUFFER_SIZE); + } + + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(hbmBuffers, "HBM", ", raw QDMA"); + } + { + // Bidirectional HBM: positions interleave R/W across regions + // 0..2N-1. Reads land on even regions, writes on odd regions. + std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.emplace_back(qdma.get(), + HBM_BASE + (2 * i) * MEM_REGION_SIZE, + BUFFER_SIZE); + hbmWriteBuffers.emplace_back(qdma.get(), + HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, + BUFFER_SIZE); + } + + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA"); + } + } + + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers, raw QDMA)..." << std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.emplace_back(qdma.get(), DDR_BASE + i * BUFFER_SIZE, + BUFFER_SIZE); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ", raw QDMA"); + } + { + // Bidirectional DDR: positions interleave R/W across slot indices + // 0..2N-1 of the DDR address space. 
+ std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrReadBuffers.emplace_back(qdma.get(), + DDR_BASE + (2 * i) * BUFFER_SIZE, + BUFFER_SIZE); + ddrWriteBuffers.emplace_back(qdma.get(), + DDR_BASE + (2 * i + 1) * BUFFER_SIZE, + BUFFER_SIZE); + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA"); + } + } + + if (!options.ddrOnly && !options.hbmOnly) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma.get(), HBM_BASE + i * MEM_REGION_SIZE, + BUFFER_SIZE); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma.get(), DDR_BASE + i * BUFFER_SIZE, + BUFFER_SIZE); + } + + testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA"); + } + { + // Bidirectional HBM+DDR: 4N positions total. Positions 0..2N-1 + // are HBM (interleaved R/W across regions 0..2N-1); positions + // 2N..4N-1 are DDR (interleaved R/W across DDR slots 0..2N-1). + // Channel = (p / 2) & 1 throughout. + std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma.get(), + HBM_BASE + (2 * i) * MEM_REGION_SIZE, + BUFFER_SIZE); + parWriteBuffers.emplace_back(qdma.get(), + HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, + BUFFER_SIZE); + } + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma.get(), + DDR_BASE + (2 * i) * BUFFER_SIZE, + BUFFER_SIZE); + parWriteBuffers.emplace_back(qdma.get(), + DDR_BASE + (2 * i + 1) * BUFFER_SIZE, + BUFFER_SIZE); + } + + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA"); + } + } + + return 0; +} + +/// Raw integrity + bandwidth test driven over the off-the-shelf Xilinx QDMA +/// driver instead of SLASH. 
smi provisions queues itself (qmax + netlink +/// add/start) and transfers over the per-queue char devices. +static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& options) { +#ifndef SMI_ENABLE_QDMA_DRIVER_BACKEND + (void)bdf; + (void)options; + std::cerr << "validate: this v80-smi build was configured without " + << "--use-qdma-driver support. Rebuild with " + << "-DSMI_ENABLE_QDMA_DRIVER_BACKEND=ON." << std::endl; + return 1; +#else + const unsigned N = options.threads; + + if (!options.noReset) { + std::cout << "QDMA-driver raw mode skips reset; continuing without VRTD reset." << std::endl; + } + warnIfNotRoot("QDMA-driver raw mode"); + + const bool runParallel = !options.ddrOnly && !options.hbmOnly; + + std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." << std::endl; + smi::qdma_driver::QdmaDriverDevice qdma(bdf); + std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl; + qdma.ensureQmax(runParallel ? 4 * N : 2 * N); + + const unsigned mmChannels = qdma.mmChannelMax(); + if (mmChannels > 1) { + std::cout << "Distributing queues across " << mmChannels + << " MM channels (channel = qid % " << mmChannels << ")." << std::endl; + } else { + std::cout << "Device exposes a single MM channel; all queues on channel 0." << std::endl; + } + + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions, QDMA driver)..." 
<< std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.emplace_back(qdma, i, HBM_BASE + i * MEM_REGION_SIZE, BUFFER_SIZE); + } + + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(hbmBuffers, "HBM", ", QDMA driver"); } - for (auto& t : threads) { - t.join(); + { + std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.emplace_back(qdma, i, + HBM_BASE + (2 * i) * MEM_REGION_SIZE, BUFFER_SIZE); + hbmWriteBuffers.emplace_back(qdma, N + i, + HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, BUFFER_SIZE); + } + + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver"); } } - auto readEnd = std::chrono::steady_clock::now(); - double writeSec = std::chrono::duration(writeEnd - writeStart).count(); - double readSec = std::chrono::duration(readEnd - readStart).count(); - double totalMB = static_cast(totalBytes) / (1024.0 * 1024.0); + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers, QDMA driver)..." 
<< std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.emplace_back(qdma, i, DDR_BASE + i * BUFFER_SIZE, BUFFER_SIZE); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ", QDMA driver"); + } + { + std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrReadBuffers.emplace_back(qdma, i, + DDR_BASE + (2 * i) * BUFFER_SIZE, BUFFER_SIZE); + ddrWriteBuffers.emplace_back(qdma, N + i, + DDR_BASE + (2 * i + 1) * BUFFER_SIZE, BUFFER_SIZE); + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver"); + } + } + + if (runParallel) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma, i, HBM_BASE + i * MEM_REGION_SIZE, BUFFER_SIZE); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.emplace_back(qdma, N + i, DDR_BASE + i * BUFFER_SIZE, BUFFER_SIZE); + } + + testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver"); + } + { + std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma, i, + HBM_BASE + (2 * i) * MEM_REGION_SIZE, BUFFER_SIZE); + parWriteBuffers.emplace_back(qdma, 2 * N + i, + HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, BUFFER_SIZE); + } + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.emplace_back(qdma, N + i, + DDR_BASE + (2 * i) * BUFFER_SIZE, BUFFER_SIZE); + parWriteBuffers.emplace_back(qdma, 3 * N + i, + DDR_BASE + (2 * i + 1) * BUFFER_SIZE, BUFFER_SIZE); + } - std::cout << " Write: " << std::fixed << std::setprecision(2) - << (totalMB / writeSec) << " MB/s" << std::endl; - std::cout << " Read: " << std::fixed << 
std::setprecision(2) - << (totalMB / readSec) << " MB/s" << std::endl; + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver"); + } + } + + return 0; +#endif } +} // namespace + int Validate::run(const Options& options) { std::string bdf = resolveBoardBdf(options.bdf, "validate"); unsigned N = options.threads; + if (!checkHostMemoryBudget(options)) { + return 1; + } + + // The HBM bidirectional phase uses 2*N HBM regions (reads on even regions 0..2N-2, writes on odd regions 1..2N-1). + // HBM has only 64 regions, so N>32 is unsupportable unless HBM is excluded. + static constexpr unsigned HBM_REGIONS = 64; + if (!options.ddrOnly && 2 * N > HBM_REGIONS) { + std::cerr << "validate: --threads > " << (HBM_REGIONS / 2) + << " requires --ddr-only (bidirectional HBM uses 2*N HBM regions, only " + << HBM_REGIONS << " exist)" << std::endl; + return 1; + } + + if (options.rawTransferTest) { + return runRawTransferTest(bdf, options); + } + + if (options.useQdmaDriver) { + return runQdmaDriverTest(bdf, options); + } + // -- Step 1: (Optional) Reset the device via vrtd -- if (!options.noReset) { std::cout << "Resetting device " << bdf << "..." << std::endl; @@ -172,42 +894,104 @@ int Validate::run(const Options& options) { auto device = session.getDeviceByBdf(bdf); // -- Step 2: HBM — integrity then bandwidth -- - std::cout << "Testing HBM data integrity (" << N << " regions)..." << std::endl; - { - std::vector hbmBuffers; - hbmBuffers.reserve(N); - for (unsigned i = 0; i < N; ++i) { - hbmBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); - } + if (!options.ddrOnly) { + std::cout << "Testing HBM data integrity (" << N << " regions)..." 
<< std::endl; + { + std::vector hbmBuffers; + hbmBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); + } - if (!testDataIntegrity(hbmBuffers, "HBM")) { - std::cerr << "HBM data integrity check failed" << std::endl; - return 1; + if (!testDataIntegrity(hbmBuffers, "HBM")) { + std::cerr << "HBM data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(hbmBuffers, "HBM", ""); } + // HBM buffers released. + { + std::vector hbmWriteBuffers; + std::vector hbmReadBuffers; + hbmWriteBuffers.reserve(N); + hbmReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + hbmReadBuffers.push_back(device.openHbmBuffer(2 * i, BUFFER_SIZE)); + hbmWriteBuffers.push_back(device.openHbmBuffer(2 * i + 1, BUFFER_SIZE)); + } - std::cout << "Testing HBM bandwidth (" << N << " threads)..." << std::endl; - testBandwidth(hbmBuffers); + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ""); + } + // Bidirectional HBM buffers released. } - // HBM buffers released. // -- Step 3: DDR — integrity then bandwidth -- - std::cout << "Testing DDR data integrity (" << N << " buffers)..." << std::endl; - { - std::vector ddrBuffers; - ddrBuffers.reserve(N); - for (unsigned i = 0; i < N; ++i) { - ddrBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + if (!options.hbmOnly) { + std::cout << "Testing DDR data integrity (" << N << " buffers)..." << std::endl; + { + std::vector ddrBuffers; + ddrBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + } + + if (!testDataIntegrity(ddrBuffers, "DDR")) { + std::cerr << "DDR data integrity check failed" << std::endl; + return 1; + } + + testBandwidthSuite(ddrBuffers, "DDR", ""); + } + // DDR buffers released. 
+ { + std::vector ddrWriteBuffers; + std::vector ddrReadBuffers; + ddrWriteBuffers.reserve(N); + ddrReadBuffers.reserve(N); + for (unsigned i = 0; i < N; ++i) { + ddrWriteBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + ddrReadBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + } + + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ""); } + // Bidirectional DDR buffers released. + } - if (!testDataIntegrity(ddrBuffers, "DDR")) { - std::cerr << "DDR data integrity check failed" << std::endl; - return 1; + // -- Step 4: HBM + DDR in parallel -- + if (!options.ddrOnly && !options.hbmOnly) { + { + std::vector parBuffers; + parBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); + } + for (unsigned i = 0; i < N; ++i) { + parBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + } + + testBandwidthSuite(parBuffers, "HBM+DDR", ""); } + // Parallel single-direction buffers released. + { + std::vector parWriteBuffers; + std::vector parReadBuffers; + parWriteBuffers.reserve(2 * N); + parReadBuffers.reserve(2 * N); + for (unsigned i = 0; i < N; ++i) { + parReadBuffers.push_back(device.openHbmBuffer(2 * i, BUFFER_SIZE)); + parWriteBuffers.push_back(device.openHbmBuffer(2 * i + 1, BUFFER_SIZE)); + } + for (unsigned i = 0; i < N; ++i) { + parWriteBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + parReadBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + } - std::cout << "Testing DDR bandwidth (" << N << " threads)..." << std::endl; - testBandwidth(ddrBuffers); + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ""); + } + // Parallel bidirectional buffers released. } - // DDR buffers released. return 0; } diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index 2e5d1f8e..810d53fa 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -24,9 +24,10 @@ /// @file validate.hpp /// @brief Declaration of the Validate command. 
/// -/// The Validate command resets a V80 board and then exercises DDR and HBM -/// memory via PCIe by running data integrity checks followed by parallel -/// bandwidth measurements. +/// The Validate command optionally resets a V80 board and then exercises DDR +/// and HBM memory via PCIe by running data integrity checks followed by +/// parallel bandwidth measurements. Raw transfer modes skip reset and bypass +/// the default VRTD buffer path. #include @@ -42,6 +43,10 @@ class Validate { std::string bdf; ///< BDF (Bus:Device.Function) address of the target device. unsigned threads = 8; ///< Number of parallel buffers/threads (1-64). bool noReset = false; ///< Skip the device reset step before running memory tests. + bool ddrOnly = false; ///< Skip HBM phase (mutually exclusive with hbmOnly). + bool hbmOnly = false; ///< Skip DDR phase (mutually exclusive with ddrOnly). + bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. + bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. }; /// @brief Executes the validate command. 
From 2816573590e45d7b7a4f08923d75eba8c75e8711 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 8 Jun 2026 10:12:44 +0100 Subject: [PATCH 05/23] packaging: ship libqdma patches, add deps, harden install test, ignore /tmp Signed-off-by: Vlad-Gabriel Serbu --- .gitignore | 3 +++ packaging/debian/control | 4 ++-- packaging/debian/slash-dkms.install | 1 + packaging/rpm/slash.spec | 5 ++++- scripts/test-fresh-install.sh | 4 ++-- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 5c17336a..e90cb2ef 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ driver/kcompat/.scratch/ # Python test coverage .coverage + +# Project-local scratch space +/tmp/ diff --git a/packaging/debian/control b/packaging/debian/control index ce02f2dd..09534d96 100644 --- a/packaging/debian/control +++ b/packaging/debian/control @@ -48,7 +48,7 @@ Description: SLASH/VRT System for simulation and emulation Package: slash-dkms Architecture: all -Depends: dkms, gcc, make, ${misc:Depends} +Depends: dkms, gcc, make, patch, ${misc:Depends} Provides: slash-kernel-module Description: SLASH kernel module (DKMS) @@ -89,7 +89,7 @@ Description: VRT Runtime (development files) Package: v80-smi Architecture: any -Depends: libvrt (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends} +Depends: libvrt (= ${binary:Version}), libslash (= ${binary:Version}), ${shlibs:Depends}, ${misc:Depends} Description: V80 System Management Interface Package: slashkit diff --git a/packaging/debian/slash-dkms.install b/packaging/debian/slash-dkms.install index d0496a33..c377f69f 100644 --- a/packaging/debian/slash-dkms.install +++ b/packaging/debian/slash-dkms.install @@ -22,6 +22,7 @@ driver/*.c usr/src/slash-@VERSION@/driver/ driver/*.h usr/src/slash-@VERSION@/driver/ driver/Makefile usr/src/slash-@VERSION@/driver/ driver/kcompat usr/src/slash-@VERSION@/driver/ +driver/patches usr/src/slash-@VERSION@/driver/ driver/libslash/include/slash/uapi 
usr/src/slash-@VERSION@/driver/libslash/include/slash/ submodules/qdma_drv/QDMA/linux-kernel/driver/libqdma/ usr/src/slash-@VERSION@/driver/ diff --git a/packaging/rpm/slash.spec b/packaging/rpm/slash.spec index a18ccd59..3568a859 100644 --- a/packaging/rpm/slash.spec +++ b/packaging/rpm/slash.spec @@ -90,7 +90,7 @@ SLASH/VRT System for simulation and emulation (development files) %package -n slash-dkms Summary: SLASH kernel module (DKMS) -Requires: dkms, gcc, make +Requires: dkms, gcc, make, patch BuildArch: noarch %description -n slash-dkms @@ -157,6 +157,7 @@ VRT Runtime (development files) %package -n v80-smi Summary: V80 System Management Interface Requires: libvrt = %{version}-%{release} +Requires: libslash = %{version}-%{release} %description -n v80-smi V80 System Management Interface @@ -211,6 +212,8 @@ install -m 0644 driver/Makefile %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_versi cp -a driver/kcompat %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/ +cp -a driver/patches %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/ + cp -a driver/libslash/include/slash/uapi \ %{buildroot}%{_usrsrc}/%{dkms_name}-%{dkms_version}/driver/libslash/include/slash/ diff --git a/scripts/test-fresh-install.sh b/scripts/test-fresh-install.sh index 247d6b2b..cd20e363 100755 --- a/scripts/test-fresh-install.sh +++ b/scripts/test-fresh-install.sh @@ -197,7 +197,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then if [[ ${#INSTALLED[@]} -gt 0 ]]; then echo "Removing: ${INSTALLED[*]}" - dnf remove -y "${INSTALLED[@]}" + dnf remove -y --setopt='*.skip_if_unavailable=True' "${INSTALLED[@]}" else echo "No SLASH packages currently installed." fi @@ -226,7 +226,7 @@ elif [[ "${PKG_TYPE}" == "rpm" ]]; then # Exclude source, debuginfo, and debugsource RPMs mapfile -t RPMS < <(find "${ARTIFACTS_DIR}" -maxdepth 1 -name '*.rpm' \ ! -name '*.src.rpm' ! -name '*-debuginfo-*' ! 
-name '*-debugsource-*') - dnf install -y "${RPMS[@]}" + dnf install -y --setopt='*.skip_if_unavailable=True' "${RPMS[@]}" fi # ========================================================================= From 96f7b058573e2de71beedf8c2080e6e81e74cb1d Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 10 Jun 2026 11:16:00 +0100 Subject: [PATCH 06/23] driver: guard libqdma pr_fmt under force-included compat header Signed-off-by: Vlad-Gabriel Serbu --- .../patches/0003-libqdma-pr-fmt-guard.patch | 43 +++++++++++++++++++ driver/slash_compat.h | 16 +++++++ 2 files changed, 59 insertions(+) create mode 100644 driver/patches/0003-libqdma-pr-fmt-guard.patch diff --git a/driver/patches/0003-libqdma-pr-fmt-guard.patch b/driver/patches/0003-libqdma-pr-fmt-guard.patch new file mode 100644 index 00000000..d253070a --- /dev/null +++ b/driver/patches/0003-libqdma-pr-fmt-guard.patch @@ -0,0 +1,43 @@ +SLASH local modification to the pinned QDMA submodule (libqdma). + +libqdma: make qdma_platform_env.h self-sufficient for pr_fmt + +SLASH force-includes driver/slash_compat.h into every TU (driver/Makefile) so +kernel-API shims such as from_timer() reach the pinned libqdma sources. That +header pulls in early (via ) and then #undefs +pr_fmt, so each libqdma .c that sets its own + #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ +at the top compiles without a "pr_fmt redefined" warning. + +The qdma_access HAL files don't set their own pr_fmt -- they log via +qdma_log_* -> pr_*, which expand pr_fmt at the call site -- so after that +#undef they would reference an undefined pr_fmt ( is already +include-guarded by the time they include it, so its guarded default can no +longer re-arm). Re-arm the kernel default here, guarded by #ifndef so the +libqdma sources that do set a custom pr_fmt before including this header keep +it. Behaviour for the HAL files is unchanged (the kernel default is "fmt"). + +Generated against qdma_drv @ e0168be (pinned submodule commit). 
+Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). +diff --git a/qdma_platform_env.h b/qdma_platform_env.h +index fa26c9a..c9e1082 100755 +--- a/qdma_platform_env.h ++++ b/qdma_platform_env.h +@@ -25,6 +25,17 @@ + #define QDMA_SNPRINTF_S(arg1, arg2, arg3, ...) \ + snprintf(arg1, arg3, ##__VA_ARGS__) + ++/* ++ * SLASH: re-arm the kernel-default pr_fmt for TUs that log via qdma_log_* -> ++ * pr_* but never set their own pr_fmt. SLASH force-includes a compat header ++ * that #undefs pr_fmt after is already include-guarded, so the ++ * default can no longer re-arm on its own. Guarded with #ifndef so the libqdma ++ * sources that set a custom pr_fmt before including this header keep it. ++ */ ++#ifndef pr_fmt ++#define pr_fmt(fmt) fmt ++#endif ++ + #define qdma_log_info(x_, ...) pr_info(x_, ##__VA_ARGS__) + #define qdma_log_warning(x_, ...) pr_warn(x_, ##__VA_ARGS__) + #define qdma_log_error(x_, ...) pr_err(x_, ##__VA_ARGS__) diff --git a/driver/slash_compat.h b/driver/slash_compat.h index 2352458c..e6719487 100644 --- a/driver/slash_compat.h +++ b/driver/slash_compat.h @@ -77,4 +77,20 @@ static inline void slash_vm_flags_set(struct vm_area_struct *vma, vm_flags_t fla # endif #endif +/* + * The kernel headers included above ( -> ) install + * the default `#define pr_fmt(fmt) fmt` under an #ifndef guard. Because this + * header is force-included (-include, see driver/Makefile) ahead of every TU, + * that default lands before each pinned libqdma source's own top-of-file + * #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + * turning it into a redefinition ("pr_fmt redefined" warning). Undefine it here + * so each TU starts from the clean "nobody has defined pr_fmt yet" state the + * idiom relies on; the file's own #define is then the first and only one. 
+ * + * TUs that never set their own pr_fmt and log via the kernel default (the + * qdma_access HAL, whose qdma_log_* macros expand to pr_*) re-arm that default + * from qdma_platform_env.h; see driver/patches/0003-libqdma-pr-fmt-guard.patch. + */ +#undef pr_fmt + #endif /* SLASH_COMPAT_H */ From 6a5a3e97a299e00c88b9769fc447639a7cea547c Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 10 Jun 2026 11:16:07 +0100 Subject: [PATCH 07/23] driver: verify qdma host-profile readback and add tunable hugepage descriptor size Signed-off-by: Vlad-Gabriel Serbu --- driver/slash_qdma.c | 298 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 254 insertions(+), 44 deletions(-) diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index 18c78206..ebdf9949 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -130,9 +130,21 @@ #define SLASH_QDMA_HP_NUM_WORDS 8 #define SLASH_QDMA_HP_SEL 0xAu /* QDMA_CTXT_SELC_HOST_PROFILE */ #define SLASH_QDMA_HP_OP_WR 0x1u /* indirect context WR opcode */ +#define SLASH_QDMA_HP_OP_RD 0x2u /* indirect context RD opcode */ #define SLASH_QDMA_HP_SMID_BASE 0x100u /* bit 8 set; base AXI-MM master ID */ #define SLASH_QDMA_HP_POLL_US 1000 /* busy-wait budget in microseconds */ +/* + * The qpair fd data path accepts either a span of 4 KiB base pages or a span + * of 2 MiB hugetlb pages. Every scatter-gather entry within one request uses + * the same granule, which keeps the DMA mapping semantics unambiguous; the two + * granules are never mixed in a single request. A whole transfer (of either + * granule) is submitted to libqdma as a single multi-descriptor request, and + * libqdma refills the descriptor ring as needed -- so the transfer size is not + * bounded by the ring depth. + */ +#define SLASH_QDMA_HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) + /* * qdma_force_mm_channel - Debug/experiment override for AXI-MM/NoC channel * assignment of newly-added queue pairs. 
@@ -181,6 +193,53 @@ module_param_cb(qdma_force_mm_channel, &slash_qdma_force_mm_channel_ops, MODULE_PARM_DESC(qdma_force_mm_channel, "Force QDMA AXI-MM/NoC channel for new queues: <0=auto(qid&1), 0 or 1 to pin (default -1)"); +/* + * qdma_huge_desc_size - Experimental descriptor granularity for hugetlb-backed + * raw qpair transfers. + * + * The userspace raw-transfer path prefers 2 MiB hugetlb pages so the host page + * size stays large and stable. By default, each pinned 2 MiB page becomes one + * SGL entry / QDMA descriptor. Reducing this value keeps the same pinned + * hugetlb page but emits several descriptors with increasing offsets inside + * that page, allowing us to test whether descriptor pressure (rather than host + * page size) is what makes dma-perf faster. + * + * Must be a page-aligned divisor of 2 MiB. Examples: + * 2097152 -> current behaviour (1 descriptor per huge page) + * 65536 -> 32 descriptors per huge page + * 4096 -> 512 descriptors per huge page + */ +static unsigned int qdma_huge_desc_size = SLASH_QDMA_HUGEPAGE_SIZE; + +static int slash_qdma_huge_desc_size_set(const char *val, + const struct kernel_param *kp) +{ + unsigned int parsed; + int err; + + err = kstrtouint(val, 0, &parsed); + if (err) + return err; + + if (parsed < PAGE_SIZE || + parsed > SLASH_QDMA_HUGEPAGE_SIZE || + !IS_ALIGNED(parsed, PAGE_SIZE) || + (SLASH_QDMA_HUGEPAGE_SIZE % parsed) != 0) + return -EINVAL; + + return param_set_uint(val, kp); +} + +static const struct kernel_param_ops slash_qdma_huge_desc_size_ops = { + .set = slash_qdma_huge_desc_size_set, + .get = param_get_uint, +}; + +module_param_cb(qdma_huge_desc_size, &slash_qdma_huge_desc_size_ops, + &qdma_huge_desc_size, 0644); +MODULE_PARM_DESC(qdma_huge_desc_size, + "Descriptor size for 2 MiB hugetlb raw transfers; page-aligned divisor of 2 MiB (default 2097152)"); + /** * SLASH_QDMA_QTYPE_COUNT - Number of queue types tracked per queue pair. 
* @@ -198,17 +257,6 @@ MODULE_PARM_DESC(qdma_force_mm_channel, */ #define SLASH_QDMA_MAX_QPAIRS 256 -/* - * The qpair fd data path accepts either a span of 4 KiB base pages or a span - * of 2 MiB hugetlb pages. Every scatter-gather entry within one request uses - * the same granule, which keeps the DMA mapping semantics unambiguous; the two - * granules are never mixed in a single request. A whole transfer (of either - * granule) is submitted to libqdma as a single multi-descriptor request, and - * libqdma refills the descriptor ring as needed -- so the transfer size is not - * bounded by the ring depth. - */ -#define SLASH_QDMA_HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) - /* * Upper bound on the number of pages pinned per get_user_pages_fast() call when * mapping a multi-page base-page transfer. Bounds the work done in a single @@ -1009,7 +1057,113 @@ static void slash_qdma_hp_set_field(u32 *words, unsigned int hi, } /** - * slash_qdma_write_host_profile() - Program one CPM5 Host Profile entry. + * slash_qdma_hp_wait_ready() - Poll the indirect-context BUSY bit. + * @device: QDMA device (provides the libqdma handle for register access). + * @val_out: If non-NULL, receives the last QDMA_IND_CTXT_CMD value read. + * + * Spins (up to SLASH_QDMA_HP_POLL_US microseconds) until the indirect + * context command BUSY bit clears. Logging is left to the caller so the + * write path can treat a timeout as fatal while the readback path can treat + * it as a warning. + * + * Return: 0 once not busy, -ETIMEDOUT on timeout, or a negative errno from + * the register read. 
+ */ +static int slash_qdma_hp_wait_ready(struct slash_qdma_dev *device, u32 *val_out) +{ + unsigned int waited_us = 0; + u32 val = 0; + int err; + + do { + err = qdma_device_read_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, &val); + if (err) + return err; + if (!(val & SLASH_QDMA_HP_CMD_BUSY)) { + if (val_out) + *val_out = val; + return 0; + } + udelay(1); + } while (++waited_us < SLASH_QDMA_HP_POLL_US); + + if (val_out) + *val_out = val; + return -ETIMEDOUT; +} + +/** + * slash_qdma_hp_get_field() - Read a bit field from the host profile context. + * @words: Array of SLASH_QDMA_HP_NUM_WORDS u32s holding the 256-bit context + * (word i covers bits [32*i+31 : 32*i]). + * @hi: Most-significant bit index of the field (inclusive). + * @lo: Least-significant bit index of the field (inclusive). + * + * Inverse of slash_qdma_hp_set_field(); handles fields that straddle a + * 32-bit word boundary (e.g. the C2H AXI4-MM steering field at bits + * [97:94], which spans words 2 and 3). + * + * Return: the value held in [hi:lo]. + */ +static u32 slash_qdma_hp_get_field(const u32 *words, unsigned int hi, + unsigned int lo) +{ + unsigned int width = hi - lo + 1; + u32 fmask = (width >= 32) ? ~0u : ((1u << width) - 1u); + unsigned int word = lo >> 5; + unsigned int off = lo & 31; + u64 two = (u64)words[word]; + + if ((word + 1) < SLASH_QDMA_HP_NUM_WORDS) + two |= (u64)words[word + 1] << 32; + + return (u32)((two >> off) & fmask); +} + +/** + * slash_qdma_read_host_profile() - Read one CPM5 Host Profile entry back. + * @device: QDMA device (provides the libqdma handle for register access). + * @host_id: Host Profile index to read. + * @out: Array of SLASH_QDMA_HP_NUM_WORDS u32s that receives the 256-bit + * context. + * + * Issues an indirect-context RD command for the host-profile selector, + * waits for the controller to complete it, and copies the IND_CTXT_DATA + * words back. Used to verify a preceding write. 
+ * + * Return: 0 on success, negative errno on register-access error or + * -ETIMEDOUT if the BUSY bit never clears. + */ +static int slash_qdma_read_host_profile(struct slash_qdma_dev *device, + u32 host_id, u32 *out) +{ + u32 cmd = (host_id << 7) | (SLASH_QDMA_HP_OP_RD << 5) | + (SLASH_QDMA_HP_SEL << 1); + int err; + int i; + + err = qdma_device_write_config_register(device->qdma_handle, + SLASH_QDMA_HP_CMD_ADDR, cmd); + if (err) + return err; + + err = slash_qdma_hp_wait_ready(device, NULL); + if (err) + return err; + + for (i = 0; i < SLASH_QDMA_HP_NUM_WORDS; i++) { + err = qdma_device_read_config_register(device->qdma_handle, + SLASH_QDMA_HP_DATA_ADDR + (i * sizeof(u32)), &out[i]); + if (err) + return err; + } + + return 0; +} + +/** + * slash_qdma_write_host_profile() - Program and verify one CPM5 Host Profile. * @device: QDMA device (provides the libqdma handle for register access). * @host_id: Host Profile index to program (also the AXI4-MM steering value, * i.e. the target NoC channel). @@ -1019,6 +1173,11 @@ static void slash_qdma_hp_set_field(u32 *words, unsigned int hi, * registers via the libqdma-exported config register accessors, and * polls the command BUSY bit until the controller completes the write. * + * Once the write completes it reads the profile back and verifies the + * programmed fields (SMID and the two steering fields); a readback error + * or field mismatch is logged but is non-fatal (the profile is still + * considered applied). + * * Only the SMID and the two steering fields are non-zero; the AXI * prot/cache attributes are left at 0. 
* @@ -1029,7 +1188,6 @@ static int slash_qdma_write_host_profile(struct slash_qdma_dev *device, u32 host_id) { u32 data[SLASH_QDMA_HP_NUM_WORDS] = {0}; - unsigned int waited_us = 0; u32 smid = SLASH_QDMA_HP_SMID_BASE + host_id; u32 cmd; u32 val = 0; @@ -1065,26 +1223,46 @@ static int slash_qdma_write_host_profile(struct slash_qdma_dev *device, goto err_reg; /* Wait for the controller to consume the command. */ - do { - err = qdma_device_read_config_register(device->qdma_handle, - SLASH_QDMA_HP_CMD_ADDR, &val); - if (err) - goto err_reg; - if (!(val & SLASH_QDMA_HP_CMD_BUSY)) - break; - udelay(1); - } while (++waited_us < SLASH_QDMA_HP_POLL_US); - - if (val & SLASH_QDMA_HP_CMD_BUSY) { + err = slash_qdma_hp_wait_ready(device, &val); + if (err == -ETIMEDOUT) { dev_err(&device->pdev->dev, "qdma: host profile %u programming timed out (cmd=0x%x)\n", host_id, val); return -ETIMEDOUT; } + if (err) + goto err_reg; - dev_info(&device->pdev->dev, - "slash: qdma: host profile %u applied: H2C/C2H AXI-MM steering=%u (NoC channel %u), smid=0x%03x (cmd=0x%02x)\n", - host_id, host_id, host_id, smid, cmd); + /* + * Read the profile back and verify the programmed fields. A readback + * error or field mismatch is non-fatal: the write itself completed, so + * the profile is still considered applied. 
+ */ + { + u32 rb[SLASH_QDMA_HP_NUM_WORDS] = {0}; + int rerr = slash_qdma_read_host_profile(device, host_id, rb); + + if (rerr) { + dev_warn(&device->pdev->dev, + "slash: qdma: host profile %u applied (cmd=0x%02x) but readback failed: %d\n", + host_id, cmd, rerr); + } else { + u32 smid_rb = slash_qdma_hp_get_field(rb, 201, 192); + u32 h2c_rb = slash_qdma_hp_get_field(rb, 181, 178); + u32 c2h_rb = slash_qdma_hp_get_field(rb, 97, 94); + + if (smid_rb == smid && h2c_rb == host_id && c2h_rb == host_id) { + dev_info(&device->pdev->dev, + "slash: qdma: host profile %u applied and readback verified: H2C/C2H AXI-MM steering=%u (NoC channel %u), smid=0x%03x (cmd=0x%02x)\n", + host_id, host_id, host_id, smid, cmd); + } else { + dev_err(&device->pdev->dev, + "slash: qdma: host profile %u readback MISMATCH: smid exp=0x%03x got=0x%03x, h2c exp=%u got=%u, c2h exp=%u got=%u\n", + host_id, smid, smid_rb, host_id, h2c_rb, + host_id, c2h_rb); + } + } + } return 0; err_reg: @@ -2458,49 +2636,81 @@ static int slash_qdma_map_user_huge_page_to_sgl(struct slash_qdma_io_cb *iocb, bool write) { unsigned long addr = (unsigned long)iocb->buf; - size_t entries = iocb->len / SLASH_QDMA_HUGEPAGE_SIZE; + size_t huge_pages = iocb->len / SLASH_QDMA_HUGEPAGE_SIZE; + unsigned int desc_size = READ_ONCE(qdma_huge_desc_size); + unsigned int descs_per_page; + size_t entries; unsigned int i; + unsigned int sg_idx = 0; int rv; if ((iocb->len % SLASH_QDMA_HUGEPAGE_SIZE) != 0 || - entries == 0 || entries > UINT_MAX) + huge_pages == 0 || huge_pages > UINT_MAX) + return -EINVAL; + + if (desc_size < PAGE_SIZE || + desc_size > SLASH_QDMA_HUGEPAGE_SIZE || + !IS_ALIGNED(desc_size, PAGE_SIZE) || + (SLASH_QDMA_HUGEPAGE_SIZE % desc_size) != 0) + return -EINVAL; + + descs_per_page = SLASH_QDMA_HUGEPAGE_SIZE / desc_size; + if (huge_pages > UINT_MAX / descs_per_page) return -EINVAL; + entries = huge_pages * descs_per_page; rv = slash_qdma_iocb_alloc_sgl(iocb, (unsigned int)entries); if (rv) return rv; - for (i = 0; i < 
entries; i++) { + for (i = 0; i < huge_pages; i++) { unsigned long curr_addr = addr + (i * SLASH_QDMA_HUGEPAGE_SIZE); - struct qdma_sw_sg *sg = &iocb->sgl[i]; + struct page *page = NULL; + unsigned int j; - rv = get_user_pages_fast(curr_addr, 1, 1 /* write */, &iocb->pages[i]); + rv = get_user_pages_fast(curr_addr, 1, 1 /* write */, &page); if (rv != 1) { pr_err("slash: qdma: unable to pin 2 MiB user page %u/%zu, %d\n", - i, entries, rv); + i, huge_pages, rv); rv = rv < 0 ? rv : -EFAULT; goto err_out; } - iocb->pages_nr = i + 1; - if (!slash_qdma_page_is_2m_hugetlb_head(iocb->pages[i])) { + if (!slash_qdma_page_is_2m_hugetlb_head(page)) { pr_err("slash: qdma: 2 MiB transfer page %u/%zu is not backed by a 2 MiB hugetlb head page\n", - i, entries); + i, huge_pages); + put_page(page); rv = -EINVAL; goto err_out; } - flush_dcache_page(iocb->pages[i]); + flush_dcache_page(page); - sg->next = (i + 1 < entries) ? &iocb->sgl[i + 1] : NULL; - sg->pg = iocb->pages[i]; - sg->offset = 0; - sg->len = SLASH_QDMA_HUGEPAGE_SIZE; - sg->dma_addr = 0UL; + for (j = 0; j < descs_per_page; j++, sg_idx++) { + struct qdma_sw_sg *sg = &iocb->sgl[sg_idx]; + + /* + * The first segment consumes the GUP reference. Additional + * descriptors over the same hugetlb page take explicit references + * so slash_qdma_unmap_user_buf() can release one page ref per SGL + * entry without special casing repeated pages. + */ + if (j != 0) + get_page(page); + + iocb->pages[sg_idx] = page; + iocb->pages_nr = sg_idx + 1; + + sg->next = (sg_idx + 1 < entries) ? 
&iocb->sgl[sg_idx + 1] : NULL; + sg->pg = page; + sg->offset = j * desc_size; + sg->len = desc_size; + sg->dma_addr = 0UL; + } } - SLASH_QDMA_OP_LOG("user transfer path=hugetlb-2m addr=0x%lx len=%zu pages=%zu write=%d\n", - addr, iocb->len, entries, write); + SLASH_QDMA_OP_LOG("user transfer path=hugetlb-2m addr=0x%lx len=%zu pages=%zu desc_size=%u descs=%zu write=%d\n", + addr, iocb->len, huge_pages, desc_size, entries, write); return 0; From b2a9e24f17dd9249fc58bc81a6854d5d082d4670 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 10 Jun 2026 11:16:48 +0100 Subject: [PATCH 08/23] smi: add validate buffer placement, channel allocation, and bandwidth knobs Signed-off-by: Vlad-Gabriel Serbu --- smi/src/smi.cpp | 48 ++++ smi/src/validate.cpp | 638 +++++++++++++++++++++++++++++++++++++------ smi/src/validate.hpp | 32 +++ 3 files changed, 630 insertions(+), 88 deletions(-) diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index a1d84f19..dba4fa9f 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -27,6 +27,9 @@ /// reset, validate, debug). #include +#include +#include +#include #include #include @@ -108,11 +111,33 @@ static int smiMain(int argc, char **argv) { // -- validate (memory integrity + bandwidth) -- auto* validateCommand = app.add_subcommand("validate", "Validate board memory (integrity + bandwidth)"); Validate::Options validateOptions; + auto addValidateSizeOption = [&](const char* name, uint64_t* target, const char* description) { + return validateCommand->add_option_function( + name, + [target, name, &validateOptions](const std::string& value) { + try { + *target = Validate::parseByteSizeOption(value); + validateOptions.placementExplicit = true; + } catch (const std::exception& e) { + throw CLI::ValidationError(name, e.what()); + } + }, + description); + }; validateCommand->add_option("-d,--device", validateOptions.bdf, "Board address (e.g. 
03:00 or 0000:03:00)")->required(); validateCommand->add_option("-j,--threads", validateOptions.threads, "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u)); validateCommand->add_flag("-R,--no-reset", validateOptions.noReset, "Skip the device reset step before running memory tests"); + addValidateSizeOption("--buffer-size", &validateOptions.bufferSize, + "Size of each validate buffer; accepts bytes or k/K/m/M suffixes (max 512M)") + ->default_str("512M"); + addValidateSizeOption("--offset", &validateOptions.offset, + "Distance between logical validate buffer positions; accepts bytes or k/K/m/M suffixes") + ->default_str("512M"); + addValidateSizeOption("--starting-offset", &validateOptions.startingOffset, + "Offset from each memory-space base for logical position 0; accepts bytes or k/K/m/M suffixes") + ->default_str("0"); auto* rawTransferFlag = validateCommand->add_flag("--raw-transfer-test", validateOptions.rawTransferTest, "Use libslash raw QDMA transfers instead of VRTD buffers (implies --no-reset)"); auto* useQdmaDriverFlag = validateCommand->add_flag("--use-qdma-driver", validateOptions.useQdmaDriver, @@ -127,6 +152,29 @@ static int smiMain(int argc, char **argv) { "Run only HBM memory tests (skip DDR)"); ddrOnlyFlag->excludes(hbmOnlyFlag); hbmOnlyFlag->excludes(ddrOnlyFlag); + const std::map channelAllocationMap{ + {"auto", Validate::Options::ChannelAllocation::Auto}, + {"paired", Validate::Options::ChannelAllocation::Paired}, + }; + validateCommand->add_option("--channel-allocation", validateOptions.channelAllocation, + "Raw-transfer NoC channel/memory placement (raw modes only): " + "auto (interleaved: mm-channel=qid&1, linear addressing; default) or " + "paired (couple mm-channel to a distinct memory region/NSU per " + "--channel-region-stride, mirroring dma-perf offset_ch0/offset_ch1)") + ->transform(CLI::CheckedTransformer(channelAllocationMap, CLI::ignore_case)) + ->default_str("auto"); + 
addValidateSizeOption("--channel-region-stride", &validateOptions.channelRegionStride, + "In --channel-allocation paired mode, byte distance between the two per-channel " + "memory regions (NSU/pseudo-channel stride); accepts k/K/m/M/g/G suffixes") + ->default_str("16G"); + validateCommand->add_option("--bandwidth-iterations", validateOptions.bandwidthIterations, + "Raw-transfer bandwidth mode only: repeat each whole-buffer transfer this many times") + ->default_val(1)->check(CLI::Range(static_cast(1), + std::numeric_limits::max())); + validateCommand->add_option("--bandwidth-duration", validateOptions.bandwidthDuration, + "Raw-transfer bandwidth mode only: repeat whole-buffer transfers for this many seconds " + "(0 disables duration mode)") + ->default_val(0.0)->check(CLI::NonNegativeNumber); // -- debug (low-level debug utilities) -- auto* debugCommand = app.add_subcommand("debug", "Low-level debug utilities"); diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index dbc9bd88..899afab4 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -41,6 +41,8 @@ #include #include +#include +#include #include #include #include @@ -55,6 +57,7 @@ #include #include #include +#include #include #include @@ -78,24 +81,273 @@ namespace { using smi::raw::throwSystemError; -/// Buffer size for each allocation (512 MB — one allocator region). -static constexpr uint64_t BUFFER_SIZE = 512ULL * 1024 * 1024; - /// Region constants mirror vrt/vrtd/src/allocator.h, which is private. 
static constexpr uint64_t HBM_BASE = 0x4000000000ULL; static constexpr uint64_t DDR_BASE = 0x60000000000ULL; static constexpr uint64_t MEM_REGION_SIZE = 512ULL * 1024 * 1024; +static constexpr uint64_t MEMORY_SPACE_SIZE = 64ULL * MEM_REGION_SIZE; +static constexpr uint64_t MAX_BUFFER_SIZE = MEM_REGION_SIZE; +static constexpr uint64_t TRANSFER_ALIGNMENT = 4096ULL; static constexpr uint32_t QDMA_Q_MODE_MM = 0; static constexpr uint32_t QDMA_DIR_H2C = 0x1; static constexpr uint32_t QDMA_DIR_C2H = 0x2; static constexpr uint32_t QDMA_RING_SZ_IDX = 0; +static std::string trim(std::string_view text) { + size_t first = 0; + while (first < text.size() && + std::isspace(static_cast(text[first]))) { + ++first; + } + + size_t last = text.size(); + while (last > first && + std::isspace(static_cast(text[last - 1]))) { + --last; + } + + return std::string{text.substr(first, last - first)}; +} + +static uint64_t parseByteSizeText(std::string_view text) { + std::string value = trim(text); + if (value.empty()) { + throw std::invalid_argument("value must not be empty"); + } + + uint64_t multiplier = 1; + if (!value.empty() && (value.back() == 'b' || value.back() == 'B')) { + value.pop_back(); + } + if (!value.empty()) { + const char suffix = value.back(); + if (suffix == 'k' || suffix == 'K') { + multiplier = 1024ULL; + value.pop_back(); + } else if (suffix == 'm' || suffix == 'M') { + multiplier = 1024ULL * 1024ULL; + value.pop_back(); + } else if (suffix == 'g' || suffix == 'G') { + multiplier = 1024ULL * 1024ULL * 1024ULL; + value.pop_back(); + } + } + + value = trim(value); + if (value.empty() || value.front() == '-' || value.front() == '+') { + throw std::invalid_argument("value must be an unsigned byte count"); + } + + size_t parsed = 0; + uint64_t bytes = 0; + try { + bytes = std::stoull(value, &parsed, 0); + } catch (const std::exception&) { + throw std::invalid_argument("value must be an unsigned byte count"); + } + + if (parsed != value.size()) { + throw 
std::invalid_argument("unrecognized byte-size suffix"); + } + if (bytes > std::numeric_limits::max() / multiplier) { + throw std::invalid_argument("byte-size value is too large"); + } + + return bytes * multiplier; +} + +static bool isAligned(uint64_t value, uint64_t alignment) { + return (value % alignment) == 0; +} + +static bool checkAligned(const char* name, uint64_t value) { + if (!isAligned(value, TRANSFER_ALIGNMENT)) { + std::cerr << "validate: " << name << " must be " << TRANSFER_ALIGNMENT + << "-byte aligned" << std::endl; + return false; + } + return true; +} + +static bool checkMemoryPlacementRange(const char* memoryName, + const Validate::Options& options, + uint64_t positions) { + if (positions == 0) { + return true; + } + + const uint64_t lastPosition = positions - 1; + if (lastPosition != 0 && + options.offset > (std::numeric_limits::max() - options.startingOffset) / + lastPosition) { + std::cerr << "validate: " << memoryName + << " placement overflows 64-bit address arithmetic" << std::endl; + return false; + } + + const uint64_t lastStart = options.startingOffset + lastPosition * options.offset; + if (lastStart > MEMORY_SPACE_SIZE || options.bufferSize > MEMORY_SPACE_SIZE - lastStart) { + std::cerr << "validate: " << memoryName << " placement exceeds available " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space" + << std::endl; + return false; + } + + return true; +} + +/// Paired-mode per-channel region stride (NSU / pseudo-channel spacing), +/// resolving 0 to half the per-memory address space. +static uint64_t pairedRegionStride(const Validate::Options& options) { + return options.channelRegionStride != 0 ? options.channelRegionStride + : (MEMORY_SPACE_SIZE / 2); +} + +/// Placement check for Paired channel allocation: even/odd positions occupy two +/// regions `pairedRegionStride()` bytes apart, each packed by in-region index. +/// Verifies neither region overflows into the next nor past the memory space. 
+static bool checkMemoryPlacementRangePaired(const char* memoryName, + const Validate::Options& options, + uint64_t positions) { + if (positions == 0) { + return true; + } + + const uint64_t stride = pairedRegionStride(options); + if (stride == 0 || (stride % TRANSFER_ALIGNMENT) != 0) { + std::cerr << "validate: --channel-region-stride must be a non-zero multiple of " + << TRANSFER_ALIGNMENT << " bytes" << std::endl; + return false; + } + if (stride > MEMORY_SPACE_SIZE) { + std::cerr << "validate: --channel-region-stride exceeds the " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB per-memory address space" + << std::endl; + return false; + } + + // Highest in-region index used across both regions (positions 0..positions-1, + // split even/odd, each using index = position >> 1). + const uint64_t maxIndex = (positions - 1) >> 1; + if (maxIndex != 0 && + options.offset > (std::numeric_limits::max() - options.startingOffset) / maxIndex) { + std::cerr << "validate: " << memoryName + << " paired placement overflows 64-bit address arithmetic" << std::endl; + return false; + } + const uint64_t lastStart = options.startingOffset + maxIndex * options.offset; + + // Each region must hold its last buffer without spilling into the next region. + if (lastStart > stride || options.bufferSize > stride - lastStart) { + std::cerr << "validate: " << memoryName + << " paired placement overflows the per-channel region (stride " << stride + << " bytes); reduce --threads/--buffer-size/--offset or raise" + " --channel-region-stride" << std::endl; + return false; + } + // Region 1 sits one stride higher and must still fit the memory space. 
+ if (lastStart + options.bufferSize > MEMORY_SPACE_SIZE - stride) { + std::cerr << "validate: " << memoryName + << " paired placement exceeds available " + << (MEMORY_SPACE_SIZE / (1024ULL * 1024ULL)) << " MiB address space" + << std::endl; + return false; + } + return true; +} + +static bool validatePlacement(const Validate::Options& options) { + if (options.bufferSize == 0 || options.bufferSize > MAX_BUFFER_SIZE) { + std::cerr << "validate: --buffer-size must be in the range 1..512M" << std::endl; + return false; + } + if (options.offset == 0) { + std::cerr << "validate: --offset must be greater than zero" << std::endl; + return false; + } + if (!checkAligned("--buffer-size", options.bufferSize) || + !checkAligned("--offset", options.offset) || + !checkAligned("--starting-offset", options.startingOffset)) { + return false; + } + if (options.offset < options.bufferSize) { + std::cerr << "validate: --offset must be at least --buffer-size so buffers do not overlap" + << std::endl; + return false; + } + + const bool paired = + options.channelAllocation == Validate::Options::ChannelAllocation::Paired; + if (paired && !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: --channel-allocation paired only applies to the raw transfer" + " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } + if ((options.bandwidthIterations > 1 || options.bandwidthDuration > 0.0) && + !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: --bandwidth-iterations/--bandwidth-duration only apply to the raw transfer" + " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } + if (options.bandwidthDuration < 0.0) { + std::cerr << "validate: --bandwidth-duration must be non-negative" << std::endl; + return false; + } + + const uint64_t positions = 2ULL * options.threads; + const auto checkRange = paired ? 
checkMemoryPlacementRangePaired : checkMemoryPlacementRange; + if (!options.ddrOnly && !checkRange("HBM", options, positions)) { + return false; + } + if (!options.hbmOnly && !checkRange("DDR", options, positions)) { + return false; + } + + return true; +} + +static uint64_t addressFor(uint64_t memoryBase, + const Validate::Options& options, + uint64_t position) { + return memoryBase + options.startingOffset + position * options.offset; +} + +/// Device address for a raw-transfer buffer, honouring the channel-allocation +/// strategy. In Paired mode the mm-channel (position&1 -- which SLASH maps to +/// the SW-context host_id and hence the CPM5 NoC NMU) is coupled to a distinct +/// memory region (NSU): even positions land in region 0, odd positions in +/// region 1, pairedRegionStride() bytes higher, each packed by its in-region +/// index. This mirrors dma-perf's offset_ch0/offset_ch1 so the two NMUs drive +/// independent memory endpoints instead of converging on one. +static uint64_t rawAddressFor(uint64_t memoryBase, + const Validate::Options& options, + uint64_t position) { + if (options.channelAllocation == Validate::Options::ChannelAllocation::Paired) { + const uint64_t channel = position & 1ULL; + const uint64_t inRegionIndex = position >> 1; + return memoryBase + channel * pairedRegionStride(options) + + options.startingOffset + inRegionIndex * options.offset; + } + return addressFor(memoryBase, options, position); +} + +/// Print which raw-transfer channel-allocation strategy is in effect. 
+static void printChannelAllocation(const Validate::Options& options) { + if (options.channelAllocation == Validate::Options::ChannelAllocation::Paired) { + std::cout << "Channel allocation: paired (even positions -> mm-channel 0 / region 0, " + "odd -> mm-channel 1 / region 1; region stride 0x" + << std::hex << pairedRegionStride(options) << std::dec << " bytes)" << std::endl; + } else { + std::cout << "Channel allocation: auto (mm-channel = qid&1, linear addressing)" << std::endl; + } +} + static bool checkHostMemoryBudget(const Validate::Options& options) { const uint64_t maxConcurrentBuffers = (!options.ddrOnly && !options.hbmOnly) ? 4ULL * options.threads : 2ULL * options.threads; - const uint64_t requiredBytes = maxConcurrentBuffers * BUFFER_SIZE; + const uint64_t requiredBytes = maxConcurrentBuffers * options.bufferSize; const long pageSize = sysconf(_SC_PAGESIZE); const long availablePages = sysconf(_SC_AVPHYS_PAGES); @@ -448,6 +700,35 @@ static void printBandwidthMetric(const char* label, double mbps) { << mbps << " MB/s" << std::endl; } +struct BandwidthRepeatOptions { + uint64_t iterations = 1; + std::chrono::duration duration{0.0}; + + bool durationMode() const { + return duration.count() > 0.0; + } + + bool isRepeated() const { + return durationMode() || iterations > 1; + } +}; + +static BandwidthRepeatOptions repeatOptionsFromValidate(const Validate::Options& options) { + BandwidthRepeatOptions repeat; + repeat.iterations = std::max(1, options.bandwidthIterations); + repeat.duration = std::chrono::duration(options.bandwidthDuration); + return repeat; +} + +static void printBandwidthRepeatMode(const BandwidthRepeatOptions& repeat) { + if (repeat.durationMode()) { + std::cout << "Bandwidth mode: duration " << std::fixed << std::setprecision(3) + << repeat.duration.count() << " s" << std::endl; + } else if (repeat.iterations > 1) { + std::cout << "Bandwidth mode: " << repeat.iterations << " iterations" << std::endl; + } +} + template static uint64_t 
fillBuffers(std::vector& buffers, int value) { uint64_t totalBytes = 0; @@ -497,9 +778,81 @@ static void runTransfers(std::vector& buffers, bool toDevice) { } } +static uint64_t joinRepeatedTransferThreads(std::vector& threads, + std::vector& errors, + const std::vector& bytes) { + for (auto& t : threads) { + t.join(); + } + for (auto& error : errors) { + if (error) { + std::rethrow_exception(error); + } + } + + uint64_t totalBytes = 0; + for (uint64_t value : bytes) { + totalBytes += value; + } + return totalBytes; +} + +template +static std::pair> +runRepeatedTransfers(std::vector& buffers, + bool toDevice, + const BandwidthRepeatOptions& repeat) { + std::vector threads; + std::vector errors(buffers.size()); + std::vector bytes(buffers.size(), 0); + threads.reserve(buffers.size()); + + const auto start = std::chrono::steady_clock::now(); + const auto deadline = start + repeat.duration; + + for (size_t i = 0; i < buffers.size(); ++i) { + threads.emplace_back([&buffers, &errors, &bytes, i, toDevice, repeat, deadline] { + try { + const uint64_t size = buffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + if (toDevice) { + buffers[i].syncToDevice(0, size); + } else { + buffers[i].syncFromDevice(0, size); + } + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + if (toDevice) { + buffers[i].syncToDevice(0, size); + } else { + buffers[i].syncFromDevice(0, size); + } + ++completed; + } + } + + bytes[i] = completed * size; + } catch (...) { + errors[i] = std::current_exception(); + } + }); + } + + const uint64_t totalBytes = joinRepeatedTransferThreads(threads, errors, bytes); + const auto end = std::chrono::steady_clock::now(); + return {totalBytes, end - start}; +} + template -static double testSingleDirectionBandwidth(std::vector& buffers, bool toDevice) { - const uint64_t totalBytes = fillBuffers(buffers, toDevice ? 
0xAB : 0xCD); +static double testSingleDirectionBandwidth(std::vector& buffers, + bool toDevice, + const BandwidthRepeatOptions& repeat = {}) { + (void)fillBuffers(buffers, toDevice ? 0xAB : 0xCD); if (!toDevice) { runTransfers(buffers, /*toDevice=*/true); @@ -508,18 +861,17 @@ static double testSingleDirectionBandwidth(std::vector& buffers, bool to } } - const auto start = std::chrono::steady_clock::now(); - runTransfers(buffers, toDevice); - const auto end = std::chrono::steady_clock::now(); + const auto [totalBytes, elapsed] = runRepeatedTransfers(buffers, toDevice, repeat); - return mbPerSecond(totalBytes, end - start); + return mbPerSecond(totalBytes, elapsed); } template static void testBidirectionalBandwidth(std::vector& writeBuffers, - std::vector& readBuffers) { - const uint64_t writeBytes = fillBuffers(writeBuffers, 0xAB); - const uint64_t readBytes = fillBuffers(readBuffers, 0xCD); + std::vector& readBuffers, + const BandwidthRepeatOptions& repeat = {}) { + (void)fillBuffers(writeBuffers, 0xAB); + (void)fillBuffers(readBuffers, 0xCD); // Prime device memory before timing so the C2H side reads initialized data. 
runTransfers(readBuffers, /*toDevice=*/true); @@ -529,11 +881,62 @@ static void testBidirectionalBandwidth(std::vector& writeBuffers, std::vector threads; std::vector errors(writeBuffers.size() + readBuffers.size()); + std::vector writeThreadBytes(writeBuffers.size(), 0); + std::vector readThreadBytes(readBuffers.size(), 0); threads.reserve(errors.size()); const auto start = std::chrono::steady_clock::now(); - launchTransferThreads(writeBuffers, /*toDevice=*/true, threads, errors, 0); - launchTransferThreads(readBuffers, /*toDevice=*/false, threads, errors, writeBuffers.size()); + const auto deadline = start + repeat.duration; + + for (size_t i = 0; i < writeBuffers.size(); ++i) { + threads.emplace_back([&writeBuffers, &errors, &writeThreadBytes, i, repeat, deadline] { + try { + const uint64_t size = writeBuffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + writeBuffers[i].syncToDevice(0, size); + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + writeBuffers[i].syncToDevice(0, size); + ++completed; + } + } + + writeThreadBytes[i] = completed * size; + } catch (...) { + errors[i] = std::current_exception(); + } + }); + } + for (size_t i = 0; i < readBuffers.size(); ++i) { + threads.emplace_back([&readBuffers, &errors, &readThreadBytes, i, + repeat, deadline, errorOffset = writeBuffers.size()] { + try { + const uint64_t size = readBuffers[i].getSize(); + uint64_t completed = 0; + + if (repeat.durationMode()) { + while (std::chrono::steady_clock::now() < deadline) { + readBuffers[i].syncFromDevice(0, size); + ++completed; + } + } else { + for (uint64_t iter = 0; iter < repeat.iterations; ++iter) { + readBuffers[i].syncFromDevice(0, size); + ++completed; + } + } + + readThreadBytes[i] = completed * size; + } catch (...) 
{ + errors[errorOffset + i] = std::current_exception(); + } + }); + } for (auto& t : threads) { t.join(); @@ -547,6 +950,14 @@ static void testBidirectionalBandwidth(std::vector& writeBuffers, } const auto elapsed = end - start; + uint64_t writeBytes = 0; + uint64_t readBytes = 0; + for (uint64_t value : writeThreadBytes) { + writeBytes += value; + } + for (uint64_t value : readThreadBytes) { + readBytes += value; + } const double writeMBps = mbPerSecond(writeBytes, elapsed); const double readMBps = mbPerSecond(readBytes, elapsed); @@ -558,29 +969,55 @@ static void testBidirectionalBandwidth(std::vector& writeBuffers, template static void testBandwidthSuite(std::vector& singleDirectionBuffers, const std::string& label, - const std::string& backendSuffix) { + const std::string& backendSuffix, + const BandwidthRepeatOptions& repeat = {}) { std::cout << "Testing " << label << " read bandwidth (" << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." << std::endl; - printBandwidthMetric("Read", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/false)); + printBandwidthMetric("Read", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/false, repeat)); std::cout << "Testing " << label << " write bandwidth (" << singleDirectionBuffers.size() << " threads" << backendSuffix << ")..." 
<< std::endl; - printBandwidthMetric("Write", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/true)); + printBandwidthMetric("Write", testSingleDirectionBandwidth(singleDirectionBuffers, /*toDevice=*/true, repeat)); } template static void testBidirectionalBandwidthSuite(std::vector& bidirectionalWriteBuffers, std::vector& bidirectionalReadBuffers, const std::string& label, - const std::string& backendSuffix) { + const std::string& backendSuffix, + const BandwidthRepeatOptions& repeat = {}) { std::cout << "Testing " << label << " bidirectional bandwidth (" << (bidirectionalWriteBuffers.size() + bidirectionalReadBuffers.size()) << " threads" << backendSuffix << ")..." << std::endl; - testBidirectionalBandwidth(bidirectionalWriteBuffers, bidirectionalReadBuffers); + testBidirectionalBandwidth(bidirectionalWriteBuffers, bidirectionalReadBuffers, repeat); +} + +static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device, + const Validate::Options& options, + uint64_t position) { + if (options.placementExplicit) { + return device.openRawBuffer(addressFor(HBM_BASE, options, position), + options.bufferSize); + } + + return device.openHbmBuffer(static_cast(position), options.bufferSize); +} + +static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, + const Validate::Options& options, + uint64_t position) { + if (options.placementExplicit) { + return device.openRawBuffer(addressFor(DDR_BASE, options, position), + options.bufferSize); + } + + (void)position; + return device.openDdrBuffer(options.bufferSize); } static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { const unsigned N = options.threads; + const BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options); if (!options.noReset) { std::cout << "Raw transfer mode skips reset; continuing without VRTD reset." 
<< std::endl; @@ -589,6 +1026,8 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o const std::string qdmaPath = resolveQdmaDevicePath(bdf); std::cout << "Using raw QDMA device " << qdmaPath << "..." << std::endl; + printChannelAllocation(options); + printBandwidthRepeatMode(repeat); RawQdmaDevice qdma(qdmaPath); @@ -598,8 +1037,8 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o std::vector hbmBuffers; hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - hbmBuffers.emplace_back(qdma.get(), HBM_BASE + i * MEM_REGION_SIZE, - BUFFER_SIZE); + hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), + options.bufferSize); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -607,7 +1046,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o return 1; } - testBandwidthSuite(hbmBuffers, "HBM", ", raw QDMA"); + testBandwidthSuite(hbmBuffers, "HBM", ", raw QDMA", repeat); } { // Bidirectional HBM: positions interleave R/W across regions @@ -618,14 +1057,14 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o hbmReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma.get(), - HBM_BASE + (2 * i) * MEM_REGION_SIZE, - BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize); hbmWriteBuffers.emplace_back(qdma.get(), - HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, - BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA"); + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA", repeat); } } @@ -635,8 +1074,8 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o std::vector ddrBuffers; ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - ddrBuffers.emplace_back(qdma.get(), DDR_BASE + i * 
BUFFER_SIZE, - BUFFER_SIZE); + ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), + options.bufferSize); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -644,7 +1083,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o return 1; } - testBandwidthSuite(ddrBuffers, "DDR", ", raw QDMA"); + testBandwidthSuite(ddrBuffers, "DDR", ", raw QDMA", repeat); } { // Bidirectional DDR: positions interleave R/W across slot indices @@ -655,14 +1094,14 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o ddrReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma.get(), - DDR_BASE + (2 * i) * BUFFER_SIZE, - BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize); ddrWriteBuffers.emplace_back(qdma.get(), - DDR_BASE + (2 * i + 1) * BUFFER_SIZE, - BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA"); + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA", repeat); } } @@ -671,15 +1110,15 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o std::vector parBuffers; parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { - parBuffers.emplace_back(qdma.get(), HBM_BASE + i * MEM_REGION_SIZE, - BUFFER_SIZE); + parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), + options.bufferSize); } for (unsigned i = 0; i < N; ++i) { - parBuffers.emplace_back(qdma.get(), DDR_BASE + i * BUFFER_SIZE, - BUFFER_SIZE); + parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), + options.bufferSize); } - testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA"); + testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA", repeat); } { // Bidirectional HBM+DDR: 4N positions total. 
Positions 0..2N-1 @@ -692,22 +1131,22 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o parReadBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), - HBM_BASE + (2 * i) * MEM_REGION_SIZE, - BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize); parWriteBuffers.emplace_back(qdma.get(), - HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, - BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), - DDR_BASE + (2 * i) * BUFFER_SIZE, - BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize); parWriteBuffers.emplace_back(qdma.get(), - DDR_BASE + (2 * i + 1) * BUFFER_SIZE, - BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA"); + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA", repeat); } } @@ -727,6 +1166,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op return 1; #else const unsigned N = options.threads; + const BandwidthRepeatOptions repeat = repeatOptionsFromValidate(options); if (!options.noReset) { std::cout << "QDMA-driver raw mode skips reset; continuing without VRTD reset." << std::endl; @@ -736,6 +1176,8 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op const bool runParallel = !options.ddrOnly && !options.hbmOnly; std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." << std::endl; + printChannelAllocation(options); + printBandwidthRepeatMode(repeat); smi::qdma_driver::QdmaDriverDevice qdma(bdf); std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl; qdma.ensureQmax(runParallel ? 
4 * N : 2 * N); @@ -754,7 +1196,8 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::vector hbmBuffers; hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - hbmBuffers.emplace_back(qdma, i, HBM_BASE + i * MEM_REGION_SIZE, BUFFER_SIZE); + hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), + options.bufferSize); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -762,7 +1205,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op return 1; } - testBandwidthSuite(hbmBuffers, "HBM", ", QDMA driver"); + testBandwidthSuite(hbmBuffers, "HBM", ", QDMA driver", repeat); } { std::vector hbmWriteBuffers; @@ -771,12 +1214,14 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op hbmReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma, i, - HBM_BASE + (2 * i) * MEM_REGION_SIZE, BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize); hbmWriteBuffers.emplace_back(qdma, N + i, - HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver"); + testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver", repeat); } } @@ -786,7 +1231,8 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::vector ddrBuffers; ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - ddrBuffers.emplace_back(qdma, i, DDR_BASE + i * BUFFER_SIZE, BUFFER_SIZE); + ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i), + options.bufferSize); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -794,7 +1240,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op return 1; } - testBandwidthSuite(ddrBuffers, "DDR", ", QDMA driver"); + testBandwidthSuite(ddrBuffers, "DDR", 
", QDMA driver", repeat); } { std::vector ddrWriteBuffers; @@ -803,12 +1249,14 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op ddrReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma, i, - DDR_BASE + (2 * i) * BUFFER_SIZE, BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize); ddrWriteBuffers.emplace_back(qdma, N + i, - DDR_BASE + (2 * i + 1) * BUFFER_SIZE, BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver"); + testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver", repeat); } } @@ -817,13 +1265,15 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::vector parBuffers; parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { - parBuffers.emplace_back(qdma, i, HBM_BASE + i * MEM_REGION_SIZE, BUFFER_SIZE); + parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), + options.bufferSize); } for (unsigned i = 0; i < N; ++i) { - parBuffers.emplace_back(qdma, N + i, DDR_BASE + i * BUFFER_SIZE, BUFFER_SIZE); + parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i), + options.bufferSize); } - testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver"); + testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver", repeat); } { std::vector parWriteBuffers; @@ -832,18 +1282,22 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op parReadBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, i, - HBM_BASE + (2 * i) * MEM_REGION_SIZE, BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i), + options.bufferSize); parWriteBuffers.emplace_back(qdma, 2 * N + i, - HBM_BASE + (2 * i + 1) * MEM_REGION_SIZE, BUFFER_SIZE); + rawAddressFor(HBM_BASE, options, 2 * i + 1), + options.bufferSize); } for 
(unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, N + i, - DDR_BASE + (2 * i) * BUFFER_SIZE, BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i), + options.bufferSize); parWriteBuffers.emplace_back(qdma, 3 * N + i, - DDR_BASE + (2 * i + 1) * BUFFER_SIZE, BUFFER_SIZE); + rawAddressFor(DDR_BASE, options, 2 * i + 1), + options.bufferSize); } - testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver"); + testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver", repeat); } } @@ -853,21 +1307,19 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op } // namespace +uint64_t Validate::parseByteSizeOption(const std::string& text) { + return parseByteSizeText(text); +} + int Validate::run(const Options& options) { std::string bdf = resolveBoardBdf(options.bdf, "validate"); unsigned N = options.threads; - if (!checkHostMemoryBudget(options)) { + if (!validatePlacement(options)) { return 1; } - // The HBM bidirectional phase uses 2*N HBM regions (write 0..N-1, read N..2N-1). - // HBM has only 64 regions, so N>32 is unsupportable unless HBM is excluded. 
- static constexpr unsigned HBM_REGIONS = 64; - if (!options.ddrOnly && 2 * N > HBM_REGIONS) { - std::cerr << "validate: --threads > " << (HBM_REGIONS / 2) - << " requires --ddr-only (bidirectional HBM uses 2*N HBM regions, only " - << HBM_REGIONS << " exist)" << std::endl; + if (!checkHostMemoryBudget(options)) { return 1; } @@ -900,7 +1352,7 @@ int Validate::run(const Options& options) { std::vector hbmBuffers; hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - hbmBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); + hbmBuffers.push_back(openValidateHbmBuffer(device, options, i)); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -917,8 +1369,8 @@ int Validate::run(const Options& options) { hbmWriteBuffers.reserve(N); hbmReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - hbmReadBuffers.push_back(device.openHbmBuffer(2 * i, BUFFER_SIZE)); - hbmWriteBuffers.push_back(device.openHbmBuffer(2 * i + 1, BUFFER_SIZE)); + hbmReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i)); + hbmWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1)); } testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ""); @@ -933,7 +1385,7 @@ int Validate::run(const Options& options) { std::vector ddrBuffers; ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - ddrBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + ddrBuffers.push_back(openValidateDdrBuffer(device, options, i)); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -950,8 +1402,13 @@ int Validate::run(const Options& options) { ddrWriteBuffers.reserve(N); ddrReadBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { - ddrWriteBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); - ddrReadBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + if (options.placementExplicit) { + ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i)); + ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1)); + } else { + 
ddrWriteBuffers.push_back(openValidateDdrBuffer(device, options, i)); + ddrReadBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } } testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ""); @@ -965,10 +1422,10 @@ int Validate::run(const Options& options) { std::vector parBuffers; parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { - parBuffers.push_back(device.openHbmBuffer(i, BUFFER_SIZE)); + parBuffers.push_back(openValidateHbmBuffer(device, options, i)); } for (unsigned i = 0; i < N; ++i) { - parBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + parBuffers.push_back(openValidateDdrBuffer(device, options, i)); } testBandwidthSuite(parBuffers, "HBM+DDR", ""); @@ -980,12 +1437,17 @@ int Validate::run(const Options& options) { parWriteBuffers.reserve(2 * N); parReadBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { - parReadBuffers.push_back(device.openHbmBuffer(2 * i, BUFFER_SIZE)); - parWriteBuffers.push_back(device.openHbmBuffer(2 * i + 1, BUFFER_SIZE)); + parReadBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i)); + parWriteBuffers.push_back(openValidateHbmBuffer(device, options, 2 * i + 1)); } for (unsigned i = 0; i < N; ++i) { - parWriteBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); - parReadBuffers.push_back(device.openDdrBuffer(BUFFER_SIZE)); + if (options.placementExplicit) { + parReadBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i)); + parWriteBuffers.push_back(openValidateDdrBuffer(device, options, 2 * i + 1)); + } else { + parWriteBuffers.push_back(openValidateDdrBuffer(device, options, i)); + parReadBuffers.push_back(openValidateDdrBuffer(device, options, i)); + } } testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ""); diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index 810d53fa..e230b87b 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -29,6 +29,7 @@ /// parallel bandwidth measurements. 
Raw transfer modes skip reset and bypass /// the default VRTD buffer path. +#include #include /// @brief Static entry-point for the validate command. @@ -40,6 +41,20 @@ class Validate { public: /// @brief Options parsed from the CLI for the validate command. struct Options { + /// @brief How raw-transfer buffers map QDMA MM/NoC channels onto memory. + /// + /// On CPM5 the host-side NoC ingress port (NMU) is selected per queue by + /// the SW-context mm-channel/host_id (SLASH uses qid&1), while the + /// memory-side NoC egress endpoint (NSU / pseudo-channel) is selected by + /// the device address. Sustaining both NMUs requires also spreading + /// across two NSUs; otherwise both ports converge on one memory endpoint + /// and bandwidth caps at a single path. This mirrors the off-the-shelf + /// dma-perf knobs offset_ch0/offset_ch1. + enum class ChannelAllocation { + Auto, ///< Interleaved: driver picks mm-channel (qid&1), addresses linear. Default; current behaviour. + Paired, ///< Couple mm-channel to a distinct memory region: even positions -> region 0, odd -> region 1. + }; + std::string bdf; ///< BDF (Bus:Device.Function) address of the target device. unsigned threads = 8; ///< Number of parallel buffers/threads (1-64). bool noReset = false; ///< Skip the device reset step before running memory tests. @@ -47,12 +62,29 @@ class Validate { bool hbmOnly = false; ///< Skip DDR phase (mutually exclusive with ddrOnly). bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. + uint64_t bufferSize = 512ULL * 1024ULL * 1024ULL; ///< Size of each test buffer. + uint64_t offset = 512ULL * 1024ULL * 1024ULL; ///< Distance between logical buffer positions. + uint64_t startingOffset = 0; ///< Offset from memory-space base for position 0. + bool placementExplicit = false; ///< True when any placement option was provided. 
+ /// Raw-transfer NoC channel/memory placement strategy (raw modes only). + ChannelAllocation channelAllocation = ChannelAllocation::Auto; + /// Paired-mode byte distance between the two per-channel memory regions + /// (the NSU / pseudo-channel stride). Default 16 GiB == MEMORY_SPACE_SIZE/2, + /// which matches the dma-perf HBM offset_ch1-offset_ch0 spacing. + uint64_t channelRegionStride = 16ULL * 1024ULL * 1024ULL * 1024ULL; + /// Number of whole-buffer transfers per buffer in raw bandwidth phases. + uint64_t bandwidthIterations = 1; + /// Raw bandwidth phase duration in seconds. 0 means use fixed iterations. + double bandwidthDuration = 0.0; }; /// @brief Executes the validate command. /// @param options Populated options struct. /// @return Exit code (0 on success). static int run(const Options& options); + + /// @brief Parse a byte-size option accepting bare values and k/K/m/M suffixes. + static uint64_t parseByteSizeOption(const std::string& text); }; #endif // SMI_VALIDATE_HPP From fe8072f6f34f20e33da77face70290ea4d3f0917 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 10 Jun 2026 11:16:55 +0100 Subject: [PATCH 09/23] docs: document new validate placement and bandwidth options Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/smi/commands.rst | 80 ++++++++++++++++++++++++++++++--- smi/README.md | 36 ++++++++++++--- 2 files changed, 102 insertions(+), 14 deletions(-) diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index 989803b3..6a7828eb 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -160,7 +160,7 @@ phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. 
code-block:: text - v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] Requirements by mode: @@ -180,12 +180,40 @@ Requirements by mode: .. option:: -j, --threads Number of parallel buffers/threads for the validation test (1–64, default 8). - Each buffer is 512 MB (one HBM/DDR allocator region). The bidirectional HBM - phase uses ``2 * N`` HBM regions, so values above 32 require ``--ddr-only``. - The largest phase maps up to ``4 * N * 512 MB`` of host buffers when both - HBM and DDR are enabled, or ``2 * N * 512 MB`` with ``--ddr-only`` or - ``--hbm-only``; the command fails early if that exceeds currently available - host memory. + Bidirectional phases use ``2 * N`` logical positions in each enabled memory + space. + +.. option:: --buffer-size + + Size of each test buffer. Values may be bare bytes or use ``k``/``K`` or + ``m``/``M`` suffixes. The default and maximum are ``512M``. Values must be + 4 KiB-aligned. + +.. option:: --offset + + Distance between logical buffer positions. The default is ``512M``. Values + may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes, must be + 4 KiB-aligned, and must be at least ``--buffer-size`` so buffers do not + overlap. + +.. option:: --starting-offset + + Offset from each memory-space base for logical position 0. The default is + ``0``. Values may be bare bytes or use ``k``/``K`` or ``m``/``M`` suffixes + and must be 4 KiB-aligned. + +Buffers are placed at ``memory_base + starting_offset + position * offset``. +Single-direction phases use positions ``0..N-1``. Bidirectional phases use +positions ``0..2N-1`` with reads on even positions and writes on odd positions. 
+The full range must remain inside the 64 x 512 MB DDR/HBM address space. If any +placement option is specified in default VRTD mode, ``validate`` uses raw VRTD +buffers so the exact addresses are honored; this requires raw memory access +permission. + +The largest phase maps up to ``4 * N * buffer-size`` of host buffers when both +HBM and DDR are enabled, or ``2 * N * buffer-size`` with ``--ddr-only`` or +``--hbm-only``; the command fails early if that exceeds currently available +host memory. .. option:: -R, --no-reset @@ -221,6 +249,44 @@ Requirements by mode: Run only the HBM memory tests and skip the DDR phase. Mutually exclusive with ``--ddr-only``. +.. option:: --channel-allocation <auto|paired> + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``) control + over how QDMA MM/NoC channels map onto device memory. On CPM5 the host-side + NoC ingress port (NMU) is chosen per queue by the SW-context + mm-channel/host_id (SLASH uses ``qid & 1``), while the memory-side NoC egress + endpoint (NSU / pseudo-channel) is chosen by the device address. Default + ``auto`` keeps the historical behaviour: channel ``qid & 1`` with linear + addressing, so both NMUs can converge on a single NSU and bandwidth caps at + one path. ``paired`` couples the two: even positions land in memory region 0 + on channel 0, odd positions in region 1 on channel 1 (one + ``--channel-region-stride`` apart), giving two independent NMU->NSU paths. + This mirrors the off-the-shelf ``dma-perf`` ``offset_ch0``/``offset_ch1`` + knobs and is the placement that lets both NoC ports contribute bandwidth. + +.. option:: --channel-region-stride <bytes> + + In ``--channel-allocation paired`` mode, the byte distance between the two + per-channel memory regions (the NSU / pseudo-channel stride). Default ``16G`` + (== half the per-memory address space, matching the dma-perf HBM + ``offset_ch1 - offset_ch0`` spacing). Must be a non-zero multiple of 4 KiB.
+ Accepts bare bytes or ``k``/``K``, ``m``/``M``, ``g``/``G`` suffixes. + +.. option:: --bandwidth-iterations + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). Repeat + each whole-buffer transfer in every bandwidth phase ``N`` times and report + bandwidth over the sustained loop. The default is ``1``, which preserves the + historical one-shot measurement. + +.. option:: --bandwidth-duration + + Raw-transfer-only duration mode. When non-zero, each bandwidth phase repeats + whole-buffer transfers until the requested wall-clock duration has elapsed + and counts only completed transfers. This is useful for comparing SLASH's raw + path against long-running tools such as ``dma-perf``. A value of ``0`` uses + ``--bandwidth-iterations`` instead. + debug ----- diff --git a/smi/README.md b/smi/README.md index 16fb6beb..4c011cad 100644 --- a/smi/README.md +++ b/smi/README.md @@ -183,20 +183,27 @@ bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer path for data movement. ``` -v80-smi validate -d [-j ] [-R] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] +v80-smi validate -d [-j ] [-R] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] ``` | Flag | Description | |-------------------|------------------------------------------------------| | `-d,--device` | Board address (required), e.g. `03:00` or `0000:03:00` | -| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional HBM needs `2 * threads` HBM regions, so values above 32 require `--ddr-only`. | +| `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 * threads` logical positions in each enabled memory space. 
| | `-R,--no-reset` | Skip the device reset step before running memory tests | +| `--buffer-size` | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) | +| `--offset` | Distance between logical buffer positions (default `512M`) | +| `--starting-offset` | Offset from each memory-space base for logical position 0 (default `0`) | | `--raw-transfer-test` | Use libslash raw QDMA transfers instead of VRTD buffers; implies `--no-reset` | | `--use-qdma-driver` | Run the raw transfer test over the off-the-shelf Xilinx QDMA driver instead of SLASH; implies `--no-reset`; mutually exclusive with `--raw-transfer-test` | | `--ddr-only` | Run only DDR memory tests (skip HBM); mutually exclusive with `--hbm-only` | | `--hbm-only` | Run only HBM memory tests (skip DDR); mutually exclusive with `--ddr-only` | +| `--channel-allocation` | Raw-transfer-only placement: `auto` (default; mm-channel `qid&1`, linear addressing) or `paired` (couple mm-channel to a distinct memory region/NSU: even positions -> region 0/channel 0, odd -> region 1/channel 1). `paired` mirrors dma-perf `offset_ch0`/`offset_ch1` so both NoC NMUs drive independent memory endpoints. | +| `--channel-region-stride` | In `--channel-allocation paired`, byte distance between the two per-channel regions (NSU stride). Default `16G` (half the per-memory space); accepts `k`/`K`/`m`/`M`/`g`/`G`. | +| `--bandwidth-iterations` | Raw-transfer-only sustained bandwidth mode: repeat each whole-buffer transfer this many times in each bandwidth phase (default `1`). | +| `--bandwidth-duration` | Raw-transfer-only duration mode: repeat whole-buffer transfers until this many seconds have elapsed; `0` disables duration mode and uses `--bandwidth-iterations`. | -Each buffer is 512 MB (one HBM/DDR allocator region). The integrity test +Each buffer defaults to 512 MB (one HBM/DDR allocator region). 
The integrity test writes a pattern, syncs to device, clears host memory, syncs back, and verifies. Each bandwidth phase reports single-direction C2H reads, single-direction H2C writes, @@ -208,10 +215,25 @@ threads for bidirectional tests; it is skipped when `--ddr-only` or VRTD for transfers and opens the board's SLASH QDMA device directly, so the SLASH QDMA driver node must be present. -Each buffer is 512 MB. The largest phase maps up to -`4 x x 512 MB` of host buffers when HBM and DDR are both enabled, -or `2 x x 512 MB` with `--ddr-only` or `--hbm-only`; `validate` -fails early if that footprint exceeds currently available host memory. +Buffers are placed at `memory_base + starting-offset + position * offset`. +The position sequence is `0..N-1` for single-direction phases and `0..2N-1` +for bidirectional phases (reads on even positions, writes on odd positions). +`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned, +`--offset` must be at least `--buffer-size`, and the highest buffer must fit +within the 64 x 512 MB DDR/HBM address space. If any placement option is +specified in default VRTD mode, `validate` uses raw VRTD buffers so the exact +addresses are honored; this requires raw memory access permission. + +The largest phase maps up to `4 x x ` of host buffers +when HBM and DDR are both enabled, or `2 x x ` with +`--ddr-only` or `--hbm-only`; `validate` fails early if that footprint exceeds +currently available host memory. + +Raw transfer modes can repeat the bandwidth phases without changing buffer +placement or page size. `--bandwidth-iterations` repeats each whole-buffer +transfer a fixed number of times, while `--bandwidth-duration` runs each +bandwidth phase for a wall-clock duration and counts completed whole-buffer +transfers. Integrity checks remain one-shot. With `--use-qdma-driver`, the command runs the same raw test over the off-the-shelf Xilinx QDMA driver (`submodules/qdma_drv`) instead of SLASH. 
From bbb568f31c6ef99d09cc26dfb73be9a91a6c245b Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 10 Jun 2026 12:12:56 +0100 Subject: [PATCH 10/23] Added 4k|2M explicit page specification Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/smi/commands.rst | 16 ++- smi/README.md | 10 +- smi/src/qdma_driver_backend.cpp | 5 +- smi/src/qdma_driver_backend.hpp | 3 +- smi/src/raw_transfer.hpp | 64 +++++---- smi/src/smi.cpp | 10 ++ smi/src/validate.cpp | 149 +++++++++++++++----- smi/src/validate.hpp | 12 ++ vrt/vrtd/libvrtd/include/vrtd/vrtd.h | 18 +++ vrt/vrtd/libvrtd/src/buffer.c | 70 +++++---- vrt/vrtd/libvrtd/src/requests.c | 4 + vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp | 11 ++ vrt/vrtd/libvrtdpp/include/vrtd/device.hpp | 31 ++-- vrt/vrtd/libvrtdpp/include/vrtd/session.hpp | 8 +- vrt/vrtd/libvrtdpp/src/device.cpp | 14 +- vrt/vrtd/libvrtdpp/src/session.cpp | 24 ++-- 16 files changed, 322 insertions(+), 127 deletions(-) diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index 6a7828eb..63ce3064 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -160,7 +160,7 @@ phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. code-block:: text - v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--page-size <4k|2m>] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] Requirements by mode: @@ -213,12 +213,24 @@ permission. 
The largest phase maps up to ``4 * N * buffer-size`` of host buffers when both HBM and DDR are enabled, or ``2 * N * buffer-size`` with ``--ddr-only`` or ``--hbm-only``; the command fails early if that exceeds currently available -host memory. +host memory. With ``--page-size 2m`` that footprint is checked against the free +2 MiB hugepage pool instead of general RAM. .. option:: -R, --no-reset Skip the device reset step before running memory tests. +.. option:: --page-size <4k|2m> + + Host staging-buffer page granule used for DMA transfers in every mode + (default VRTD, ``--raw-transfer-test`` and ``--use-qdma-driver``). ``4k`` + (the default) maps the host buffers with regular 4 KiB base pages; ``2m`` + maps them with 2 MiB hugepages. There is no fallback: ``2m`` requires + reserved 2 MiB hugepages and that ``--buffer-size``, ``--offset``, + ``--starting-offset`` (and ``--channel-region-stride`` in paired mode) all be + 2 MiB-aligned, otherwise ``validate`` fails early. Reserve hugepages with, + e.g., ``echo <count> | sudo tee /proc/sys/vm/nr_hugepages``. + .. option:: --raw-transfer-test Use libslash raw QDMA transfers instead of VRTD buffers. This mode implies diff --git a/smi/README.md b/smi/README.md index 4c011cad..facf992b 100644 --- a/smi/README.md +++ b/smi/README.md @@ -183,7 +183,7 @@ bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer path for data movement.
``` -v80-smi validate -d [-j ] [-R] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] +v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] ``` | Flag | Description | @@ -191,6 +191,7 @@ v80-smi validate -d [-j ] [-R] [--buffer-size ] [--offset < | `-d,--device` | Board address (required), e.g. `03:00` or `0000:03:00` | | `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 * threads` logical positions in each enabled memory space. | | `-R,--no-reset` | Skip the device reset step before running memory tests | +| `--page-size` | Host staging-buffer page granule for all backends: `4k` (default; 4 KiB base pages) or `2m` (2 MiB hugepages). No fallback: `2m` needs reserved 2 MiB hugepages and 2 MiB-aligned `--buffer-size`/`--offset`/`--starting-offset` (and `--channel-region-stride` in paired mode). | | `--buffer-size` | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) | | `--offset` | Distance between logical buffer positions (default `512M`) | | `--starting-offset` | Offset from each memory-space base for logical position 0 (default `0`) | @@ -218,9 +219,10 @@ the SLASH QDMA driver node must be present. Buffers are placed at `memory_base + starting-offset + position * offset`. The position sequence is `0..N-1` for single-direction phases and `0..2N-1` for bidirectional phases (reads on even positions, writes on odd positions). 
-`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned, -`--offset` must be at least `--buffer-size`, and the highest buffer must fit -within the 64 x 512 MB DDR/HBM address space. If any placement option is +`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned (or +2 MiB-aligned with `--page-size 2m`), `--offset` must be at least +`--buffer-size`, and the highest buffer must fit within the 64 x 512 MB DDR/HBM +address space. If any placement option is specified in default VRTD mode, `validate` uses raw VRTD buffers so the exact addresses are honored; this requires raw memory access permission. diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp index 1c8c5c27..7cc25440 100644 --- a/smi/src/qdma_driver_backend.cpp +++ b/smi/src/qdma_driver_backend.cpp @@ -480,10 +480,11 @@ std::string QdmaDriverDevice::charDevPath(uint32_t qid) const { } QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, - uint64_t physAddr, uint64_t size) + uint64_t physAddr, uint64_t size, + raw::PageSize pageSize) : device_(&device), qid_(qid), physAddr_(physAddr) { try { - mapping_ = raw::createHostMapping(size, physAddr); + mapping_ = raw::createHostMapping(size, physAddr, pageSize); device_->queueAdd(qid_); queueAdded_ = true; diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp index f4f5634c..381e025e 100644 --- a/smi/src/qdma_driver_backend.hpp +++ b/smi/src/qdma_driver_backend.hpp @@ -105,7 +105,8 @@ class QdmaDriverDevice { /// testBandwidth() templates. 
class QdmaDriverBuffer { public: - QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size); + QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size, + raw::PageSize pageSize); QdmaDriverBuffer(const QdmaDriverBuffer&) = delete; QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete; diff --git a/smi/src/raw_transfer.hpp b/smi/src/raw_transfer.hpp index c067db54..d9edb3c1 100644 --- a/smi/src/raw_transfer.hpp +++ b/smi/src/raw_transfer.hpp @@ -74,6 +74,12 @@ namespace smi::raw { static constexpr uint64_t BASE_TRANSFER_STEP_SIZE = 4ULL * 1024ULL; static constexpr uint64_t HUGE_TRANSFER_STEP_SIZE = 2ULL * 1024ULL * 1024ULL; +/// Host staging-buffer page granule selection for raw transfers. +enum class PageSize { + Base4K, ///< Regular 4 KiB base pages. + Huge2M, ///< 2 MiB hugetlb pages; a mapping failure is fatal (no fallback). +}; + [[noreturn]] inline void throwSystemError(const std::string& message) { throw std::runtime_error(message + ": " + std::strerror(errno)); } @@ -90,33 +96,46 @@ struct HostMapping { uint64_t step = 0; }; -/// Create a host staging buffer for raw transfers, preferring a 2 MiB hugetlb -/// mapping and falling back to a regular (THP-disabled) mapping with 4 KiB -/// transfers. @p physAddrForWarn is only used to make the fallback warning -/// actionable. -inline HostMapping createHostMapping(uint64_t size, uint64_t physAddrForWarn) { +/// Create a host staging buffer for raw transfers using the requested page +/// granule. @p pageSize selects 4 KiB base pages or 2 MiB hugetlb pages; there +/// is no fallback, so a 2 MiB request fails (throws) when hugepages cannot be +/// mapped. @p physAddr is the device address this buffer backs and is only used +/// to make error messages actionable. 
+inline HostMapping createHostMapping(uint64_t size, uint64_t physAddr, PageSize pageSize) { HostMapping mapping; mapping.size = size; - mapping.data = mmap(nullptr, - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, - -1, - 0); - mapping.step = HUGE_TRANSFER_STEP_SIZE; - if (mapping.data != MAP_FAILED) { + if (pageSize == PageSize::Huge2M) { + if ((size % HUGE_TRANSFER_STEP_SIZE) != 0) { + throw std::invalid_argument( + "Raw transfer buffer size must be a multiple of 2 MiB to use 2 MiB pages"); + } + + mapping.data = mmap(nullptr, + size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, + -1, + 0); + if (mapping.data == MAP_FAILED) { + char where[64]; + std::snprintf(where, sizeof(where), " at device 0x%llx", + static_cast<unsigned long long>(physAddr)); + throwSystemError(std::string("Failed to map 2 MiB hugetlb raw transfer host buffer") + + where + " (reserve 2 MiB hugepages or use --page-size 4k)"); + } + mapping.step = HUGE_TRANSFER_STEP_SIZE; return mapping; } - const int hugeErrno = errno; - // MAP_POPULATE is deliberately omitted here. It would pre-fault the whole - // buffer during mmap(), i.e. before the MADV_NOHUGEPAGE below can take - // effect. On hosts with transparent hugepages set to "always", those early - // faults hand back 2 MiB THP compound pages, and MADV_NOHUGEPAGE does not - // split pages that are already faulted in. The driver's strict 4 KiB - // base-page path (slash_qdma_map_user_base_page_to_sgl) then rejects every - // transfer with -EINVAL ("4 KiB transfer is not backed by a base page"). + // PageSize::Base4K: map regular base pages. MAP_POPULATE is deliberately + // omitted: it would pre-fault the whole buffer during mmap(), i.e. before + // the MADV_NOHUGEPAGE below can take effect.
On hosts with transparent + // hugepages set to "always", those early faults hand back 2 MiB THP compound + // pages, and MADV_NOHUGEPAGE does not split pages that are already faulted + // in. The driver's strict 4 KiB base-page path + // (slash_qdma_map_user_base_page_to_sgl) then rejects every transfer with + // -EINVAL ("4 KiB transfer is not backed by a base page"). mapping.data = mmap(nullptr, size, PROT_READ | PROT_WRITE, @@ -149,9 +168,6 @@ inline HostMapping createHostMapping(uint64_t size, uint64_t physAddrForWarn) { } mapping.step = BASE_TRANSFER_STEP_SIZE; - std::cerr << "Warning: 2 MiB hugetlb mmap failed for raw transfer buffer at 0x" - << std::hex << physAddrForWarn << std::dec - << " (errno=" << hugeErrno << "); using 4 KiB transfers" << std::endl; return mapping; } diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index dba4fa9f..bc06787a 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -129,6 +129,16 @@ static int smiMain(int argc, char **argv) { "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u)); validateCommand->add_flag("-R,--no-reset", validateOptions.noReset, "Skip the device reset step before running memory tests"); + const std::map pageSizeMap{ + {"4k", Validate::Options::PageSize::Base4K}, + {"2m", Validate::Options::PageSize::Huge2M}, + }; + validateCommand->add_option("--page-size", validateOptions.pageSize, + "Host staging-buffer page granule for all backends: 4k (4 KiB base pages; default) " + "or 2m (2 MiB hugepages). 
2m requires reserved 2 MiB hugepages and 2 MiB-aligned " + "buffer-size/offsets; the allocation fails with no fallback otherwise.") + ->transform(CLI::CheckedTransformer(pageSizeMap, CLI::ignore_case)) + ->default_str("4k"); addValidateSizeOption("--buffer-size", &validateOptions.bufferSize, "Size of each validate buffer; accepts bytes or k/K/m/M suffixes (max 512M)") ->default_str("512M"); diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 899afab4..a16aaea8 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -94,6 +94,31 @@ static constexpr uint32_t QDMA_DIR_H2C = 0x1; static constexpr uint32_t QDMA_DIR_C2H = 0x2; static constexpr uint32_t QDMA_RING_SZ_IDX = 0; +static constexpr uint64_t HUGE_PAGE_SIZE = 2ULL * 1024ULL * 1024ULL; + +/// Map the validate page-size option to the raw-transfer host mapping mode. +static smi::raw::PageSize rawPageSize(const Validate::Options& options) { + return options.pageSize == Validate::Options::PageSize::Huge2M + ? smi::raw::PageSize::Huge2M + : smi::raw::PageSize::Base4K; +} + +/// Map the validate page-size option to the vrtd host page mode. +static vrtd::HostPageSize vrtdPageSize(const Validate::Options& options) { + return options.pageSize == Validate::Options::PageSize::Huge2M + ? vrtd::HostPageSize::Huge2M + : vrtd::HostPageSize::Base4K; +} + +/// Required alignment for placement sizes/offsets given the selected page +/// granule: 2 MiB when hugepages are requested, otherwise the QDMA transfer +/// alignment (4 KiB). +static uint64_t requiredAlignment(const Validate::Options& options) { + return options.pageSize == Validate::Options::PageSize::Huge2M + ? 
HUGE_PAGE_SIZE + : TRANSFER_ALIGNMENT; +} + static std::string trim(std::string_view text) { size_t first = 0; while (first < text.size() && @@ -161,9 +186,9 @@ static bool isAligned(uint64_t value, uint64_t alignment) { return (value % alignment) == 0; } -static bool checkAligned(const char* name, uint64_t value) { - if (!isAligned(value, TRANSFER_ALIGNMENT)) { - std::cerr << "validate: " << name << " must be " << TRANSFER_ALIGNMENT +static bool checkAligned(const char* name, uint64_t value, uint64_t alignment) { + if (!isAligned(value, alignment)) { + std::cerr << "validate: " << name << " must be " << alignment << "-byte aligned" << std::endl; return false; } @@ -215,9 +240,10 @@ static bool checkMemoryPlacementRangePaired(const char* memoryName, } const uint64_t stride = pairedRegionStride(options); - if (stride == 0 || (stride % TRANSFER_ALIGNMENT) != 0) { + const uint64_t alignment = requiredAlignment(options); + if (stride == 0 || (stride % alignment) != 0) { std::cerr << "validate: --channel-region-stride must be a non-zero multiple of " - << TRANSFER_ALIGNMENT << " bytes" << std::endl; + << alignment << " bytes" << std::endl; return false; } if (stride > MEMORY_SPACE_SIZE) { @@ -266,9 +292,10 @@ static bool validatePlacement(const Validate::Options& options) { std::cerr << "validate: --offset must be greater than zero" << std::endl; return false; } - if (!checkAligned("--buffer-size", options.bufferSize) || - !checkAligned("--offset", options.offset) || - !checkAligned("--starting-offset", options.startingOffset)) { + const uint64_t alignment = requiredAlignment(options); + if (!checkAligned("--buffer-size", options.bufferSize, alignment) || + !checkAligned("--offset", options.offset, alignment) || + !checkAligned("--starting-offset", options.startingOffset, alignment)) { return false; } if (options.offset < options.bufferSize) { @@ -343,11 +370,44 @@ static void printChannelAllocation(const Validate::Options& options) { } } +/// Print which host 
staging-buffer page granule is in effect. +static void printPageSize(const Validate::Options& options) { + std::cout << "Host page size: " + << (options.pageSize == Validate::Options::PageSize::Huge2M + ? "2 MiB hugepages" + : "4 KiB base pages") + << std::endl; +} + static bool checkHostMemoryBudget(const Validate::Options& options) { const uint64_t maxConcurrentBuffers = (!options.ddrOnly && !options.hbmOnly) ? 4ULL * options.threads : 2ULL * options.threads; const uint64_t requiredBytes = maxConcurrentBuffers * options.bufferSize; + + if (options.pageSize == Validate::Options::PageSize::Huge2M) { + // 2 MiB hugepages are reserved separately from general RAM, so check + // the hugetlb pool rather than _SC_AVPHYS_PAGES. + const uint64_t needed = (requiredBytes + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE; + std::ifstream freeFile("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"); + uint64_t freePages = 0; + if (!freeFile.is_open() || !(freeFile >> freePages)) { + std::cerr << "Warning: unable to read 2 MiB hugepage availability; " + << "validate needs about " << needed << " free 2 MiB hugepages." + << std::endl; + return true; + } + if (freePages < needed) { + std::cerr << "validate: --page-size 2m needs about " << needed + << " free 2 MiB hugepages, but only " << freePages + << " are free. Reserve more (e.g. 'echo " << needed + << " | sudo tee /proc/sys/vm/nr_hugepages') or reduce --threads/--buffer-size." 
+ << std::endl; + return false; + } + return true; + } + const long pageSize = sysconf(_SC_PAGESIZE); const long availablePages = sysconf(_SC_AVPHYS_PAGES); @@ -481,8 +541,9 @@ class RawQdmaDevice { class RawTransferBuffer { public: - RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size) - : qdma_{qdma}, physAddr_{physAddr}, size_{size} { + RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size, + smi::raw::PageSize pageSize) + : qdma_{qdma}, physAddr_{physAddr}, size_{size}, pageSize_{pageSize} { try { createHostMapping(); createQpair(); @@ -540,6 +601,7 @@ class RawTransferBuffer { physAddr_ = other.physAddr_; size_ = other.size_; transferStepSize_ = other.transferStepSize_; + pageSize_ = other.pageSize_; other.qdma_ = nullptr; other.fd_ = -1; @@ -553,7 +615,7 @@ class RawTransferBuffer { } void createHostMapping() { - smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_); + smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_, pageSize_); data_ = mapping.data; transferStepSize_ = mapping.step; } @@ -624,6 +686,7 @@ class RawTransferBuffer { uint64_t physAddr_ = 0; uint64_t size_ = 0; uint64_t transferStepSize_ = 0; + smi::raw::PageSize pageSize_ = smi::raw::PageSize::Base4K; }; /// Fill @p buf with a deterministic pattern seeded by @p seed. 
@@ -997,10 +1060,12 @@ static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device, uint64_t position) { if (options.placementExplicit) { return device.openRawBuffer(addressFor(HBM_BASE, options, position), - options.bufferSize); + options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdPageSize(options)); } - return device.openHbmBuffer(static_cast(position), options.bufferSize); + return device.openHbmBuffer(static_cast(position), options.bufferSize, + vrtd::BufferAllocDir::Bidirectional, vrtdPageSize(options)); } static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, @@ -1008,11 +1073,13 @@ static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, uint64_t position) { if (options.placementExplicit) { return device.openRawBuffer(addressFor(DDR_BASE, options, position), - options.bufferSize); + options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdPageSize(options)); } (void)position; - return device.openDdrBuffer(options.bufferSize); + return device.openDdrBuffer(options.bufferSize, vrtd::BufferAllocDir::Bidirectional, + vrtdPageSize(options)); } static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { @@ -1027,6 +1094,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o const std::string qdmaPath = resolveQdmaDevicePath(bdf); std::cout << "Using raw QDMA device " << qdmaPath << "..." 
<< std::endl; printChannelAllocation(options); + printPageSize(options); printBandwidthRepeatMode(repeat); RawQdmaDevice qdma(qdmaPath); @@ -1038,7 +1106,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -1058,10 +1126,10 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); hbmWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA", repeat); @@ -1075,7 +1143,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -1095,10 +1163,10 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); ddrWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA", repeat); @@ -1111,11 +1179,11 @@ static int runRawTransferTest(const std::string& bdf, const 
Validate::Options& o parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA", repeat); @@ -1132,18 +1200,18 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA", repeat); @@ -1177,6 +1245,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." 
<< std::endl; printChannelAllocation(options); + printPageSize(options); printBandwidthRepeatMode(repeat); smi::qdma_driver::QdmaDriverDevice qdma(bdf); std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl; @@ -1197,7 +1266,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -1215,10 +1284,10 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); hbmWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver", repeat); @@ -1232,7 +1301,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -1250,10 +1319,10 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); ddrWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver", 
repeat); @@ -1266,11 +1335,11 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver", repeat); @@ -1283,18 +1352,18 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); parWriteBuffers.emplace_back(qdma, 2 * N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize); + options.bufferSize, rawPageSize(options)); parWriteBuffers.emplace_back(qdma, 3 * N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize); + options.bufferSize, rawPageSize(options)); } testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver", repeat); @@ -1345,6 +1414,8 @@ int Validate::run(const Options& options) { vrtd::Session session; auto device = session.getDeviceByBdf(bdf); + printPageSize(options); + // -- Step 2: HBM — integrity then bandwidth -- if (!options.ddrOnly) { std::cout << "Testing HBM data integrity (" << N << " regions)..." 
<< std::endl; diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index e230b87b..4c9f7db1 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -55,6 +55,17 @@ class Validate { Paired, ///< Couple mm-channel to a distinct memory region: even positions -> region 0, odd -> region 1. }; + /// @brief Host staging-buffer page granule used for DMA transfers. + /// + /// Selects how the host-side buffer is mapped for every backend (VRTD, + /// raw SLASH, and the off-the-shelf QDMA driver). 2 MiB requires + /// reserved hugepages plus 2 MiB-aligned sizes/addresses; the allocation + /// fails with no fallback otherwise. + enum class PageSize { + Base4K, ///< 4 KiB base pages (default). + Huge2M, ///< 2 MiB hugepages. + }; + std::string bdf; ///< BDF (Bus:Device.Function) address of the target device. unsigned threads = 8; ///< Number of parallel buffers/threads (1-64). bool noReset = false; ///< Skip the device reset step before running memory tests. @@ -62,6 +73,7 @@ class Validate { bool hbmOnly = false; ///< Skip DDR phase (mutually exclusive with ddrOnly). bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. + PageSize pageSize = PageSize::Base4K; ///< Host staging-buffer page granule (4 KiB or 2 MiB). uint64_t bufferSize = 512ULL * 1024ULL * 1024ULL; ///< Size of each test buffer. uint64_t offset = 512ULL * 1024ULL * 1024ULL; ///< Distance between logical buffer positions. uint64_t startingOffset = 0; ///< Offset from memory-space base for position 0. diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index d7721d49..53467bd3 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -54,6 +54,19 @@ extern "C" { struct vrtd_buffer; +/** + * @brief Host staging-buffer page granule for DMA buffers. 
+ * + * Selects how the client-side host buffer backing a DMA transfer is mapped. + * libvrtd mmaps the host buffer locally, so this is a client-local concept and + * is never sent to the daemon. There is no automatic fallback: requesting + * #VRTD_HOST_PAGE_2M fails the allocation when 2 MiB hugepages are unavailable. + */ +enum vrtd_host_page_mode { + VRTD_HOST_PAGE_4K = 0, ///< Regular 4 KiB base pages (transparent hugepages disabled). + VRTD_HOST_PAGE_2M = 1, ///< 2 MiB hugetlb pages; allocation fails if they cannot be mapped. +}; + /** * @brief Connect to the vrtd UNIX domain socket. @@ -325,6 +338,7 @@ enum vrtd_ret vrtd_qdma_qpair_get_fd( * @param alloc_dir QDMA direction (one of enum vrtd_alloc_dir). * @param alloc_arg Allocation argument (HBM region index for HBM). * @param size_in Requested size in bytes. + * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output pointer to receive the allocated buffer handle. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. @@ -338,6 +352,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); @@ -352,6 +367,7 @@ enum vrtd_ret vrtd_buffer_open( * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. * @param alloc_dir One of #vrtd_alloc_dir. + * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output parameter set to the new buffer handle on success. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. 
@@ -364,6 +380,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); @@ -529,6 +546,7 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t size, uint64_t phys_addr, int qpair_fd, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index b85ef588..cd3babd0 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -24,10 +24,11 @@ * DMA buffer lifecycle management for the vrtd C client library. * * Buffers are host-side memory regions used for DMA transfers to/from - * the FPGA. Each buffer is backed by an anonymous mmap (preferring - * 2 MB hugepages for TLB efficiency, with automatic fallback to - * regular pages) and associated with a QDMA queue pair fd for - * performing the actual H2C / C2H transfers. + * the FPGA. Each buffer is backed by an anonymous mmap whose page granule + * (4 KiB base pages or 2 MiB hugepages) is selected explicitly by the caller + * via enum vrtd_host_page_mode -- there is no automatic fallback -- and + * associated with a QDMA queue pair fd for performing the actual H2C / C2H + * transfers. * * Sync operations (sync_to_device / sync_from_device) accept arbitrary * in-buffer ranges. Internally, the QDMA fd requires page-aligned transfer @@ -199,6 +200,7 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t size, uint64_t phys_addr, int qpair_fd, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { if (buffer_out == NULL) { @@ -213,8 +215,19 @@ enum vrtd_ret vrtd_buffer_create_raw( buffer->buf = MAP_FAILED; buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; - if ((size % HUGE_TRANSFER_STEP_SIZE) == 0 && - (phys_addr % HUGE_TRANSFER_STEP_SIZE) == 0) { + if (page_mode == VRTD_HOST_PAGE_2M) { + /* + * Explicit 2 MiB hugetlb request: there is no fallback. 
The DMA + * granule and the device address must both be 2 MiB aligned, and the + * hugetlb mapping must succeed, otherwise the allocation fails so the + * caller can react instead of silently transferring over 4 KiB pages. + */ + if ((size % HUGE_TRANSFER_STEP_SIZE) != 0 || + (phys_addr % HUGE_TRANSFER_STEP_SIZE) != 0) { + free(buffer); + return VRTD_RET_INVALID_ARGUMENT; + } + buffer->buf = mmap( NULL, /* address (let the kernel choose) */ size, @@ -223,39 +236,46 @@ enum vrtd_ret vrtd_buffer_create_raw( -1, /* fd */ 0 /* offset */ ); - if (buffer->buf != MAP_FAILED) { - buffer->transfer_step_size = HUGE_TRANSFER_STEP_SIZE; - } - } - - if (buffer->buf == MAP_FAILED) { - int huge_errno = errno; - // Huge pages are an optimization, not a hard requirement. - // Fall back to normal anonymous mapping when hugepage mmap fails. Do - // not use MAP_POPULATE before MADV_NOHUGEPAGE: THP=always can fault - // compound pages before the advice takes effect, and the kernel QDMA - // base-page path intentionally rejects those pages. 
- int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); - if (mmap_ret != 0) { + if (buffer->buf == MAP_FAILED) { + int huge_errno = errno; + syslog( + LOG_ERR, + "libvrtd: 2 MiB hugetlb mapping failed for buffer size=%llu phys_addr=0x%llx errno=%d; " + "reserve 2 MiB hugepages or request 4 KiB pages", + (unsigned long long)size, + (unsigned long long)phys_addr, + huge_errno + ); free(buffer); return VRTD_RET_INTERNAL_ERROR; } - buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; + buffer->transfer_step_size = HUGE_TRANSFER_STEP_SIZE; #if SLASH_QDMA_TIMING syslog( LOG_INFO, - "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu huge_errno=%d", + "libvrtd: buffer host mapping path=hugetlb-2m size=%llu phys_addr=0x%llx step=%llu", (unsigned long long)size, (unsigned long long)phys_addr, - (unsigned long long)buffer->transfer_step_size, - huge_errno + (unsigned long long)buffer->transfer_step_size ); #endif } else { + /* + * Explicit 4 KiB base-page request. Do not use MAP_POPULATE before + * MADV_NOHUGEPAGE: THP=always can fault compound pages before the + * advice takes effect, and the kernel QDMA base-page path intentionally + * rejects those pages (vrtd_mmap_regular_base_pages handles this). 
+ */ + int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); + if (mmap_ret != 0) { + free(buffer); + return VRTD_RET_INTERNAL_ERROR; + } + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; #if SLASH_QDMA_TIMING syslog( LOG_INFO, - "libvrtd: buffer host mapping path=hugetlb-2m size=%llu phys_addr=0x%llx step=%llu", + "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu", (unsigned long long)size, (unsigned long long)phys_addr, (unsigned long long)buffer->transfer_step_size diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index b03c863a..5f6b9357 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -468,6 +468,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { @@ -507,6 +508,7 @@ enum vrtd_ret vrtd_buffer_open( resp.size, resp.phys_addr, qpair_fd, + page_mode, buffer_out ); if (ret != VRTD_RET_OK) { @@ -523,6 +525,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { @@ -561,6 +564,7 @@ enum vrtd_ret vrtd_buffer_open_raw( size, phys_addr, qpair_fd, + page_mode, buffer_out ); if (ret != VRTD_RET_OK) { diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp index 8748d379..254eb660 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp @@ -48,6 +48,17 @@ enum class BufferAllocDir : uint32_t { DeviceToHost = VRTD_ALLOC_DIR_DEVICE_TO_HOST, }; +/** + * @brief Host staging-buffer page granule for a buffer's DMA mapping. + * + * Mirrors @c vrtd_host_page_mode (values must stay in sync). @c Huge2M fails + * the allocation, with no fallback, when 2 MiB hugepages cannot be mapped. 
+ */ +enum class HostPageSize : uint32_t { + Base4K = 0, ///< Regular 4 KiB base pages. + Huge2M = 1, ///< 2 MiB hugetlb pages; allocation fails if unavailable. +}; + /** * @brief RAII wrapper for a vrtd buffer allocation. * diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp index 8123a7f1..7695d975 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp @@ -158,19 +158,22 @@ class Device { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. + * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ Buffer openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg = 0, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const; + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + HostPageSize page = HostPageSize::Base4K) const; /** * @brief Convenience helper for DDR allocations. */ - Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::Ddr, size, 0, allocDir); + Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + HostPageSize page = HostPageSize::Base4K) const { + return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, page); } /** @@ -178,16 +181,18 @@ class Device { */ Buffer openHbmBuffer(uint32_t region, uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::Hbm, size, region, allocDir); + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + HostPageSize page = HostPageSize::Base4K) const { + return openBuffer(BufferAllocType::Hbm, size, region, allocDir, page); } /** * @brief Convenience helper for HBM VNOC allocations. 
*/ Buffer openHbmVnocBuffer(uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const { - return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir); + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + HostPageSize page = HostPageSize::Base4K) const { + return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, page); } /** @@ -199,12 +204,14 @@ class Device { * @param phys_addr Device physical address. * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ Buffer openRawBuffer(uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir = BufferAllocDir::Bidirectional) const; + BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + HostPageSize page = HostPageSize::Base4K) const; /** * @brief Perform a PCIe hotplug operation for this device. @@ -351,8 +358,8 @@ class Device { uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -370,8 +377,8 @@ class Device { std::function fGetBar; std::function fCreateQdmaQpair; - std::function fOpenBuffer; - std::function fOpenBufferRaw; + std::function fOpenBuffer; + std::function fOpenBufferRaw; std::function fHotplugOp; std::function fDesignWrite; std::function fDesignWriteFile; diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp index e2dbbac2..0bc86acf 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp @@ -190,6 +190,7 @@ class Session { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. 
+ * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ @@ -198,7 +199,8 @@ class Session { BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir + BufferAllocDir allocDir, + HostPageSize pageSize ) const; /** @@ -208,6 +210,7 @@ class Session { * @param phys_addr Caller-specified device physical address (bypasses allocator). * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ @@ -215,7 +218,8 @@ class Session { const Device& device, uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir + BufferAllocDir allocDir, + HostPageSize pageSize ) const; /** diff --git a/vrt/vrtd/libvrtdpp/src/device.cpp b/vrt/vrtd/libvrtdpp/src/device.cpp index f45cda24..c4ae8d7a 100644 --- a/vrt/vrtd/libvrtdpp/src/device.cpp +++ b/vrt/vrtd/libvrtdpp/src/device.cpp @@ -31,8 +31,8 @@ Device::Device(uint32_t num, uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -97,14 +97,16 @@ QdmaQpair Device::createQdmaQpair(const struct slash_qdma_qpair_add& cfg) const Buffer Device::openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir) const { - return fOpenBuffer(*this, allocType, size, allocArg, allocDir); + BufferAllocDir allocDir, + HostPageSize page) const { + return fOpenBuffer(*this, allocType, size, allocArg, allocDir, page); } Buffer Device::openRawBuffer(uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir) const { - return fOpenBufferRaw(*this, phys_addr, size, allocDir); + BufferAllocDir allocDir, + HostPageSize page) const { 
+ return fOpenBufferRaw(*this, phys_addr, size, allocDir, page); } void Device::hotplugOp(HotplugOp op, uint8_t function) const { diff --git a/vrt/vrtd/libvrtdpp/src/session.cpp b/vrt/vrtd/libvrtdpp/src/session.cpp index d2e69fd9..8fcf42b6 100644 --- a/vrt/vrtd/libvrtdpp/src/session.cpp +++ b/vrt/vrtd/libvrtdpp/src/session.cpp @@ -132,11 +132,11 @@ Device Session::getDevice(size_t i) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir) { - return openBuffer(device, type, size, arg, dir); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, HostPageSize page) { + return openBuffer(device, type, size, arg, dir, page); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) { - return openBufferRaw(device, phys_addr, size, dir); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, HostPageSize page) { + return openBufferRaw(device, phys_addr, size, dir, page); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -197,11 +197,11 @@ Device Session::getDeviceByBdf(std::string_view bdf) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir) { - return openBuffer(device, type, size, arg, dir); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, HostPageSize page) { + return 
openBuffer(device, type, size, arg, dir, page); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir) { - return openBufferRaw(device, phys_addr, size, dir); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, HostPageSize page) { + return openBufferRaw(device, phys_addr, size, dir, page); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -289,7 +289,8 @@ Buffer Session::openBuffer( BufferAllocType allocType, uint64_t size, uint64_t allocArg, - BufferAllocDir allocDir + BufferAllocDir allocDir, + HostPageSize pageSize ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -304,6 +305,7 @@ Buffer Session::openBuffer( static_cast(allocDir), allocArg, size, + static_cast(static_cast(pageSize)), &raw ); if (ret != VRTD_RET_OK) { @@ -321,7 +323,8 @@ Buffer Session::openBufferRaw( const Device& device, uint64_t phys_addr, uint64_t size, - BufferAllocDir allocDir + BufferAllocDir allocDir, + HostPageSize pageSize ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -335,6 +338,7 @@ Buffer Session::openBufferRaw( phys_addr, size, static_cast(allocDir), + static_cast(static_cast(pageSize)), &raw ); if (ret != VRTD_RET_OK) { From 5bd53456b864f6a9b93e0e14823403b24a17be52 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:05:44 +0100 Subject: [PATCH 11/23] driver: add qdma registered-buffer abi with pinned, pre-dma-mapped transfers Signed-off-by: Vlad-Gabriel Serbu --- .../include/slash/uapi/slash_interface.h | 99 +++ driver/slash_qdma.c | 720 ++++++++++++++++-- 2 files changed, 735 insertions(+), 84 deletions(-) diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index bbe6908d..a731610d 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ 
b/driver/libslash/include/slash/uapi/slash_interface.h @@ -153,6 +153,19 @@ struct slash_qdma_info { __u32 caps; /**< [out] Capability bitmask. */ }; +/** + * @brief AXI-MM / NoC channel selection for a queue pair. + * + * Selects which CPM5 AXI-MM channel a queue pair uses. libqdma mirrors the + * channel into the SW-context host_id, which selects the programmed Host + * Profile and hence the NoC channel. + */ +enum slash_qdma_mm_channel { + SLASH_QDMA_MM_CHANNEL_AUTO = 0, /**< Stripe across channels by (qid & 1). */ + SLASH_QDMA_MM_CHANNEL_0 = 1, /**< Pin to AXI-MM/NoC channel 0. */ + SLASH_QDMA_MM_CHANNEL_1 = 2, /**< Pin to AXI-MM/NoC channel 1. */ +}; + /** * @brief Add (allocate) a new QDMA queue pair. * @@ -176,6 +189,7 @@ struct slash_qdma_qpair_add { /* Userspace to kernel */ __u32 mode; /**< [in] Queue operating mode. */ __u32 dir_mask; /**< [in] Direction bitmask — which directions to enable. */ + __u32 mm_channel; /**< [in] AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). */ __u32 h2c_ring_sz; /**< [in] Host-to-card descriptor ring size. */ __u32 c2h_ring_sz; /**< [in] Card-to-host descriptor ring size. */ @@ -228,6 +242,79 @@ struct slash_qdma_qpair_fd_request { __u32 flags; /**< [in] File descriptor flags. Only O_CLOEXEC is honoured. */ }; +/** + * @brief Transfer direction for a registered-buffer DMA transfer. + */ +enum slash_qdma_transfer_dir { + SLASH_QDMA_XFER_H2C = 1, /**< Host-to-Card (write to device). */ + SLASH_QDMA_XFER_C2H = 2, /**< Card-to-Host (read from device). */ +}; + +/** + * @brief Register a host buffer for DMA, pinning its pages once. + * + * The kernel pins the pages backing [user_addr, user_addr + length), + * builds a scatter-gather list, and DMA-maps it once. Subsequent + * transfers reference the buffer by \@buf_id instead of re-pinning and + * re-mapping per transfer. + * + * \@user_addr must be page-aligned and \@length a non-zero multiple of + * the host page size. 
The buffer must be backed by a single page + * granule (all 4 KiB base pages or all 2 MiB hugepages), matching the + * transfer data path. + * + * Buffers are owned by the control-fd open instance they are registered + * through, and are automatically unregistered when that fd is closed + * (including on process exit) if userspace forgets to unregister them. + */ +struct slash_qdma_buf_register { + __u32 size; /**< Struct size for ABI versioning. */ + + /* Userspace to kernel */ + __u32 flags; /**< [in] Reserved; must be 0. */ + __u64 user_addr; /**< [in] Page-aligned host buffer base address. */ + __u64 length; /**< [in] Buffer length in bytes (page multiple). */ + + /* Kernel to userspace */ + __u32 buf_id; /**< [out] Kernel-assigned buffer handle. */ + __u32 pad0; /**< Padding for natural alignment. */ +}; + +/** + * @brief Unregister a previously registered buffer. + * + * Removes the buffer from the owning client's lookup table. The pages + * are unpinned and the DMA mapping torn down once no in-flight transfer + * still references the buffer. + */ +struct slash_qdma_buf_unregister { + __u32 size; /**< Struct size for ABI versioning. */ + __u32 buf_id; /**< [in] Buffer handle from slash_qdma_buf_register. */ +}; + +/** + * @brief Perform a DMA transfer using a registered buffer. + * + * Issued on a queue-pair I/O fd (from SLASH_QDMA_IOCTL_QPAIR_GET_FD). + * Transfers \@length bytes between the registered buffer at + * \@buf_offset and the device endpoint address \@dev_addr. The number + * of bytes transferred is returned as the ioctl return value. + * + * \@buf_offset and \@length must be aligned to the registered buffer's + * page granule, and \@buf_offset + \@length must not exceed the + * registered length. \@direction must be one of enum slash_qdma_transfer_dir + * and must be enabled on the queue pair. + */ +struct slash_qdma_transfer { + __u32 size; /**< Struct size for ABI versioning. */ + __u32 buf_id; /**< [in] Registered buffer handle. 
*/ + __u64 buf_offset; /**< [in] Byte offset within the registered buffer. */ + __u64 dev_addr; /**< [in] Device-side (endpoint) address. */ + __u64 length; /**< [in] Number of bytes to transfer. */ + __u32 direction; /**< [in] enum slash_qdma_transfer_dir (H2C or C2H). */ + __u32 pad0; /**< Padding for natural alignment. */ +}; + /** Query QDMA subsystem capabilities. */ #define SLASH_QDMA_IOCTL_INFO _IOWR('v', 0x50, struct slash_qdma_info) @@ -240,4 +327,16 @@ struct slash_qdma_qpair_fd_request { /** Obtain an I/O file descriptor for a queue pair. */ #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request) +/** Register a host buffer (pin + DMA-map once); returns assigned buf_id. */ +#define SLASH_QDMA_IOCTL_BUF_REGISTER _IOWR('v', 0x54, struct slash_qdma_buf_register) + +/** Unregister a previously registered buffer. */ +#define SLASH_QDMA_IOCTL_BUF_UNREGISTER _IOWR('v', 0x55, struct slash_qdma_buf_unregister) + +/** + * Perform a registered-buffer DMA transfer. Issued on a queue-pair I/O + * fd (not the control device); returns the number of bytes transferred. + */ +#define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) + #endif diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index ebdf9949..9b120be7 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -106,6 +107,12 @@ offsetofend(struct slash_qdma_qpair_op, op) #define SLASH_QDMA_QPAIR_GET_FD_MIN_SIZE \ offsetofend(struct slash_qdma_qpair_fd_request, flags) +#define SLASH_QDMA_BUF_REGISTER_MIN_SIZE \ + offsetofend(struct slash_qdma_buf_register, length) +#define SLASH_QDMA_BUF_UNREGISTER_MIN_SIZE \ + offsetofend(struct slash_qdma_buf_unregister, buf_id) +#define SLASH_QDMA_TRANSFER_MIN_SIZE \ + offsetofend(struct slash_qdma_transfer, direction) /* * CPM5 Host Profile indirect-context programming. 
@@ -145,54 +152,6 @@ */ #define SLASH_QDMA_HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) -/* - * qdma_force_mm_channel - Debug/experiment override for AXI-MM/NoC channel - * assignment of newly-added queue pairs. - * - * < 0 : automatic - stripe across channels by (qid & 1) [default] - * 0 : pin every new queue to MM channel 0 (Host Profile 0 / NoC Channel 0) - * 1 : pin every new queue to MM channel 1 (Host Profile 1 / NoC Channel 1) - * - * The value is read when a queue pair is added, so it can be changed at - * runtime via /sys/module/slash/parameters/qdma_force_mm_channel to A/B test - * whether both PCIe NMUs (NoC channels) actually contribute bandwidth: - * - * echo 0 > .../qdma_force_mm_channel # all traffic on NoC channel 0 (S00) - * echo 1 > .../qdma_force_mm_channel # all traffic on NoC channel 1 (S01) - * echo -1 > .../qdma_force_mm_channel # default split (qid & 1) - * - * Affects both the VRTD buffer path and the raw-transfer path (any queue - * created through this driver). It does not affect the off-the-shelf Xilinx - * QDMA driver path. - */ -static int qdma_force_mm_channel = -1; - -static int slash_qdma_force_mm_channel_set(const char *val, - const struct kernel_param *kp) -{ - int parsed; - int err; - - err = kstrtoint(val, 0, &parsed); - if (err) - return err; - - if (parsed < -1 || parsed > 1) - return -EINVAL; - - return param_set_int(val, kp); -} - -static const struct kernel_param_ops slash_qdma_force_mm_channel_ops = { - .set = slash_qdma_force_mm_channel_set, - .get = param_get_int, -}; - -module_param_cb(qdma_force_mm_channel, &slash_qdma_force_mm_channel_ops, - &qdma_force_mm_channel, 0644); -MODULE_PARM_DESC(qdma_force_mm_channel, - "Force QDMA AXI-MM/NoC channel for new queues: <0=auto(qid&1), 0 or 1 to pin (default -1)"); - /* * qdma_huge_desc_size - Experimental descriptor granularity for hugetlb-backed * raw qpair transfers. 
@@ -274,6 +233,17 @@ MODULE_PARM_DESC(qdma_huge_desc_size, */ #define SLASH_QDMA_QPAIR_ID_RANGE XA_LIMIT(0, SLASH_QDMA_MAX_QPAIRS - 1) +/** + * SLASH_QDMA_MAX_BUFS - Maximum number of registered DMA buffers per client. + * + * Each control-fd open instance gets its own buffer-id space; this bounds + * the xarray allocation range used by SLASH_QDMA_IOCTL_BUF_REGISTER. + */ +#define SLASH_QDMA_MAX_BUFS 4096 + +/** XArray allocation range for registered buffer IDs ([0, 4095]). */ +#define SLASH_QDMA_BUF_ID_RANGE XA_LIMIT(0, SLASH_QDMA_MAX_BUFS - 1) + /* * Debug logging infrastructure. * @@ -686,6 +656,7 @@ slash_qdma_qpair_remove(struct slash_qdma_dev *qdma_dev, u32 qid) struct slash_qdma_qpair_file_ctx { struct slash_qdma_dev *qdma_dev; struct slash_qdma_qpair_entry *entry; + struct slash_qdma_client *client; u32 qid; }; @@ -714,6 +685,56 @@ struct slash_qdma_io_cb { struct qdma_request req; }; +/** + * struct slash_qdma_buf - A registered (persistently pinned) host buffer. + * @ref: Reference count. The owning client's xarray holds one ref; + * each in-flight transfer takes a temporary ref so an + * unregister cannot tear the buffer down under active DMA. + * @qdma_dev: Device whose DMA mappings back this buffer (non-owning; the + * owning client holds the device reference). + * @buf_id: Client-scoped handle returned to userspace. + * @length: Registered length in bytes. + * @granule: Bytes per SGL entry (PAGE_SIZE for base pages, or the + * hugepage descriptor size). Uniform across all entries, so + * transfer slices can be computed by simple division. + * @iocb: Pinned pages and prebuilt scatter-gather list. Each entry's + * dma_addr is filled in once at registration so transfers can + * submit with req->dma_mapped = 1. + * + * Registered buffers amortise the per-transfer cost of pinning pages, + * building the SGL, and programming the IOMMU: that work happens once at + * registration, and every transfer reuses the cached, pre-DMA-mapped SGL. 
+ */ +struct slash_qdma_buf { + struct kref ref; + struct slash_qdma_dev *qdma_dev; + u32 buf_id; + u64 length; + u64 granule; + struct slash_qdma_io_cb iocb; +}; + +/** + * struct slash_qdma_client - Per-open state for the QDMA control device. + * @ref: Reference count. The control fd holds the initial ref; each + * qpair I/O fd handed out via QPAIR_GET_FD takes another so that + * handle-based transfers can resolve buffer IDs even if the + * control fd is closed first. + * @qdma_dev: Owning QDMA device (holds a device reference). + * @buffers: XArray mapping buf_id -> &struct slash_qdma_buf. Buffers are + * owned by this client and auto-freed when the control fd closes. + * + * Replaces the bare device pointer previously stored in the control fd's + * file->private_data. Tying registered buffers to this per-open context + * makes cleanup automatic: if userspace exits or is killed without + * unregistering, the control fd release path drops every buffer. + */ +struct slash_qdma_client { + struct kref ref; + struct slash_qdma_dev *qdma_dev; + struct xarray buffers; +}; + /* ───────────────────────────────────────────────────────────────────── * Forward declarations * ───────────────────────────────────────────────────────────────────── */ @@ -755,8 +776,18 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev, const char *op_name, bool stop_on_err); static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, - struct slash_qdma_dev *qdma_dev, + struct slash_qdma_client *client, + void __user *uarg); +static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, + struct slash_qdma_client *client, void __user *uarg); +static int slash_qdma_ioctl_buf_unregister_w(struct miscdevice *misc, + struct slash_qdma_client *client, + void __user *uarg); +static void slash_qdma_buf_release(struct kref *ref); +static void slash_qdma_buf_put(struct slash_qdma_buf *buf); +static void slash_qdma_client_release(struct kref *ref); +static long 
slash_qdma_qpair_transfer(struct file *file, void __user *uarg); static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf, size_t count, loff_t *ppos); @@ -1703,14 +1734,18 @@ static void slash_qdma_conf_options(struct qdma_dev_conf *conf, struct pci_dev * */ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned long arg) { - struct slash_qdma_dev *qdma_dev = file->private_data; - struct miscdevice *misc = &qdma_dev->misc; + struct slash_qdma_client *client = file->private_data; + struct slash_qdma_dev *qdma_dev; + struct miscdevice *misc; void __user *uarg = (void __user *)arg; long ret = 0; - if (!qdma_dev) + if (!client || !client->qdma_dev) return -ENODEV; + qdma_dev = client->qdma_dev; + misc = &qdma_dev->misc; + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, "ioctl op=0x%x\n", op); /* Early rejection if the device is shutting down. */ @@ -1735,7 +1770,15 @@ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned lo break; case SLASH_QDMA_IOCTL_QPAIR_GET_FD: - ret = slash_qdma_ioctl_qpair_get_fd_w(misc, qdma_dev, uarg); + ret = slash_qdma_ioctl_qpair_get_fd_w(misc, client, uarg); + break; + + case SLASH_QDMA_IOCTL_BUF_REGISTER: + ret = slash_qdma_ioctl_buf_register_w(misc, client, uarg); + break; + + case SLASH_QDMA_IOCTL_BUF_UNREGISTER: + ret = slash_qdma_ioctl_buf_unregister_w(misc, client, uarg); break; default: @@ -1763,18 +1806,34 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) struct miscdevice *misc = file->private_data; struct slash_qdma_dev *qdma_dev = container_of(misc, struct slash_qdma_dev, misc); - int ret = 0; + struct slash_qdma_client *client; mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { - ret = -ENODEV; - } else { - kref_get(&qdma_dev->ref); - file->private_data = qdma_dev; + mutex_unlock(&qdma_dev->lock); + return -ENODEV; } + kref_get(&qdma_dev->ref); mutex_unlock(&qdma_dev->lock); - return ret; + /* + * Allocate a 
per-open client context to own any buffers registered + * through this fd. The control fd holds the initial client ref; it + * is dropped (and the buffers torn down) in slash_qdma_fop_release(). + */ + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) { + kref_put(&qdma_dev->ref, slash_qdma_dev_release); + return -ENOMEM; + } + + kref_init(&client->ref); + client->qdma_dev = qdma_dev; + xa_init_flags(&client->buffers, XA_FLAGS_ALLOC); + + file->private_data = client; + + return 0; } /** @@ -1789,10 +1848,28 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) */ static int slash_qdma_fop_release(struct inode *inode, struct file *file) { - struct slash_qdma_dev *qdma_dev = file->private_data; + struct slash_qdma_client *client = file->private_data; + struct slash_qdma_buf *buf; + unsigned long index; - if (qdma_dev) - kref_put(&qdma_dev->ref, slash_qdma_dev_release); + if (!client) + return 0; + + /* + * Auto-unregister any buffers the client forgot (or had no chance) to + * release. Remove each from the lookup table first so no new transfer + * can find it, then drop the table's reference. Buffers with an + * in-flight transfer stay alive until that transfer releases its ref. + */ + xa_for_each(&client->buffers, index, buf) { + xa_erase(&client->buffers, index); + slash_qdma_buf_put(buf); + } + xa_destroy(&client->buffers); + + kref_put(&client->ref, slash_qdma_client_release); + + file->private_data = NULL; return 0; } @@ -1949,6 +2026,12 @@ static int slash_qdma_ioctl_qpair_add_w(struct miscdevice *misc, if (req.h2c_ring_sz >= 16 || req.c2h_ring_sz >= 16 || req.cmpt_ring_sz >= 16) return -EINVAL; + /* Validate the per-queue AXI-MM channel selection. 
*/ + if (req.mm_channel != SLASH_QDMA_MM_CHANNEL_AUTO && + req.mm_channel != SLASH_QDMA_MM_CHANNEL_0 && + req.mm_channel != SLASH_QDMA_MM_CHANNEL_1) + return -EINVAL; + mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { mutex_unlock(&qdma_dev->lock); @@ -2135,19 +2218,26 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, qconf.aperture_size = 0; /* Linear MM addressing; non-zero enables keyhole mode */ /* - * CPM5 exposes two MM channels; by default stripe queue pairs across - * them via (qid & 1). libqdma also mirrors mm_channel into the SW-context - * host_id, so this selects the programmed Host Profile too: even queues -> - * Host Profile 0 (NoC Channel 0), odd queues -> Host Profile 1 (NoC - * Channel 1). See slash_qdma_program_host_profiles(). - * - * The qdma_force_mm_channel module parameter overrides the split and pins - * every new queue to a single channel, for NoC-bandwidth A/B testing. + * CPM5 exposes two MM channels. The per-queue mm_channel selection + * (validated in slash_qdma_ioctl_qpair_add_w) chooses the channel: AUTO + * stripes across channels by (qid & 1); CHANNEL_0/CHANNEL_1 pin to a single + * channel. libqdma mirrors mm_channel into the SW-context host_id, so this + * also selects the programmed Host Profile: channel 0 -> Host Profile 0 + * (NoC Channel 0), channel 1 -> Host Profile 1 (NoC Channel 1). See + * slash_qdma_program_host_profiles(). 
*/ - if (qdma_force_mm_channel >= 0) - qconf.mm_channel = (u32)qdma_force_mm_channel; - else + switch (req->mm_channel) { + case SLASH_QDMA_MM_CHANNEL_0: + qconf.mm_channel = 0; + break; + case SLASH_QDMA_MM_CHANNEL_1: + qconf.mm_channel = 1; + break; + case SLASH_QDMA_MM_CHANNEL_AUTO: + default: qconf.mm_channel = req->qid & 1; + break; + } /* --- Per-direction ring configuration --- */ switch (qtype) { @@ -2171,9 +2261,9 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, } SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "queue add qid=%u type=%u mode=%u mm_channel=%u%s\n", + "queue add qid=%u type=%u mode=%u mm_channel=%u (req=%u)\n", req->qid, qtype, req->mode, qconf.mm_channel, - qdma_force_mm_channel >= 0 ? " (forced)" : ""); + req->mm_channel); err = qdma_queue_add(qdma_dev->qdma_handle, &qconf, &qhndl, errbuf, sizeof(errbuf)); if (err) { @@ -2785,6 +2875,311 @@ static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, return slash_qdma_map_user_base_pages_to_sgl(iocb, write); } +/* ───────────────────────────────────────────────────────────────────── + * Registered buffers: persistent pin + DMA mapping + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_buf_dma_unmap() - Tear down the cached DMA mapping of a buffer. + * @buf: Registered buffer whose SGL entries were DMA-mapped. + * + * Unmaps every SGL entry that carries a non-zero dma_addr and clears it. + * Safe to call on a partially-mapped buffer (used on the registration + * error path). 
+ */ +static void slash_qdma_buf_dma_unmap(struct slash_qdma_buf *buf) +{ + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; + + if (!buf->iocb.sgl) + return; + + for (i = 0; i < buf->iocb.pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->iocb.sgl[i]; + + if (sg->dma_addr) { + dma_unmap_page(dev, sg->dma_addr, sg->len, DMA_BIDIRECTIONAL); + sg->dma_addr = 0UL; + } + } +} + +/** + * slash_qdma_buf_dma_map() - DMA-map every SGL entry of a registered buffer. + * @buf: Registered buffer with a freshly built (pinned) SGL. + * + * Maps each entry with DMA_BIDIRECTIONAL so the same cached mapping serves + * both H2C and C2H transfers. On any failure all previously mapped entries + * are unmapped before returning. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_buf_dma_map(struct slash_qdma_buf *buf) +{ + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; + + for (i = 0; i < buf->iocb.pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->iocb.sgl[i]; + + sg->dma_addr = dma_map_page(dev, sg->pg, sg->offset, sg->len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, sg->dma_addr)) { + sg->dma_addr = 0UL; + pr_err("slash: qdma: buffer DMA map failed at entry %u/%u\n", + i, buf->iocb.pages_nr); + slash_qdma_buf_dma_unmap(buf); + return -ENOMEM; + } + } + + return 0; +} + +/** + * slash_qdma_buf_release() - kref release callback for a registered buffer. + * @ref: kref embedded in the slash_qdma_buf being freed. + * + * Runs when the last reference drops (table ref plus any in-flight transfer + * refs). Tears down the DMA mapping, unpins the pages (marking them dirty in + * case a C2H transfer wrote into them), frees the SGL, and frees the struct. + */ +static void slash_qdma_buf_release(struct kref *ref) +{ + struct slash_qdma_buf *buf = + container_of(ref, struct slash_qdma_buf, ref); + + slash_qdma_buf_dma_unmap(buf); + /* write=false marks the pages dirty: a C2H transfer may have written. 
*/ + slash_qdma_unmap_user_buf(&buf->iocb, false); + slash_qdma_iocb_release(&buf->iocb); + kfree(buf); +} + +static inline void slash_qdma_buf_get(struct slash_qdma_buf *buf) +{ + kref_get(&buf->ref); +} + +static void slash_qdma_buf_put(struct slash_qdma_buf *buf) +{ + kref_put(&buf->ref, slash_qdma_buf_release); +} + +/** + * slash_qdma_client_release() - kref release callback for a control-fd client. + * @ref: kref embedded in the slash_qdma_client being freed. + * + * Runs when the control fd and all qpair fds derived from it have closed. + * By this point the buffer table has already been drained in + * slash_qdma_fop_release(); here we just release the device reference and + * free the client. + */ +static void slash_qdma_client_release(struct kref *ref) +{ + struct slash_qdma_client *client = + container_of(ref, struct slash_qdma_client, ref); + + xa_destroy(&client->buffers); + if (client->qdma_dev) + kref_put(&client->qdma_dev->ref, slash_qdma_dev_release); + kfree(client); +} + +/** + * slash_qdma_buf_lookup_get() - Look up a buffer by id and take a ref. + * @client: Owning client context. + * @buf_id: Buffer handle. + * + * Returns the buffer with an extra reference held, or NULL if no such + * buffer exists. The xa_lock serialises against unregister/teardown so the + * buffer cannot be freed between lookup and the kref_get. + */ +static struct slash_qdma_buf * +slash_qdma_buf_lookup_get(struct slash_qdma_client *client, u32 buf_id) +{ + struct slash_qdma_buf *buf; + + xa_lock(&client->buffers); + buf = xa_load(&client->buffers, buf_id); + if (buf) + slash_qdma_buf_get(buf); + xa_unlock(&client->buffers); + + return buf; +} + +/* ───────────────────────────────────────────────────────────────────── + * IOCTL: buffer register / unregister + * ───────────────────────────────────────────────────────────────────── */ + +/** + * slash_qdma_ioctl_buf_register_w() - Pin and DMA-map a host buffer. + * @misc: Miscdevice handle (for logging). 
+ * @client: Owning control-fd client. + * @uarg: User pointer to a struct slash_qdma_buf_register. + * + * Pins the pages backing the user buffer, builds a scatter-gather list + * (reusing the same 4 KiB / 2 MiB granule detection as the per-transfer + * path), DMA-maps every entry once, and inserts the resulting buffer into + * the client's table under a freshly allocated buf_id. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, + struct slash_qdma_client *client, + void __user *uarg) +{ + struct slash_qdma_buf_register req; + struct slash_qdma_dev *qdma_dev = client->qdma_dev; + struct slash_qdma_buf *buf; + __u32 user_size = 0; + size_t copy_size; + u32 buf_id; + int rv; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_BUF_REGISTER_MIN_SIZE) { + dev_warn(misc->this_device, + "qdma: BUF_REGISTER size too small (%u)\n", user_size); + return -EINVAL; + } + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + if (req.flags != 0) + return -EINVAL; + + if (req.length == 0 || (req.length % PAGE_SIZE) != 0) + return -EINVAL; + + if ((req.user_addr % PAGE_SIZE) != 0) + return -EINVAL; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + kref_init(&buf->ref); + buf->qdma_dev = qdma_dev; + buf->length = req.length; + buf->iocb.buf = (void __user *)(unsigned long)req.user_addr; + buf->iocb.len = (size_t)req.length; + + /* + * Pin the pages and build the SGL once. Pin writable (write=true) so + * the same registration serves C2H transfers, where the device writes + * into the pages. 
+ */ + rv = slash_qdma_map_user_buf_to_sgl(&buf->iocb, true); + if (rv < 0) { + kfree(buf); + return rv; + } + + if (buf->iocb.pages_nr == 0 || !buf->iocb.sgl) { + slash_qdma_unmap_user_buf(&buf->iocb, false); + slash_qdma_iocb_release(&buf->iocb); + kfree(buf); + return -EINVAL; + } + + buf->granule = buf->iocb.sgl[0].len; + + rv = slash_qdma_buf_dma_map(buf); + if (rv < 0) { + slash_qdma_unmap_user_buf(&buf->iocb, false); + slash_qdma_iocb_release(&buf->iocb); + kfree(buf); + return rv; + } + + rv = xa_alloc(&client->buffers, &buf_id, buf, + SLASH_QDMA_BUF_ID_RANGE, GFP_KERNEL); + if (rv < 0) { + slash_qdma_buf_dma_unmap(buf); + slash_qdma_unmap_user_buf(&buf->iocb, false); + slash_qdma_iocb_release(&buf->iocb); + kfree(buf); + return rv; + } + + buf->buf_id = buf_id; + + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "buf register: id=%u addr=0x%llx len=%llu granule=%llu entries=%u\n", + buf_id, (unsigned long long)req.user_addr, + (unsigned long long)req.length, + (unsigned long long)buf->granule, + buf->iocb.pages_nr); + + /* Copy the assigned buf_id back to userspace. */ + req.size = sizeof(req); + req.buf_id = buf_id; + copy_size = min_t(size_t, user_size, sizeof(req)); + if (copy_to_user(uarg, &req, copy_size)) { + xa_erase(&client->buffers, buf_id); + slash_qdma_buf_put(buf); + return -EFAULT; + } + if (user_size > sizeof(req)) { + if (clear_user((void __user *)((unsigned long)uarg + sizeof(req)), + user_size - sizeof(req))) { + xa_erase(&client->buffers, buf_id); + slash_qdma_buf_put(buf); + return -EFAULT; + } + } + + return 0; +} + +/** + * slash_qdma_ioctl_buf_unregister_w() - Drop a registered buffer. + * @misc: Miscdevice handle (unused). + * @client: Owning control-fd client. + * @uarg: User pointer to a struct slash_qdma_buf_unregister. + * + * Removes the buffer from the client table so no new transfer can find it, + * then drops the table's reference. 
Actual unpin/unmap is deferred to the + * buffer's release callback once any in-flight transfer has finished. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_ioctl_buf_unregister_w(struct miscdevice *misc, + struct slash_qdma_client *client, + void __user *uarg) +{ + struct slash_qdma_buf_unregister req; + struct slash_qdma_buf *buf; + __u32 user_size = 0; + + (void)misc; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_BUF_UNREGISTER_MIN_SIZE) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + buf = xa_erase(&client->buffers, req.buf_id); + if (!buf) + return -ENOENT; + + slash_qdma_buf_put(buf); + + return 0; +} + /** * slash_qdma_qpair_read_write() - Perform a DMA transfer via a qpair fd. * @file: The anon_inode file for this queue pair. @@ -2965,25 +3360,170 @@ static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf, count, ppos, true); } +/** + * slash_qdma_qpair_transfer() - Registered-buffer DMA transfer on a qpair fd. + * @file: Anon_inode file for the queue pair. + * @uarg: User pointer to a struct slash_qdma_transfer. + * + * Looks up the registered buffer by id in the owning client, validates the + * requested slice against the buffer's page granule and length, resolves the + * queue handle for the requested direction, and submits the cached, + * pre-DMA-mapped SGL slice (req->dma_mapped = 1) to libqdma. + * + * Unlike the legacy read/write path, no pages are pinned or DMA-mapped here: + * that work was amortised at registration time. + * + * Return: number of bytes transferred (>= 0) on success, negative errno on + * failure. 
+ */ +static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) +{ + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + struct slash_qdma_transfer req; + struct slash_qdma_dev *qdma_dev; + struct slash_qdma_qpair_entry *entry; + struct slash_qdma_client *client; + struct slash_qdma_buf *buf; + struct qdma_request qreq; + unsigned long qhndl; + bool write; + u32 dir_bit; + enum queue_type_t qtype; + u64 start_entry, n_entries; + __u32 user_size = 0; + ssize_t res; + + if (!ctx) + return -EINVAL; + + qdma_dev = ctx->qdma_dev; + entry = ctx->entry; + client = ctx->client; + + if (!qdma_dev || !entry || !client) + return -ENODEV; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_TRANSFER_MIN_SIZE) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + switch (req.direction) { + case SLASH_QDMA_XFER_H2C: + write = true; + dir_bit = SLASH_QDMA_DIR_H2C; + qtype = Q_H2C; + break; + case SLASH_QDMA_XFER_C2H: + write = false; + dir_bit = SLASH_QDMA_DIR_C2H; + qtype = Q_C2H; + break; + default: + return -EINVAL; + } + + /* Resolve and ref the registered buffer. */ + buf = slash_qdma_buf_lookup_get(client, req.buf_id); + if (!buf) + return -ENOENT; + + /* Validate the requested slice against the buffer's page granule. */ + if (buf->granule == 0 || req.length == 0 || + (req.buf_offset % buf->granule) != 0 || + (req.length % buf->granule) != 0) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + if (req.buf_offset > buf->length || + req.length > buf->length - req.buf_offset) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + + start_entry = req.buf_offset / buf->granule; + n_entries = req.length / buf->granule; + if (start_entry + n_entries > buf->iocb.pages_nr) { + slash_qdma_buf_put(buf); + return -EINVAL; + } + + /* Check device liveness and resolve the queue handle for the direction. 
*/ + mutex_lock(&qdma_dev->lock); + if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { + mutex_unlock(&qdma_dev->lock); + slash_qdma_buf_put(buf); + return -ENODEV; + } + if (!(entry->dir_mask & dir_bit) || + !slash_qdma_qhndl_is_valid(entry->qhndl[qtype])) { + mutex_unlock(&qdma_dev->lock); + slash_qdma_buf_put(buf); + return -ENODEV; + } + qhndl = entry->qhndl[qtype]; + mutex_unlock(&qdma_dev->lock); + + /* + * Submit the cached SGL slice. dma_mapped = 1 tells libqdma the SGL is + * already DMA-mapped (dma_addr filled at registration), so it skips the + * per-request map/unmap entirely. + */ + memset(&qreq, 0, sizeof(qreq)); + qreq.sgcnt = (unsigned int)n_entries; + qreq.sgl = &buf->iocb.sgl[start_entry]; + qreq.write = write ? 1 : 0; + qreq.dma_mapped = 1; + qreq.udd_len = 0; + qreq.ep_addr = (u64)req.dev_addr; + qreq.count = (unsigned int)req.length; + qreq.timeout_ms = 10 * 1000; + qreq.fp_done = NULL; + qreq.h2c_eot = 1; + + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "transfer: qid=%u buf=%u off=%llu dev=0x%llx len=%llu dir=%s\n", + ctx->qid, req.buf_id, + (unsigned long long)req.buf_offset, + (unsigned long long)req.dev_addr, + (unsigned long long)req.length, + write ? "H2C" : "C2H"); + + res = qdma_request_submit(qdma_dev->qdma_handle, qhndl, &qreq); + + slash_qdma_buf_put(buf); + + if (res < 0) + return (long)res; + + return (long)res; +} + /** * slash_qdma_qpair_ioctl() - Ioctl handler for per-qpair anon_inode fds. * @file: Anon_inode file. * @cmd: Ioctl command number. * @arg: User-space argument. * - * Currently a stub — no per-fd ioctls are defined. Returns -ENOTTY - * for all commands. + * Supports SLASH_QDMA_QPAIR_IOCTL_TRANSFER (registered-buffer DMA transfer). * - * Return: -ENOTTY (no valid ioctl). + * Return: bytes transferred (>= 0) for TRANSFER, or -ENOTTY for any other + * command. 
*/ static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - (void)file; - (void)cmd; - (void)arg; - - return -ENOTTY; + switch (cmd) { + case SLASH_QDMA_QPAIR_IOCTL_TRANSFER: + return slash_qdma_qpair_transfer(file, (void __user *)arg); + default: + return -ENOTTY; + } } /** @@ -3010,6 +3550,8 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file) if (ctx) { if (ctx->entry) slash_qdma_qpair_put(ctx->entry); + if (ctx->client) + kref_put(&ctx->client->ref, slash_qdma_client_release); if (ctx->qdma_dev) kref_put(&ctx->qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); @@ -3046,9 +3588,10 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file) * Return: The new fd (>= 0) on success, negative errno on failure. */ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, - struct slash_qdma_dev *qdma_dev, + struct slash_qdma_client *client, void __user *uarg) { + struct slash_qdma_dev *qdma_dev = client->qdma_dev; struct slash_qdma_qpair_fd_request req; __u32 user_size = 0; size_t copy_size; @@ -3096,18 +3639,26 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, */ slash_qdma_qpair_get(entry); kref_get(&qdma_dev->ref); + /* + * Take a ref on the owning client so handle-based transfers issued on + * this qpair fd can resolve registered buffers even if the control fd + * that created the qpair is closed first. + */ + kref_get(&client->ref); mutex_unlock(&qdma_dev->lock); /* Allocate the per-fd context. */ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { slash_qdma_qpair_put(entry); + kref_put(&client->ref, slash_qdma_client_release); kref_put(&qdma_dev->ref, slash_qdma_dev_release); return -ENOMEM; } ctx->qdma_dev = qdma_dev; ctx->entry = entry; + ctx->client = client; ctx->qid = req.qid; /* Create the anonymous inode file with read/write access. 
*/ @@ -3116,6 +3667,7 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, if (IS_ERR(file)) { err = PTR_ERR(file); slash_qdma_qpair_put(entry); + kref_put(&client->ref, slash_qdma_client_release); kref_put(&qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); return err; From b94e3199a8656ceef0b7054f95c5d8632a636284 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:05:51 +0100 Subject: [PATCH 12/23] libslash: add qdma buffer register/unregister/transfer wrappers and mock Signed-off-by: Vlad-Gabriel Serbu --- driver/libslash/include/slash/qdma.h | 54 ++++++++++++ driver/libslash/src/qdma.c | 96 ++++++++++++++++++++ driver/libslash/src/qdma_mock.c | 127 +++++++++++++++++++++++++++ driver/libslash/src/qdma_mock.h | 9 ++ driver/libslash/tests/qdma_test.cpp | 89 +++++++++++++++++++ 5 files changed, 375 insertions(+) diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index 8d726544..db9b95ad 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -36,6 +36,13 @@ * via lseek()/pread()/pwrite() is also supported. splice(), mmap(), * and poll() are not available. * + * Registered buffers: + * For high-throughput transfers, a host buffer can be registered once + * with slash_qdma_buffer_register() (pinning its pages and DMA-mapping + * it), then moved with slash_qdma_transfer() which references the buffer + * by handle instead of re-pinning per call. Buffers are owned by the + * open QDMA handle and are auto-released when it is closed. + * * Error conventions: int-returning functions return -1 with errno set. * Pointer-returning functions return NULL with errno set. 
*/ @@ -46,6 +53,7 @@ #include "uapi/slash_interface.h" #include +#include #ifdef __cplusplus extern "C" { @@ -151,6 +159,52 @@ int slash_qdma_qpair_del(struct slash_qdma *qdma, uint32_t qid); */ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); +/** + * @brief Register a host buffer for DMA, pinning and DMA-mapping it once. + * + * @param qdma Open QDMA handle. + * @param addr Page-aligned host buffer base. + * @param length Buffer length in bytes (non-zero multiple of the page size). + * @param buf_id [out] Receives the kernel-assigned buffer handle. + * + * The buffer is owned by @qdma and is automatically released when the + * handle is closed. Pass the returned @buf_id to slash_qdma_transfer(). + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, + uint64_t length, uint32_t *buf_id); + +/** + * @brief Unregister a buffer previously registered with + * slash_qdma_buffer_register(). + * + * @param qdma Open QDMA handle. + * @param buf_id Buffer handle to release. + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); + +/** + * @brief Perform a DMA transfer using a registered buffer. + * + * @param qdma Open QDMA handle (used to dispatch to the mock backend). + * @param qpair_fd Queue-pair I/O fd from slash_qdma_qpair_get_fd(). + * @param buf_id Registered buffer handle. + * @param buf_offset Byte offset within the registered buffer. + * @param dev_addr Device-side (endpoint) address. + * @param length Number of bytes to transfer. + * @param direction One of enum slash_qdma_transfer_dir (H2C or C2H). + * + * @return Number of bytes transferred (>= 0) on success, -1 on failure + * (errno set). 
+ */ +ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, + uint32_t buf_id, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction); + #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index 68c38b6d..a95ec114 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -146,6 +146,7 @@ int slash_qdma_qpair_add(struct slash_qdma *qdma, tmp.size = sizeof(tmp); tmp.mode = req->mode; tmp.dir_mask = req->dir_mask; + tmp.mm_channel = req->mm_channel; tmp.h2c_ring_sz = req->h2c_ring_sz; tmp.c2h_ring_sz = req->c2h_ring_sz; tmp.cmpt_ring_sz = req->cmpt_ring_sz; @@ -248,3 +249,98 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags) return fd; } +int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, + uint64_t length, uint32_t *buf_id) +{ + struct slash_qdma_buf_register req; + int ret; + + if (qdma == NULL || addr == NULL || buf_id == NULL) { + errno = EINVAL; + return -1; + } + + if (qdma->priv) { + return slash_qdma_mock_buffer_register(qdma, addr, length, buf_id); + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.user_addr = (uint64_t)(uintptr_t)addr; + req.length = length; + + ret = ioctl(qdma->fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); + if (ret < 0) { + return -1; + } + + *buf_id = req.buf_id; + + return 0; +} + +int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id) +{ + struct slash_qdma_buf_unregister req; + int ret; + + if (qdma == NULL) { + errno = EINVAL; + return -1; + } + + if (qdma->priv) { + return slash_qdma_mock_buffer_unregister(qdma, buf_id); + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.buf_id = buf_id; + + ret = ioctl(qdma->fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); + if (ret < 0) { + return -1; + } + + return 0; +} + +ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, + uint32_t buf_id, uint64_t 
buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction) +{ + struct slash_qdma_transfer req; + int ret; + + if (qdma == NULL || qpair_fd < 0) { + errno = EINVAL; + return -1; + } + + if (direction != SLASH_QDMA_XFER_H2C && direction != SLASH_QDMA_XFER_C2H) { + errno = EINVAL; + return -1; + } + + if (qdma->priv) { + return slash_qdma_mock_transfer(qdma, qpair_fd, buf_id, buf_offset, + dev_addr, length, direction); + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.buf_id = buf_id; + req.buf_offset = buf_offset; + req.dev_addr = dev_addr; + req.length = length; + req.direction = direction; + + ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); + if (ret < 0) { + return -1; + } + + return (ssize_t)ret; +} + diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c index 92a24c6c..3194f43f 100644 --- a/driver/libslash/src/qdma_mock.c +++ b/driver/libslash/src/qdma_mock.c @@ -39,6 +39,7 @@ #include #define QDMA_MOCK_MAX_QUEUES 64 +#define QDMA_MOCK_MAX_BUFS 64 struct slash_qdma_mock_qpair { bool in_use; @@ -46,8 +47,15 @@ struct slash_qdma_mock_qpair { int fd; /* backing memfd; -1 when slot is free */ }; +struct slash_qdma_mock_buf { + bool in_use; + void *addr; /* host base address */ + uint64_t length; +}; + struct slash_qdma_mock { struct slash_qdma_mock_qpair queues[QDMA_MOCK_MAX_QUEUES]; + struct slash_qdma_mock_buf bufs[QDMA_MOCK_MAX_BUFS]; }; static struct slash_qdma_mock *mock_ctx(struct slash_qdma *qdma) @@ -257,3 +265,122 @@ int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flag return new_fd; } + +int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, + uint64_t length, uint32_t *buf_id) +{ + struct slash_qdma_mock *ctx; + size_t i; + + if (qdma == NULL || addr == NULL || buf_id == NULL || length == 0) { + errno = EINVAL; + return -1; + } + + ctx = mock_ctx(qdma); + + for (i = 0; i < QDMA_MOCK_MAX_BUFS; ++i) { + if (!ctx->bufs[i].in_use) { + break; 
+ } + } + + if (i == QDMA_MOCK_MAX_BUFS) { + errno = ENOSPC; + return -1; + } + + ctx->bufs[i].in_use = true; + ctx->bufs[i].addr = addr; + ctx->bufs[i].length = length; + + *buf_id = (uint32_t) i; + + return 0; +} + +int slash_qdma_mock_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id) +{ + struct slash_qdma_mock *ctx; + + if (qdma == NULL || buf_id >= QDMA_MOCK_MAX_BUFS) { + errno = EINVAL; + return -1; + } + + ctx = mock_ctx(qdma); + + if (!ctx->bufs[buf_id].in_use) { + errno = ENOENT; + return -1; + } + + memset(&ctx->bufs[buf_id], 0, sizeof(ctx->bufs[buf_id])); + + return 0; +} + +ssize_t slash_qdma_mock_transfer(struct slash_qdma *qdma, int qpair_fd, + uint32_t buf_id, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction) +{ + struct slash_qdma_mock *ctx; + struct slash_qdma_mock_buf *buf; + char *host; + uint64_t done = 0; + + if (qdma == NULL || qpair_fd < 0 || buf_id >= QDMA_MOCK_MAX_BUFS) { + errno = EINVAL; + return -1; + } + + ctx = mock_ctx(qdma); + buf = &ctx->bufs[buf_id]; + + if (!buf->in_use) { + errno = ENOENT; + return -1; + } + + if (length == 0 || buf_offset > buf->length || + length > buf->length - buf_offset) { + errno = EINVAL; + return -1; + } + + host = (char *) buf->addr + buf_offset; + + /* + * Emulate the device endpoint with the queue's backing memfd: H2C writes + * host data to the memfd at dev_addr, C2H reads it back. Loop to absorb + * short transfers from the underlying file ops. 
+ */ + while (done < length) { + ssize_t n; + + if (direction == SLASH_QDMA_XFER_H2C) { + n = pwrite(qpair_fd, host + done, (size_t)(length - done), + (off_t)(dev_addr + done)); + } else if (direction == SLASH_QDMA_XFER_C2H) { + n = pread(qpair_fd, host + done, (size_t)(length - done), + (off_t)(dev_addr + done)); + } else { + errno = EINVAL; + return -1; + } + + if (n < 0) { + if (errno == EINTR) { + continue; + } + return -1; + } + if (n == 0) { + break; + } + done += (uint64_t) n; + } + + return (ssize_t) done; +} diff --git a/driver/libslash/src/qdma_mock.h b/driver/libslash/src/qdma_mock.h index 36f3d596..34f591ed 100644 --- a/driver/libslash/src/qdma_mock.h +++ b/driver/libslash/src/qdma_mock.h @@ -25,6 +25,8 @@ #include +#include + struct slash_qdma *slash_qdma_mock_open(void); int slash_qdma_mock_close(struct slash_qdma *qdma); int slash_qdma_mock_info_read(struct slash_qdma *qdma, struct slash_qdma_info *info); @@ -33,5 +35,12 @@ int slash_qdma_mock_qpair_start(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_stop(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_del(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); +int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, + uint64_t length, uint32_t *buf_id); +int slash_qdma_mock_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); +ssize_t slash_qdma_mock_transfer(struct slash_qdma *qdma, int qpair_fd, + uint32_t buf_id, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction); #endif /* LIBSLASH_QDMA_MOCK_H */ diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index 5b024111..98ae2c9e 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -100,6 +101,39 @@ TEST(QdmaNullTest, QpaiGetFd) { EXPECT_EQ(errno, EINVAL); } 
+TEST(QdmaNullTest, BufferRegister) { + uint32_t buf_id = 0; + uint8_t local = 0; + errno = 0; + EXPECT_EQ(slash_qdma_buffer_register(nullptr, &local, 4096, &buf_id), -1); + EXPECT_EQ(errno, EINVAL); + + struct slash_qdma fake{}; + fake.fd = -1; + errno = 0; + EXPECT_EQ(slash_qdma_buffer_register(&fake, nullptr, 4096, &buf_id), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, BufferUnregister) { + errno = 0; + EXPECT_EQ(slash_qdma_buffer_unregister(nullptr, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, Transfer) { + errno = 0; + EXPECT_EQ(slash_qdma_transfer(nullptr, 3, 0, 0, 0, 4096, SLASH_QDMA_XFER_H2C), -1); + EXPECT_EQ(errno, EINVAL); + + struct slash_qdma fake{}; + fake.fd = -1; + errno = 0; + /* Invalid direction is rejected before any backend dispatch. */ + EXPECT_EQ(slash_qdma_transfer(&fake, 3, 0, 0, 0, 4096, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + // ─── Real device tests (requires /dev/slash_qdma_ctl0) ─────────────────────── class ParametrizedQdmaTest : public ::testing::TestWithParam { @@ -178,6 +212,61 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) { EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); } +TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { + static constexpr size_t XFER_SIZE = 4096; + + struct slash_qdma_qpair_add req{}; + req.mode = 0; /* QDMA_Q_MODE_MM */ + req.dir_mask = 0x3; /* H2C | C2H */ + + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + uint32_t qid = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0); + + int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); + ASSERT_GE(queue_fd, 0); + + // Page-aligned host staging buffers, as registration requires. 
+ void *src_mem = nullptr; + void *dst_mem = nullptr; + ASSERT_EQ(posix_memalign(&src_mem, 4096, XFER_SIZE), 0); + ASSERT_EQ(posix_memalign(&dst_mem, 4096, XFER_SIZE), 0); + auto *src = static_cast(src_mem); + auto *dst = static_cast(dst_mem); + for (size_t i = 0; i < XFER_SIZE; ++i) { + src[i] = static_cast(i & 0xFF); + } + std::memset(dst, 0, XFER_SIZE); + + uint32_t src_buf = 0; + uint32_t dst_buf = 0; + ASSERT_EQ(slash_qdma_buffer_register(qdma_, src, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_buffer_register(qdma_, dst, XFER_SIZE, &dst_buf), 0); + + // H2C: push the source buffer to the device. + ssize_t written = slash_qdma_transfer(qdma_, queue_fd, src_buf, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(written, static_cast(XFER_SIZE)); + + // C2H: pull it back into the destination buffer and verify. + ssize_t read_bytes = slash_qdma_transfer(qdma_, queue_fd, dst_buf, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_C2H); + EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + + EXPECT_EQ(slash_qdma_buffer_unregister(qdma_, src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_unregister(qdma_, dst_buf), 0); + + free(src_mem); + free(dst_mem); + + EXPECT_EQ(close(queue_fd), 0); + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); +} + TEST_P(ParametrizedQdmaTest, CloseSucceeds) { EXPECT_EQ(slash_qdma_close(qdma_), 0); qdma_ = nullptr; From 5dc6e4fbfcfcceef0b830737a120c38cc7cdf11d Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:06:00 +0100 Subject: [PATCH 13/23] tests: cover qdma registered-buffer kselftest abi Signed-off-by: Vlad-Gabriel Serbu --- driver/tests/test_slash_qdma.c | 209 +++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index bf706cf9..afe7bfe7 100644 --- a/driver/tests/test_slash_qdma.c +++ 
b/driver/tests/test_slash_qdma.c @@ -823,4 +823,213 @@ TEST_F(qdma, hugepage_write_read_verify) munmap(read_buf, HUGE_TRANSFER_SIZE); } +/* ---------- registered buffers ---------- */ + +/* Register a host buffer via the control fd; returns 0 or -errno. */ +static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, + uint32_t *buf_id) +{ + struct slash_qdma_buf_register req; + int ret; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.user_addr = (uint64_t)(uintptr_t)addr; + req.length = length; + + ret = ioctl(ctl_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); + if (ret < 0) + return -errno; + + *buf_id = req.buf_id; + return 0; +} + +/* Unregister a buffer via the control fd; returns 0 or -errno. */ +static int qdma_buf_unregister(int ctl_fd, uint32_t buf_id) +{ + struct slash_qdma_buf_unregister req; + int ret; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.buf_id = buf_id; + + ret = ioctl(ctl_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); + return ret < 0 ? -errno : 0; +} + +/* Issue a registered-buffer transfer on a qpair fd; returns ioctl result. 
*/ +static long qdma_buf_transfer(int io_fd, uint32_t buf_id, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction) +{ + struct slash_qdma_transfer req; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.buf_id = buf_id; + req.buf_offset = buf_offset; + req.dev_addr = dev_addr; + req.length = length; + req.direction = direction; + + return ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); +} + +TEST_F(qdma, buf_register_size_below_input_min_returns_einval) +{ + struct slash_qdma_buf_register req; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(__u32); /* below the trailing input field */ + EXPECT_EQ(-1, ioctl(self->ctl_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req)); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(qdma, buf_register_zero_length_returns_einval) +{ + uint8_t *buf; + uint32_t buf_id = 0; + + buf = aligned_alloc(4096, TRANSFER_SIZE); + ASSERT_NE(NULL, buf); + + EXPECT_EQ(-EINVAL, qdma_buf_register(self->ctl_fd, buf, 0, &buf_id)); + + free(buf); +} + +TEST_F(qdma, buf_register_unaligned_returns_einval) +{ + uint8_t *buf; + uint32_t buf_id = 0; + + buf = aligned_alloc(4096, TRANSFER_SIZE * 2); + ASSERT_NE(NULL, buf); + + /* Misaligned base address is rejected. 
*/ + EXPECT_EQ(-EINVAL, + qdma_buf_register(self->ctl_fd, buf + 1, TRANSFER_SIZE, &buf_id)); + + free(buf); +} + +TEST_F(qdma, transfer_size_below_input_min_returns_einval) +{ + struct slash_qdma_transfer req; + + bring_up_qpair(_metadata, self, 0x3); + + memset(&req, 0, sizeof(req)); + req.size = sizeof(__u32); /* below the trailing input field */ + EXPECT_EQ(-1, ioctl(self->io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req)); + EXPECT_EQ(EINVAL, errno); +} + +TEST_F(qdma, transfer_unknown_buf_returns_enoent) +{ + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + ret = qdma_buf_transfer(self->io_fd, 0xDEAD, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ENOENT, errno); +} + +TEST_F(qdma, transfer_wrong_direction_returns_enodev) +{ + uint8_t *buf; + uint32_t buf_id = 0; + long ret; + + bring_up_qpair(_metadata, self, 0x1); /* H2C only */ + + buf = mmap(NULL, TRANSFER_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, buf); + + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id)); + + /* C2H is not enabled on this qpair. */ + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_C2H); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ENODEV, errno); + + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); + munmap(buf, TRANSFER_SIZE); +} + +TEST_F(qdma, transfer_out_of_range_returns_einval) +{ + uint8_t *buf; + uint32_t buf_id = 0; + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + buf = mmap(NULL, TRANSFER_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, buf); + + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id)); + + /* Slice extends past the registered length. 
*/ + ret = qdma_buf_transfer(self->io_fd, buf_id, TRANSFER_SIZE, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); + munmap(buf, TRANSFER_SIZE); +} + +TEST_F(qdma, registered_buffer_round_trip) +{ + const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages */ + uint8_t *write_buf, *read_buf; + uint32_t write_id = 0, read_id = 0; + uint64_t dma_addr = get_dma_addr(); + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + write_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, read_buf); + + fill_pattern(write_buf, xfer_size); + memset(read_buf, 0, xfer_size); + + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, xfer_size, + &write_id)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, + &read_id)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, xfer_size, + SLASH_QDMA_XFER_H2C); + ASSERT_EQ((long)xfer_size, ret); + + ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, xfer_size, + SLASH_QDMA_XFER_C2H); + ASSERT_EQ((long)xfer_size, ret); + + EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); + + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); + + munmap(write_buf, xfer_size); + munmap(read_buf, xfer_size); +} + TEST_HARNESS_MAIN From 23dca652f52f66b946767132e8ebfefcdafd4c43 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:06:08 +0100 Subject: [PATCH 14/23] docs: document qdma registered-buffer abi Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 118 ++++++++++++++++++++++++++++ driver/libslash/README.md | 20 +++++ 2 files changed, 138 insertions(+) diff --git 
a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index dd4e6a1d..d97eda1c 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -718,6 +718,124 @@ as the ``ioctl()`` return value (not as a struct field). - ``-ENOMEM`` — allocation failure - Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()`` +``SLASH_QDMA_IOCTL_BUF_REGISTER`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Registers a host buffer for DMA. The kernel pins the backing pages, builds a scatter-gather list, +and DMA-maps it **once**. Subsequent transfers reference the buffer by ``buf_id`` and reuse the +cached, pre-DMA-mapped SGL instead of pinning and mapping per transfer. Registered buffers are owned +by the control-fd open instance they are registered through and are auto-released when that fd is +closed (including on process exit). + +**Interface:** + +.. code-block:: c + + #define SLASH_QDMA_IOCTL_BUF_REGISTER _IOWR('v', 0x54, struct slash_qdma_buf_register) + + struct slash_qdma_buf_register { + __u32 size; /* [in/out] ABI version */ + __u32 flags; /* [in] Reserved; must be 0 */ + __u64 user_addr; /* [in] Page-aligned host buffer base */ + __u64 length; /* [in] Buffer length in bytes (page multiple) */ + __u32 buf_id; /* [out] Kernel-assigned buffer handle */ + __u32 pad0; /* padding */ + }; + +**Direction:** ``_IOWR`` — userspace writes ``flags``, ``user_addr``, ``length``; the kernel writes +back ``buf_id``. + +**Preconditions:** + +- ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL`` +- ``flags`` must be 0 +- ``user_addr`` must be page-aligned; ``length`` must be a non-zero multiple of the page size +- The buffer must be backed by a single page granule (all 4 KiB base pages or all 2 MiB hugepages) + +**Postconditions:** + +- ``buf_id`` is filled with the client-scoped handle, used in ``SLASH_QDMA_QPAIR_IOCTL_TRANSFER``. 
+- The pages remain pinned and DMA-mapped until the buffer is unregistered or the owning control fd + is closed. + +**Return values:** + +- ``0`` — success +- ``-EFAULT`` — copy failure +- ``-EINVAL`` — ``size`` too small, non-zero ``flags``, misaligned/zero ``length`` or ``user_addr``, + or a page granule that does not match the transfer data path +- ``-ENOMEM`` — allocation, pinning, or DMA-mapping failure +- ``-EBUSY`` — no buffer IDs available +- ``-ENODEV`` — device shutting down + +``SLASH_QDMA_IOCTL_BUF_UNREGISTER`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Removes a registered buffer from the owning client's table. The pages are unpinned and the DMA +mapping torn down once no in-flight transfer still references the buffer. + +**Interface:** + +.. code-block:: c + + #define SLASH_QDMA_IOCTL_BUF_UNREGISTER _IOWR('v', 0x55, struct slash_qdma_buf_unregister) + + struct slash_qdma_buf_unregister { + __u32 size; /* [in/out] ABI version */ + __u32 buf_id; /* [in] Buffer handle from BUF_REGISTER */ + }; + +**Return values:** + +- ``0`` — success +- ``-EFAULT`` — copy failure +- ``-EINVAL`` — ``size`` too small +- ``-ENOENT`` — ``buf_id`` not found in this client's table + +``SLASH_QDMA_QPAIR_IOCTL_TRANSFER`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Performs a DMA transfer using a registered buffer. Unlike ``read``/``write``/``pread``/``pwrite``, +this ioctl is issued on a **queue-pair I/O fd** (from ``SLASH_QDMA_IOCTL_QPAIR_GET_FD``), not the +control device. No pages are pinned or DMA-mapped on this path — that work was amortised at +registration time — so it submits the cached, pre-DMA-mapped SGL slice directly. + +**Interface:** + +.. 
code-block:: c + + #define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) + + struct slash_qdma_transfer { + __u32 size; /* [in/out] ABI version */ + __u32 buf_id; /* [in] Registered buffer handle */ + __u64 buf_offset; /* [in] Byte offset within the registered buffer */ + __u64 dev_addr; /* [in] Device-side (endpoint) address */ + __u64 length; /* [in] Number of bytes to transfer */ + __u32 direction; /* [in] 1=H2C (write), 2=C2H (read) */ + __u32 pad0; /* padding */ + }; + +**Direction:** ``_IOWR`` — userspace writes all input fields; the number of bytes transferred is +returned as the ``ioctl()`` return value (not as a struct field). + +**Preconditions:** + +- ``size`` must cover at least ``direction`` (the trailing input field) — otherwise ``-EINVAL`` +- ``direction`` must be 1 (H2C) or 2 (C2H) and must be enabled on the queue pair +- ``buf_id`` must refer to a buffer registered on the same control fd that created this qpair fd +- ``buf_offset`` and ``length`` must be aligned to the buffer's page granule, ``length`` non-zero, + and ``buf_offset + length`` must not exceed the registered length + +**Return values:** + +- ``>= 0`` — number of bytes transferred (success) +- ``-EFAULT`` — copy failure +- ``-EINVAL`` — ``size`` too small, bad ``direction``, or an out-of-range / misaligned slice +- ``-ENOENT`` — ``buf_id`` not found +- ``-ENODEV`` — device shutting down or the requested direction is not enabled on the qpair +- Other negative errno from libqdma's ``qdma_request_submit()`` + Device resets and hotplugging: ``/dev/slash_hotplug`` ===================================================== diff --git a/driver/libslash/README.md b/driver/libslash/README.md index 9e04813a..e9915dd4 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -119,6 +119,26 @@ slash_qdma_qpair_del(qdma, qid); slash_qdma_close(qdma); ``` +For high-throughput transfers, register a host buffer once (pinning its pages +and DMA-mapping it) and 
then move data by handle, avoiding per-transfer pinning: + +```c +/* buf must be page-aligned and a whole number of pages */ +uint32_t buf_id; +slash_qdma_buffer_register(qdma, buf, len, &buf_id); + +int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); + +/* H2C: host -> device at dev_addr */ +slash_qdma_transfer(qdma, fd, buf_id, /*buf_offset=*/0, dev_addr, len, + SLASH_QDMA_XFER_H2C); +/* C2H: device -> host */ +slash_qdma_transfer(qdma, fd, buf_id, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); + +close(fd); +slash_qdma_buffer_unregister(qdma, buf_id); +``` + ### Hotplug — PCIe device lifecycle Typical FPGA reconfiguration flow: From a8146eb307ee912b14e07c40b502e4ab92947f00 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:06:15 +0100 Subject: [PATCH 15/23] smi: add --ring-size-index and use registered buffers for raw transfers Signed-off-by: Vlad-Gabriel Serbu --- smi/src/qdma_driver_backend.cpp | 31 ++-- smi/src/qdma_driver_backend.hpp | 13 +- smi/src/smi.cpp | 20 +++ smi/src/validate.cpp | 250 +++++++++++++++++++++++++++----- smi/src/validate.hpp | 21 +++ 5 files changed, 283 insertions(+), 52 deletions(-) diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp index 7cc25440..fd3faf50 100644 --- a/smi/src/qdma_driver_backend.cpp +++ b/smi/src/qdma_driver_backend.cpp @@ -302,8 +302,10 @@ constexpr uint32_t QRNGSZ_IDX_DEFAULT = 9; } // namespace -QdmaDriverDevice::QdmaDriverDevice(const std::string& boardBdf) - : nl_(std::make_unique()) { +QdmaDriverDevice::QdmaDriverDevice(const std::string& boardBdf, + std::optional ringSizeIndex) + : nl_(std::make_unique()), + ringSizeIndex_(ringSizeIndex.value_or(QRNGSZ_IDX_DEFAULT)) { const ParsedBdf board = parseBdf(boardBdf); // Enumerate the driver's devices and find the QDMA function on this board. 
@@ -439,16 +441,16 @@ void QdmaDriverDevice::queueAdd(uint32_t qid) { } } -void QdmaDriverDevice::queueStart(uint32_t qid) { - // Round-robin the queue pair across the function's MM engine channels. - // This has to be carried on `q start`: the driver only reads - // XNL_ATTR_MM_CHANNEL in its start handler (via qdma_queue_config) and - // defaults the queue to channel 0 whenever the attribute is absent. - // mmChannelMax_ is always >= 1, so the modulo is safe. - const uint32_t channel = qid % mmChannelMax_; +void QdmaDriverDevice::queueStart(uint32_t qid, uint32_t channel) { + // The caller chooses the MM engine channel for this queue pair. It has to + // be carried on `q start`: the driver only reads XNL_ATTR_MM_CHANNEL in its + // start handler (via qdma_queue_config) and defaults the queue to channel 0 + // whenever the attribute is absent. mmChannelMax_ is always >= 1, so the + // modulo keeps an out-of-range request inside the device's channel count. + channel %= mmChannelMax_; XnlClient::Response resp = nl_->sendCmd(XNL_CMD_Q_START, index_, {{XNL_ATTR_QIDX, qid}, {XNL_ATTR_NUM_Q, 1}, {XNL_ATTR_QFLAG, QFLAG_MM_BI_START}, - {XNL_ATTR_QRNGSZ_IDX, QRNGSZ_IDX_DEFAULT}, {XNL_ATTR_MM_CHANNEL, channel}}); + {XNL_ATTR_QRNGSZ_IDX, ringSizeIndex_}, {XNL_ATTR_MM_CHANNEL, channel}}); if (resp.present[XNL_ATTR_ERROR] && resp.attrs[XNL_ATTR_ERROR] != 0) { throw std::runtime_error("QDMA q start failed for qid " + std::to_string(qid) + ": " + (resp.genmsg.empty() ? "netlink error" : resp.genmsg)); @@ -481,14 +483,19 @@ std::string QdmaDriverDevice::charDevPath(uint32_t qid) const { QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size, - raw::PageSize pageSize) + raw::PageSize pageSize, int mmChannel) : device_(&device), qid_(qid), physAddr_(physAddr) { try { mapping_ = raw::createHostMapping(size, physAddr, pageSize); + // mmChannel < 0 means auto: spread the queue across channels by qid. 
+ const uint32_t channel = (mmChannel < 0) + ? qid_ + : static_cast(mmChannel); + device_->queueAdd(qid_); queueAdded_ = true; - device_->queueStart(qid_); + device_->queueStart(qid_, channel); queueStarted_ = true; const std::string path = device_->charDevPath(qid_); diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp index 381e025e..7694fd1b 100644 --- a/smi/src/qdma_driver_backend.hpp +++ b/smi/src/qdma_driver_backend.hpp @@ -40,6 +40,7 @@ #include #include +#include #include #include "raw_transfer.hpp" @@ -57,7 +58,8 @@ class QdmaDriverDevice { public: /// @param boardBdf Board-level BDF "DDDD:BB:DD" (function is resolved by /// enumerating the driver's device list). - explicit QdmaDriverDevice(const std::string& boardBdf); + explicit QdmaDriverDevice(const std::string& boardBdf, + std::optional ringSizeIndex = std::nullopt); ~QdmaDriverDevice(); QdmaDriverDevice(const QdmaDriverDevice&) = delete; @@ -73,7 +75,9 @@ class QdmaDriverDevice { /// spreading queues across the device's MM channels (the channel only /// takes effect on `q start`; the driver ignores it on `q add`). void queueAdd(uint32_t qid); - void queueStart(uint32_t qid); + /// Start queue @p qid pinned to MM engine @p channel (0-based, wrapped modulo + /// the device's channel count). + void queueStart(uint32_t qid, uint32_t channel); /// Stop + delete a queue pair. Best-effort; never throws (safe in dtors). void queueStop(uint32_t qid) noexcept; @@ -97,6 +101,7 @@ class QdmaDriverDevice { std::string functionBdf_; ///< Full BDF including function. unsigned qmax_ = 0; ///< Currently provisioned queue count. unsigned mmChannelMax_ = 1; ///< Number of MM engine channels (>= 1). + uint32_t ringSizeIndex_ = 0; ///< QRNGSZ_IDX used when starting queues. }; /// One host buffer bound to a freshly-created upstream QDMA queue pair. @@ -105,8 +110,10 @@ class QdmaDriverDevice { /// testBandwidth() templates.
class QdmaDriverBuffer { public: + /// @param mmChannel Concrete MM channel to pin to, or -1 to spread the + /// queue across channels by qid % channel-count. QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size, - raw::PageSize pageSize); + raw::PageSize pageSize, int mmChannel); QdmaDriverBuffer(const QdmaDriverBuffer&) = delete; QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete; diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index bc06787a..bba4c3e4 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -139,6 +139,19 @@ static int smiMain(int argc, char **argv) { "buffer-size/offsets; the allocation fails with no fallback otherwise.") ->transform(CLI::CheckedTransformer(pageSizeMap, CLI::ignore_case)) ->default_str("4k"); + validateCommand->add_option_function("--mm-channel", + [&validateOptions](const std::string& value) { + try { + validateOptions.mmChannels = Validate::parseMmChannelSpec(value); + } catch (const std::exception& e) { + throw CLI::ValidationError("--mm-channel", e.what()); + } + }, + "AXI-MM/NoC channel per buffer: auto|0|1 applied to all buffers, or a " + "comma-separated list with exactly one entry per buffer position " + "(2 x --threads entries, e.g. -j 1 -> '0,1'); no repeating. " + "auto stripes across channels by qid&1. Default auto.") + ->default_str("auto"); addValidateSizeOption("--buffer-size", &validateOptions.bufferSize, "Size of each validate buffer; accepts bytes or k/K/m/M suffixes (max 512M)") ->default_str("512M"); @@ -177,6 +190,13 @@ static int smiMain(int argc, char **argv) { "In --channel-allocation paired mode, byte distance between the two per-channel " "memory regions (NSU/pseudo-channel stride); accepts k/K/m/M/g/G suffixes") ->default_str("16G"); + validateCommand->add_option_function("--ring-size-index", + [&validateOptions](uint32_t value) { + validateOptions.ringSizeIndex = value; + }, + "Raw-transfer queue descriptor-ring size index (0-15). 
Overrides the backend default.") + ->check(CLI::Range(0u, 15u)) + ->default_str("backend default"); validateCommand->add_option("--bandwidth-iterations", validateOptions.bandwidthIterations, "Raw-transfer bandwidth mode only: repeat each whole-buffer transfer this many times") ->default_val(1)->check(CLI::Range(static_cast(1), diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index a16aaea8..188f505e 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -119,6 +119,46 @@ static uint64_t requiredAlignment(const Validate::Options& options) { : TRANSFER_ALIGNMENT; } +/// Per-buffer AXI-MM channel selection. A single-element list applies to every +/// buffer; otherwise the list has exactly one entry per logical position +/// (validated in validatePlacement) and is indexed directly. +static Validate::Options::MmChannel mmChannelForPosition(const Validate::Options& options, + uint64_t position) { + const auto& list = options.mmChannels; + return list.size() == 1 ? list.front() : list[position]; +} + +/// Map the per-buffer channel selection to the vrtd channel enum. +static vrtd::MmChannel vrtdMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return vrtd::MmChannel::Ch0; + case Validate::Options::MmChannel::Ch1: return vrtd::MmChannel::Ch1; + case Validate::Options::MmChannel::Auto: + default: return vrtd::MmChannel::Auto; + } +} + +/// Map the per-buffer channel selection to the SLASH UAPI channel enum. 
+static slash_qdma_mm_channel slashMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return SLASH_QDMA_MM_CHANNEL_0; + case Validate::Options::MmChannel::Ch1: return SLASH_QDMA_MM_CHANNEL_1; + case Validate::Options::MmChannel::Auto: + default: return SLASH_QDMA_MM_CHANNEL_AUTO; + } +} + +/// Map the per-buffer channel selection to a concrete channel for the +/// off-the-shelf QDMA driver; -1 means auto (queue spreads by qid % channels). +static int qdmaDriverMmChannel(const Validate::Options& options, uint64_t position) { + switch (mmChannelForPosition(options, position)) { + case Validate::Options::MmChannel::Ch0: return 0; + case Validate::Options::MmChannel::Ch1: return 1; + case Validate::Options::MmChannel::Auto: + default: return -1; + } +} + static std::string trim(std::string_view text) { size_t first = 0; while (first < text.size() && @@ -284,6 +324,14 @@ static bool checkMemoryPlacementRangePaired(const char* memoryName, } static bool validatePlacement(const Validate::Options& options) { + const uint64_t positions = 2ULL * options.threads; + if (options.mmChannels.size() != 1 && options.mmChannels.size() != positions) { + std::cerr << "validate: --mm-channel list must have exactly 1 or " << positions + << " entries (one per buffer position = 2 x --threads); got " + << options.mmChannels.size() << std::endl; + return false; + } + if (options.bufferSize == 0 || options.bufferSize > MAX_BUFFER_SIZE) { std::cerr << "validate: --buffer-size must be in the range 1..512M" << std::endl; return false; @@ -317,12 +365,21 @@ static bool validatePlacement(const Validate::Options& options) { " tests (--raw-transfer-test or --use-qdma-driver)" << std::endl; return false; } + if (options.ringSizeIndex.has_value() && + !options.rawTransferTest && !options.useQdmaDriver) { + std::cerr << "validate: --ring-size-index only applies to the raw transfer" + " tests 
(--raw-transfer-test or --use-qdma-driver)" << std::endl; + return false; + } if (options.bandwidthDuration < 0.0) { std::cerr << "validate: --bandwidth-duration must be non-negative" << std::endl; return false; } + if (options.ringSizeIndex.has_value() && *options.ringSizeIndex > 15) { + std::cerr << "validate: --ring-size-index must be in the range 0..15" << std::endl; + return false; + } - const uint64_t positions = 2ULL * options.threads; const auto checkRange = paired ? checkMemoryPlacementRangePaired : checkMemoryPlacementRange; if (!options.ddrOnly && !checkRange("HBM", options, positions)) { return false; @@ -379,6 +436,31 @@ static void printPageSize(const Validate::Options& options) { << std::endl; } +/// Print the raw-transfer queue ring-size override, when one was requested. +static void printRingSizeIndex(const Validate::Options& options) { + if (options.ringSizeIndex.has_value()) { + std::cout << "QDMA ring size index: " << *options.ringSizeIndex << std::endl; + } +} + +/// Print the per-buffer AXI-MM channel selection in effect. +static void printMmChannel(const Validate::Options& options) { + std::cout << "MM channel: "; + for (size_t i = 0; i < options.mmChannels.size(); ++i) { + if (i != 0) { + std::cout << ","; + } + switch (options.mmChannels[i]) { + case Validate::Options::MmChannel::Ch0: std::cout << "0"; break; + case Validate::Options::MmChannel::Ch1: std::cout << "1"; break; + case Validate::Options::MmChannel::Auto: + default: std::cout << "auto"; break; + } + } + std::cout << (options.mmChannels.size() == 1 ? " (all buffers)" : " (per buffer position)") + << std::endl; +} + static bool checkHostMemoryBudget(const Validate::Options& options) { const uint64_t maxConcurrentBuffers = (!options.ddrOnly && !options.hbmOnly) ? 
4ULL * options.threads @@ -542,10 +624,13 @@ class RawQdmaDevice { class RawTransferBuffer { public: RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size, - smi::raw::PageSize pageSize) - : qdma_{qdma}, physAddr_{physAddr}, size_{size}, pageSize_{pageSize} { + smi::raw::PageSize pageSize, slash_qdma_mm_channel mmChannel, + uint32_t ringSizeIndex) + : qdma_{qdma}, physAddr_{physAddr}, size_{size}, pageSize_{pageSize}, + mmChannel_{mmChannel}, ringSizeIndex_{ringSizeIndex} { try { createHostMapping(); + registerBuffer(); createQpair(); } catch (...) { cleanup(); @@ -602,6 +687,10 @@ class RawTransferBuffer { size_ = other.size_; transferStepSize_ = other.transferStepSize_; pageSize_ = other.pageSize_; + mmChannel_ = other.mmChannel_; + ringSizeIndex_ = other.ringSizeIndex_; + bufId_ = other.bufId_; + bufRegistered_ = other.bufRegistered_; other.qdma_ = nullptr; other.fd_ = -1; @@ -612,6 +701,9 @@ class RawTransferBuffer { other.physAddr_ = 0; other.size_ = 0; other.transferStepSize_ = 0; + other.ringSizeIndex_ = QDMA_RING_SZ_IDX; + other.bufId_ = 0; + other.bufRegistered_ = false; } void createHostMapping() { @@ -620,6 +712,13 @@ class RawTransferBuffer { transferStepSize_ = mapping.step; } + void registerBuffer() { + if (slash_qdma_buffer_register(qdma_, data_, size_, &bufId_) != 0) { + throwSystemError("Failed to register raw transfer DMA buffer"); + } + bufRegistered_ = true; + } + void createQpair() { if (qdma_ == nullptr || size_ == 0) { throw std::invalid_argument("Invalid raw transfer buffer arguments"); @@ -629,9 +728,10 @@ class RawTransferBuffer { req.size = sizeof(req); req.mode = QDMA_Q_MODE_MM; req.dir_mask = QDMA_DIR_H2C | QDMA_DIR_C2H; - req.h2c_ring_sz = QDMA_RING_SZ_IDX; - req.c2h_ring_sz = QDMA_RING_SZ_IDX; - req.cmpt_ring_sz = QDMA_RING_SZ_IDX; + req.mm_channel = mmChannel_; + req.h2c_ring_sz = ringSizeIndex_; + req.c2h_ring_sz = ringSizeIndex_; + req.cmpt_ring_sz = ringSizeIndex_; if (slash_qdma_qpair_add(qdma_, &req) != 0) { 
throwSystemError("Failed to add raw transfer QDMA queue pair"); @@ -655,7 +755,16 @@ class RawTransferBuffer { } void transfer(uint64_t offset, uint64_t size, bool toDevice) { - smi::raw::rawTransfer(fd_, data_, physAddr_, offset, size, transferStepSize_, toDevice); + const uint32_t dir = toDevice ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; + ssize_t n = slash_qdma_transfer(qdma_, fd_, bufId_, offset, + physAddr_ + offset, size, dir); + if (n < 0) { + throwSystemError(toDevice ? "Raw QDMA write failed" + : "Raw QDMA read failed"); + } + if (static_cast(n) != size) { + throw std::runtime_error("Raw QDMA transfer moved fewer bytes than requested"); + } } void cleanup() { @@ -663,6 +772,10 @@ class RawTransferBuffer { (void)close(fd_); fd_ = -1; } + if (qdma_ != nullptr && bufRegistered_) { + (void)slash_qdma_buffer_unregister(qdma_, bufId_); + bufRegistered_ = false; + } if (qdma_ != nullptr && qpairStarted_) { (void)slash_qdma_qpair_stop(qdma_, qid_); qpairStarted_ = false; @@ -687,6 +800,10 @@ class RawTransferBuffer { uint64_t size_ = 0; uint64_t transferStepSize_ = 0; smi::raw::PageSize pageSize_ = smi::raw::PageSize::Base4K; + slash_qdma_mm_channel mmChannel_ = SLASH_QDMA_MM_CHANNEL_AUTO; + uint32_t ringSizeIndex_ = QDMA_RING_SZ_IDX; + uint32_t bufId_ = 0; + bool bufRegistered_ = false; }; /// Fill @p buf with a deterministic pattern seeded by @p seed. 
@@ -1061,11 +1178,12 @@ static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device, if (options.placementExplicit) { return device.openRawBuffer(addressFor(HBM_BASE, options, position), options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdPageSize(options)); + vrtdMmChannel(options, position), vrtdPageSize(options)); } return device.openHbmBuffer(static_cast(position), options.bufferSize, - vrtd::BufferAllocDir::Bidirectional, vrtdPageSize(options)); + vrtd::BufferAllocDir::Bidirectional, + vrtdMmChannel(options, position), vrtdPageSize(options)); } static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, @@ -1074,12 +1192,11 @@ static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, if (options.placementExplicit) { return device.openRawBuffer(addressFor(DDR_BASE, options, position), options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdPageSize(options)); + vrtdMmChannel(options, position), vrtdPageSize(options)); } - (void)position; return device.openDdrBuffer(options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdPageSize(options)); + vrtdMmChannel(options, position), vrtdPageSize(options)); } static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { @@ -1095,9 +1212,12 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o std::cout << "Using raw QDMA device " << qdmaPath << "..." << std::endl; printChannelAllocation(options); printPageSize(options); + printMmChannel(options); + printRingSizeIndex(options); printBandwidthRepeatMode(repeat); RawQdmaDevice qdma(qdmaPath); + const uint32_t ringSizeIndex = options.ringSizeIndex.value_or(QDMA_RING_SZ_IDX); if (!options.ddrOnly) { std::cout << "Testing HBM data integrity (" << N << " regions, raw QDMA)..." 
<< std::endl; @@ -1106,7 +1226,8 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, i), ringSizeIndex); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -1126,10 +1247,12 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i), ringSizeIndex); hbmWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i + 1), ringSizeIndex); } testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", raw QDMA", repeat); @@ -1143,7 +1266,8 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, i), ringSizeIndex); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -1163,10 +1287,12 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i), ringSizeIndex); ddrWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, 
rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i + 1), ringSizeIndex); } testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", raw QDMA", repeat); @@ -1179,11 +1305,13 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, i), ringSizeIndex); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, i), ringSizeIndex); } testBandwidthSuite(parBuffers, "HBM+DDR", ", raw QDMA", repeat); @@ -1200,18 +1328,22 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i), ringSizeIndex); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i + 1), ringSizeIndex); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i), ringSizeIndex); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + slashMmChannel(options, 2 * i + 1), ringSizeIndex); } 
testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", raw QDMA", repeat); @@ -1246,8 +1378,10 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." << std::endl; printChannelAllocation(options); printPageSize(options); + printMmChannel(options); + printRingSizeIndex(options); printBandwidthRepeatMode(repeat); - smi::qdma_driver::QdmaDriverDevice qdma(bdf); + smi::qdma_driver::QdmaDriverDevice qdma(bdf, options.ringSizeIndex); std::cout << "Resolved QDMA function " << qdma.functionBdf() << std::endl; qdma.ensureQmax(runParallel ? 4 * N : 2 * N); @@ -1266,7 +1400,8 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, i)); } if (!testDataIntegrity(hbmBuffers, "HBM")) { @@ -1284,10 +1419,12 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i)); hbmWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i + 1)); } testBidirectionalBandwidthSuite(hbmWriteBuffers, hbmReadBuffers, "HBM", ", QDMA driver", repeat); @@ -1301,7 +1438,8 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i), - 
options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, i)); } if (!testDataIntegrity(ddrBuffers, "DDR")) { @@ -1319,10 +1457,12 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i)); ddrWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i + 1)); } testBidirectionalBandwidthSuite(ddrWriteBuffers, ddrReadBuffers, "DDR", ", QDMA driver", repeat); @@ -1335,11 +1475,13 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, i)); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, i)); } testBandwidthSuite(parBuffers, "HBM+DDR", ", QDMA driver", repeat); @@ -1352,18 +1494,22 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i)); parWriteBuffers.emplace_back(qdma, 2 * N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + 
options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i + 1)); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i)); parWriteBuffers.emplace_back(qdma, 3 * N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options)); + options.bufferSize, rawPageSize(options), + qdmaDriverMmChannel(options, 2 * i + 1)); } testBidirectionalBandwidthSuite(parWriteBuffers, parReadBuffers, "HBM+DDR", ", QDMA driver", repeat); @@ -1380,6 +1526,35 @@ uint64_t Validate::parseByteSizeOption(const std::string& text) { return parseByteSizeText(text); } +std::vector Validate::parseMmChannelSpec(const std::string& text) { + std::vector result; + size_t start = 0; + while (true) { + const size_t comma = text.find(',', start); + std::string token = trim(comma == std::string::npos ? 
text.substr(start) + : text.substr(start, comma - start)); + std::transform(token.begin(), token.end(), token.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + if (token == "auto") { + result.push_back(Options::MmChannel::Auto); + } else if (token == "0") { + result.push_back(Options::MmChannel::Ch0); + } else if (token == "1") { + result.push_back(Options::MmChannel::Ch1); + } else { + throw std::invalid_argument("mm-channel entries must be auto, 0, or 1"); + } + if (comma == std::string::npos) { + break; + } + start = comma + 1; + } + if (result.empty()) { + throw std::invalid_argument("mm-channel spec must not be empty"); + } + return result; +} + int Validate::run(const Options& options) { std::string bdf = resolveBoardBdf(options.bdf, "validate"); unsigned N = options.threads; @@ -1415,6 +1590,7 @@ int Validate::run(const Options& options) { auto device = session.getDeviceByBdf(bdf); printPageSize(options); + printMmChannel(options); // -- Step 2: HBM — integrity then bandwidth -- if (!options.ddrOnly) { diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index 4c9f7db1..46c6f807 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -30,7 +30,9 @@ /// the default VRTD buffer path. #include +#include #include +#include /// @brief Static entry-point for the validate command. /// @@ -66,6 +68,17 @@ class Validate { Huge2M, ///< 2 MiB hugepages. }; + /// @brief Per-queue AXI-MM/NoC channel selection for a buffer. + /// + /// Auto lets the driver stripe by qid&1; Ch0/Ch1 pin the queue to a + /// single AXI-MM channel (and hence NoC channel). Applies to the VRTD, + /// raw SLASH, and off-the-shelf QDMA-driver backends. + enum class MmChannel { + Auto, ///< Driver stripes by qid&1 (default). + Ch0, ///< Pin to AXI-MM/NoC channel 0. + Ch1, ///< Pin to AXI-MM/NoC channel 1. + }; + std::string bdf; ///< BDF (Bus:Device.Function) address of the target device. unsigned threads = 8; ///< Number of parallel buffers/threads (1-64). 
bool noReset = false; ///< Skip the device reset step before running memory tests. @@ -74,6 +87,9 @@ class Validate { bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. PageSize pageSize = PageSize::Base4K; ///< Host staging-buffer page granule (4 KiB or 2 MiB). + /// Per-buffer AXI-MM channel selection, indexed by buffer position + /// modulo size (a single entry applies to every buffer). Default auto. + std::vector mmChannels{MmChannel::Auto}; uint64_t bufferSize = 512ULL * 1024ULL * 1024ULL; ///< Size of each test buffer. uint64_t offset = 512ULL * 1024ULL * 1024ULL; ///< Distance between logical buffer positions. uint64_t startingOffset = 0; ///< Offset from memory-space base for position 0. @@ -88,6 +104,8 @@ class Validate { uint64_t bandwidthIterations = 1; /// Raw bandwidth phase duration in seconds. 0 means use fixed iterations. double bandwidthDuration = 0.0; + /// Optional descriptor-ring size index for raw QDMA queue creation. + std::optional ringSizeIndex; }; /// @brief Executes the validate command. @@ -97,6 +115,9 @@ class Validate { /// @brief Parse a byte-size option accepting bare values and k/K/m/M suffixes. static uint64_t parseByteSizeOption(const std::string& text); + + /// @brief Parse an --mm-channel spec: a single auto|0|1 or a comma-separated list. 
+ static std::vector parseMmChannelSpec(const std::string& text); }; #endif // SMI_VALIDATE_HPP From a037eb1f2be7434403959840495b16a04f5a4cb6 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:06:22 +0100 Subject: [PATCH 16/23] docs: document validate --ring-size-index option Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/smi/commands.rst | 30 +++++++++++++++++++++++++++++- smi/README.md | 7 ++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index 63ce3064..a9af357e 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -160,7 +160,7 @@ phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. code-block:: text - v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--page-size <4k|2m>] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--page-size <4k|2m>] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] Requirements by mode: @@ -231,6 +231,26 @@ host memory. With ``--page-size 2m`` that footprint is checked against the free 2 MiB-aligned, otherwise ``validate`` fails early. Reserve hugepages with, e.g., ``echo | sudo tee /proc/sys/vm/nr_hugepages``. +.. option:: --mm-channel + + AXI-MM / NoC channel selection for each buffer's QDMA queue pair, in every + mode. 
``spec`` is either a single value applied to all buffers, or a + comma-separated list giving one channel per logical buffer position + (exactly ``2 x --threads`` entries; there is no repeating/wrap, and any + other length is an error): + + * ``auto`` (the default) lets the driver stripe queues across both channels + by ``qid & 1``. + * ``0`` / ``1`` pin the queue to that AXI-MM channel (and hence NoC channel). + * e.g. with ``-j 1`` the list ``0,1`` puts buffer position 0 on channel 0 and + position 1 on channel 1. Bidirectional phases use positions ``0..2N-1``; + single-direction phases use the first ``N`` entries. + + This is independent of ``--channel-allocation`` (which controls the device + address): ``--mm-channel`` controls the host-side NoC ingress (NMU) per + queue. With ``--use-qdma-driver`` the selection maps to the stock driver's + per-queue MM-channel attribute. + .. option:: --raw-transfer-test Use libslash raw QDMA transfers instead of VRTD buffers. This mode implies @@ -284,6 +304,14 @@ host memory. With ``--page-size 2m`` that footprint is checked against the free ``offset_ch1 - offset_ch0`` spacing). Must be a non-zero multiple of 4 KiB. Accepts bare bytes or ``k``/``K``, ``m``/``M``, ``g``/``G`` suffixes. +.. option:: --ring-size-index <0-15> + + Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). + Override the QDMA descriptor-ring size index used when creating SLASH raw + queue pairs or starting stock-driver queues. When omitted, each backend keeps + its existing default. Useful A/B values for 4 KiB descriptor throughput are + ``0``, ``11``, ``13``, and ``15``. + .. option:: --bandwidth-iterations Raw-transfer-only (``--raw-transfer-test`` or ``--use-qdma-driver``). Repeat diff --git a/smi/README.md b/smi/README.md index facf992b..7f3dae56 100644 --- a/smi/README.md +++ b/smi/README.md @@ -183,7 +183,7 @@ bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer path for data movement. 
``` -v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--bandwidth-iterations ] [--bandwidth-duration ] +v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] ``` | Flag | Description | @@ -192,6 +192,7 @@ v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--buffer-si | `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 * threads` logical positions in each enabled memory space. | | `-R,--no-reset` | Skip the device reset step before running memory tests | | `--page-size` | Host staging-buffer page granule for all backends: `4k` (default; 4 KiB base pages) or `2m` (2 MiB hugepages). No fallback: `2m` needs reserved 2 MiB hugepages and 2 MiB-aligned `--buffer-size`/`--offset`/`--starting-offset` (and `--channel-region-stride` in paired mode). | +| `--mm-channel` | AXI-MM/NoC channel per buffer queue: `auto` (default; driver stripes by `qid&1`), `0`, or `1`, or a comma-separated list with exactly one entry per buffer position (`2 x --threads` entries, e.g. `-j 1` -> `0,1`); the list is never repeated to fill missing positions, and a list of any other length is an error. Independent of `--channel-allocation`; also honored by `--use-qdma-driver`.
| | `--buffer-size` | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) | | `--offset` | Distance between logical buffer positions (default `512M`) | | `--starting-offset` | Offset from each memory-space base for logical position 0 (default `0`) | @@ -201,6 +202,7 @@ v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--buffer-si | `--hbm-only` | Run only HBM memory tests (skip DDR); mutually exclusive with `--ddr-only` | | `--channel-allocation` | Raw-transfer-only placement: `auto` (default; mm-channel `qid&1`, linear addressing) or `paired` (couple mm-channel to a distinct memory region/NSU: even positions -> region 0/channel 0, odd -> region 1/channel 1). `paired` mirrors dma-perf `offset_ch0`/`offset_ch1` so both NoC NMUs drive independent memory endpoints. | | `--channel-region-stride` | In `--channel-allocation paired`, byte distance between the two per-channel regions (NSU stride). Default `16G` (half the per-memory space); accepts `k`/`K`/`m`/`M`/`g`/`G`. | +| `--ring-size-index` | Raw-transfer-only descriptor-ring size index, `0`-`15`. Overrides the backend default when creating SLASH raw qpairs or starting stock-driver queues. | | `--bandwidth-iterations` | Raw-transfer-only sustained bandwidth mode: repeat each whole-buffer transfer this many times in each bandwidth phase (default `1`). | | `--bandwidth-duration` | Raw-transfer-only duration mode: repeat whole-buffer transfers until this many seconds have elapsed; `0` disables duration mode and uses `--bandwidth-iterations`. | @@ -236,6 +238,9 @@ placement or page size. `--bandwidth-iterations` repeats each whole-buffer transfer a fixed number of times, while `--bandwidth-duration` runs each bandwidth phase for a wall-clock duration and counts completed whole-buffer transfers. Integrity checks remain one-shot. 
+`--ring-size-index` can override the QDMA descriptor-ring size index for these +raw modes; useful A/B values for 4 KiB descriptor throughput are `0`, `11`, +`13`, and `15`. With `--use-qdma-driver`, the command runs the same raw test over the off-the-shelf Xilinx QDMA driver (`submodules/qdma_drv`) instead of SLASH. From 8a74b03d9ac86d186eb8e118963947bbcc0a0601 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 14:06:44 +0100 Subject: [PATCH 17/23] vrtd: plumb mm-channel selection through buffer open Signed-off-by: Vlad-Gabriel Serbu --- driver/README.md | 33 +++++++++++---------- vrt/vrtd/include/vrtd/wire.h | 2 ++ vrt/vrtd/libvrtd/include/vrtd/vrtd.h | 16 ++++++++++ vrt/vrtd/libvrtd/src/requests.c | 4 +++ vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp | 12 ++++++++ vrt/vrtd/libvrtdpp/include/vrtd/device.hpp | 21 ++++++++----- vrt/vrtd/libvrtdpp/include/vrtd/session.hpp | 4 +++ vrt/vrtd/libvrtdpp/src/device.cpp | 10 ++++--- vrt/vrtd/libvrtdpp/src/session.cpp | 20 ++++++++----- vrt/vrtd/src/buffer.c | 9 ++++-- vrt/vrtd/src/buffer.h | 6 +++- vrt/vrtd/src/serve.c | 16 +++++++++- vrt/vrtd/tests/buffer_test.cpp | 22 +++++++------- vrt/vrtd/tests/device_test.cpp | 2 +- 14 files changed, 126 insertions(+), 51 deletions(-) diff --git a/driver/README.md b/driver/README.md index c04776d7..7576dafb 100644 --- a/driver/README.md +++ b/driver/README.md @@ -9,35 +9,36 @@ Exposed under `/sys/module/slash/parameters/` (all writable at runtime; see |-----------|------|---------|-------------| | `qdma_num_threads` | uint | 8 | Number of libqdma worker threads. | | `qdma_debugfs_path` | charp | disabled | debugfs mount path for libqdma. | -| `qdma_force_mm_channel` | int | -1 | Force the QDMA AXI-MM / NoC channel for newly-added queues: `<0` = auto (stripe by `qid & 1`), `0` or `1` = pin every new queue to that channel. 
| ### A/B testing NoC channel bandwidth -`qdma_force_mm_channel` is read when each queue pair is added, so it can be -changed between test runs to check whether both PCIe NMUs (NoC channels) -actually contribute bandwidth. Each value pins all new queues to one NoC -channel; the default (`-1`) splits them across both: +The AXI-MM / NoC channel is chosen per queue pair when it is added (the +`mm_channel` field of the qpair-add ioctl, `enum slash_qdma_mm_channel`): +`auto` stripes queues across both channels by `qid & 1`, while `0` / `1` pin a +queue to a single channel. Every queue creator carries this setting, so it can +be driven per buffer to check whether both PCIe NMUs (NoC channels) actually +contribute bandwidth. With `v80-smi validate`: ```sh # All queues on NoC channel 0 (NMU S00) -echo 0 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel -sudo v80-smi validate -d --raw-transfer-test --no-reset +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 0 # All queues on NoC channel 1 (NMU S01) -echo 1 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel -sudo v80-smi validate -d --raw-transfer-test --no-reset +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 1 -# Default: split across both channels (qid & 1) -echo -1 | sudo tee /sys/module/slash/parameters/qdma_force_mm_channel -sudo v80-smi validate -d --raw-transfer-test --no-reset +# Split across both channels (qid & 1) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel auto + +# Explicit per-buffer split (even positions -> channel 0, odd -> channel 1) +sudo v80-smi validate -d --raw-transfer-test --no-reset --mm-channel 0,1 ``` Debug builds with `SLASH_QDMA_OP_DEBUG=1` log each queue's selected `mm_channel` when it is added. If the split run is no faster than a single -forced channel, traffic is not being spread across both NMUs. 
The parameter -affects every queue created through this driver (both the VRTD buffer path and -`--raw-transfer-test`), but not the off-the-shelf Xilinx QDMA driver path -(`--use-qdma-driver`). +forced channel, traffic is not being spread across both NMUs. The per-queue +setting affects every queue created through this driver (both the VRTD buffer +path and `--raw-transfer-test`); the off-the-shelf Xilinx QDMA driver path +(`--use-qdma-driver`) honors `--mm-channel` through its own channel attribute. ## Testing diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h index 4749afd4..c464ce81 100644 --- a/vrt/vrtd/include/vrtd/wire.h +++ b/vrt/vrtd/include/vrtd/wire.h @@ -299,6 +299,7 @@ struct vrtd_req_buffer_open { uint32_t dev_number; ///< Device index (0-based). uint32_t alloc_type; ///< One of enum vrtd_alloc_type. uint32_t alloc_dir; ///< One of enum vrtd_alloc_dir. + uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel). uint64_t alloc_arg; ///< Allocation argument (HBM region index for HBM). uint64_t size; ///< Requested size in bytes. } __attribute__((packed)); @@ -333,6 +334,7 @@ struct vrtd_resp_buffer_close { struct vrtd_req_buffer_open_raw { uint32_t dev_number; ///< Device index (0-based). uint32_t alloc_dir; ///< One of enum vrtd_alloc_dir. + uint32_t mm_channel; ///< AXI-MM/NoC channel selection (enum vrtd_mm_channel). uint64_t phys_addr; ///< Caller-specified device physical address (bypasses allocator). uint64_t size; ///< Size in bytes. } __attribute__((packed)); diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index 53467bd3..09c34a35 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -67,6 +67,18 @@ enum vrtd_host_page_mode { VRTD_HOST_PAGE_2M = 1, ///< 2 MiB hugetlb pages; allocation fails if they cannot be mapped. }; +/** + * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. 
+ * + * Sent to the daemon, which forwards it to the SLASH driver's qpair-add ioctl + * (the values mirror enum slash_qdma_mm_channel). + */ +enum vrtd_mm_channel { + VRTD_MM_CHANNEL_AUTO = 0, ///< Stripe across channels by (qid & 1). + VRTD_MM_CHANNEL_0 = 1, ///< Pin to AXI-MM/NoC channel 0. + VRTD_MM_CHANNEL_1 = 2, ///< Pin to AXI-MM/NoC channel 1. +}; + /** * @brief Connect to the vrtd UNIX domain socket. @@ -338,6 +350,7 @@ enum vrtd_ret vrtd_qdma_qpair_get_fd( * @param alloc_dir QDMA direction (one of enum vrtd_alloc_dir). * @param alloc_arg Allocation argument (HBM region index for HBM). * @param size_in Requested size in bytes. + * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output pointer to receive the allocated buffer handle. * @@ -352,6 +365,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_mm_channel mm_channel, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); @@ -367,6 +381,7 @@ enum vrtd_ret vrtd_buffer_open( * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. * @param alloc_dir One of #vrtd_alloc_dir. + * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output parameter set to the new buffer handle on success. 
* @@ -380,6 +395,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_mm_channel mm_channel, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index 5f6b9357..cad284b5 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -468,6 +468,7 @@ enum vrtd_ret vrtd_buffer_open( uint32_t alloc_dir, uint64_t alloc_arg, uint64_t size_in, + enum vrtd_mm_channel mm_channel, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) @@ -481,6 +482,7 @@ enum vrtd_ret vrtd_buffer_open( .dev_number = dev, .alloc_type = alloc_type, .alloc_dir = alloc_dir, + .mm_channel = mm_channel, .alloc_arg = alloc_arg, .size = size_in, }; @@ -525,6 +527,7 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t phys_addr, uint64_t size, uint32_t alloc_dir, + enum vrtd_mm_channel mm_channel, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) @@ -537,6 +540,7 @@ enum vrtd_ret vrtd_buffer_open_raw( struct vrtd_req_buffer_open_raw req = { .dev_number = dev, .alloc_dir = alloc_dir, + .mm_channel = mm_channel, .phys_addr = phys_addr, .size = size, }; diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp index 254eb660..e441a3fd 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp @@ -59,6 +59,18 @@ enum class HostPageSize : uint32_t { Huge2M = 1, ///< 2 MiB hugetlb pages; allocation fails if unavailable. }; +/** + * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. + * + * Mirrors @c vrtd_mm_channel (values must stay in sync). @c Auto stripes across + * channels by (qid & 1); @c Ch0 / @c Ch1 pin to a single channel. + */ +enum class MmChannel : uint32_t { + Auto = 0, ///< Stripe across channels by (qid & 1). + Ch0 = 1, ///< Pin to AXI-MM/NoC channel 0. 
+ Ch1 = 2, ///< Pin to AXI-MM/NoC channel 1. +}; + /** * @brief RAII wrapper for a vrtd buffer allocation. * diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp index 7695d975..5af4fef2 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp @@ -158,6 +158,7 @@ class Device { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. @@ -166,14 +167,16 @@ class Device { uint64_t size, uint64_t allocArg = 0, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto, HostPageSize page = HostPageSize::Base4K) const; /** * @brief Convenience helper for DDR allocations. */ Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto, HostPageSize page = HostPageSize::Base4K) const { - return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, page); + return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, mmChannel, page); } /** @@ -182,8 +185,9 @@ class Device { Buffer openHbmBuffer(uint32_t region, uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto, HostPageSize page = HostPageSize::Base4K) const { - return openBuffer(BufferAllocType::Hbm, size, region, allocDir, page); + return openBuffer(BufferAllocType::Hbm, size, region, allocDir, mmChannel, page); } /** @@ -191,8 +195,9 @@ class Device { */ Buffer openHbmVnocBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto, HostPageSize page = HostPageSize::Base4K) const { - return 
openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, page); + return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, mmChannel, page); } /** @@ -204,6 +209,7 @@ class Device { * @param phys_addr Device physical address. * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. @@ -211,6 +217,7 @@ class Device { Buffer openRawBuffer(uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, + MmChannel mmChannel = MmChannel::Auto, HostPageSize page = HostPageSize::Base4K) const; /** @@ -358,8 +365,8 @@ class Device { uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -377,8 +384,8 @@ class Device { std::function fGetBar; std::function fCreateQdmaQpair; - std::function fOpenBuffer; - std::function fOpenBufferRaw; + std::function fOpenBuffer; + std::function fOpenBufferRaw; std::function fHotplugOp; std::function fDesignWrite; std::function fDesignWriteFile; diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp index 0bc86acf..32a7ae88 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp @@ -190,6 +190,7 @@ class Session { * @param size Requested size in bytes. * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection for the queue pair. * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. 
@@ -200,6 +201,7 @@ class Session { uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize pageSize ) const; @@ -210,6 +212,7 @@ class Session { * @param phys_addr Caller-specified device physical address (bypasses allocator). * @param size Size in bytes. * @param allocDir QDMA transfer direction. + * @param mmChannel AXI-MM/NoC channel selection for the queue pair. * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. @@ -219,6 +222,7 @@ class Session { uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize pageSize ) const; diff --git a/vrt/vrtd/libvrtdpp/src/device.cpp b/vrt/vrtd/libvrtdpp/src/device.cpp index c4ae8d7a..37a09120 100644 --- a/vrt/vrtd/libvrtdpp/src/device.cpp +++ b/vrt/vrtd/libvrtdpp/src/device.cpp @@ -31,8 +31,8 @@ Device::Device(uint32_t num, uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -98,15 +98,17 @@ Buffer Device::openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize page) const { - return fOpenBuffer(*this, allocType, size, allocArg, allocDir, page); + return fOpenBuffer(*this, allocType, size, allocArg, allocDir, mmChannel, page); } Buffer Device::openRawBuffer(uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize page) const { - return fOpenBufferRaw(*this, phys_addr, size, allocDir, page); + return fOpenBufferRaw(*this, phys_addr, size, allocDir, mmChannel, page); } void Device::hotplugOp(HotplugOp op, uint8_t function) const { diff --git a/vrt/vrtd/libvrtdpp/src/session.cpp b/vrt/vrtd/libvrtdpp/src/session.cpp index 
8fcf42b6..6799ee6e 100644 --- a/vrt/vrtd/libvrtdpp/src/session.cpp +++ b/vrt/vrtd/libvrtdpp/src/session.cpp @@ -132,11 +132,11 @@ Device Session::getDevice(size_t i) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, HostPageSize page) { - return openBuffer(device, type, size, arg, dir, page); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm, HostPageSize page) { + return openBuffer(device, type, size, arg, dir, mm, page); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, HostPageSize page) { - return openBufferRaw(device, phys_addr, size, dir, page); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm, HostPageSize page) { + return openBufferRaw(device, phys_addr, size, dir, mm, page); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -197,11 +197,11 @@ Device Session::getDeviceByBdf(std::string_view bdf) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, HostPageSize page) { - return openBuffer(device, type, size, arg, dir, page); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm, HostPageSize page) { + return openBuffer(device, type, size, arg, dir, mm, page); }, - [&](const Device& device, uint64_t phys_addr, 
uint64_t size, BufferAllocDir dir, HostPageSize page) { - return openBufferRaw(device, phys_addr, size, dir, page); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm, HostPageSize page) { + return openBufferRaw(device, phys_addr, size, dir, mm, page); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -290,6 +290,7 @@ Buffer Session::openBuffer( uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize pageSize ) const { if (isClosed()) { @@ -305,6 +306,7 @@ Buffer Session::openBuffer( static_cast(allocDir), allocArg, size, + static_cast(static_cast(mmChannel)), static_cast(static_cast(pageSize)), &raw ); @@ -324,6 +326,7 @@ Buffer Session::openBufferRaw( uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, + MmChannel mmChannel, HostPageSize pageSize ) const { if (isClosed()) { @@ -338,6 +341,7 @@ Buffer Session::openBufferRaw( phys_addr, size, static_cast(allocDir), + static_cast(static_cast(mmChannel)), static_cast(static_cast(pageSize)), &raw ); diff --git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c index 5a30076e..9bfd6c52 100644 --- a/vrt/vrtd/src/buffer.c +++ b/vrt/vrtd/src/buffer.c @@ -81,6 +81,7 @@ static int buffer_init(struct buffer *buf, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params) { if (buf == NULL) { @@ -184,6 +185,7 @@ static int buffer_init(struct buffer *buf, qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; } qpair.dir_mask = dir_mask; + qpair.mm_channel = mm_channel; qpair.size = sizeof(qpair); if (slash_qdma_qpair_add(qdma, &qpair) != 0) { @@ -235,6 +237,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params) { 
struct buffer *buf = calloc(1, sizeof(*buf)); @@ -243,7 +246,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, return NULL; } - if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, qpair_params) != 0) { + if (buffer_init(buf, qdma, map, alloc_type, alloc_dir, size, alloc_arg, client_id, mm_channel, qpair_params) != 0) { LOG(LOG_ERR, "Failed to initialize buffer: %m"); return NULL; } @@ -263,7 +266,8 @@ struct buffer *buffer_create(struct slash_qdma *qdma, struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, - enum vrtd_alloc_dir alloc_dir) + enum vrtd_alloc_dir alloc_dir, + uint32_t mm_channel) { if (qdma == NULL || size == 0) { errno = EINVAL; @@ -314,6 +318,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; qpair.dir_mask = dir_mask; + qpair.mm_channel = mm_channel; qpair.size = sizeof(qpair); if (slash_qdma_qpair_add(qdma, &qpair) != 0) { diff --git a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h index 6834222b..5a49e733 100644 --- a/vrt/vrtd/src/buffer.h +++ b/vrt/vrtd/src/buffer.h @@ -96,6 +96,7 @@ struct buffer { * @param size Requested buffer size in bytes (may be rounded up). * @param alloc_arg Type-specific argument (HBM region index for non-VNOC HBM). * @param client_id Connection ID of the owning client. + * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). * @param qpair_params QDMA queue pair configuration parameters. * @return Heap-allocated buffer on success, NULL on failure. */ @@ -106,6 +107,7 @@ struct buffer *buffer_create(struct slash_qdma *qdma, uint64_t size, uint64_t alloc_arg, uint64_t client_id, + uint32_t mm_channel, const struct slash_qdma_qpair_add *qpair_params); /** @@ -120,12 +122,14 @@ struct buffer *buffer_create(struct slash_qdma *qdma, * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. 
* @param alloc_dir DMA transfer direction. + * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). * @return Heap-allocated buffer on success, NULL on failure (errno set). */ struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, - enum vrtd_alloc_dir alloc_dir); + enum vrtd_alloc_dir alloc_dir, + uint32_t mm_channel); /** * @brief Release all resources owned by a buffer. diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c index c11c4d32..299c72ff 100644 --- a/vrt/vrtd/src/serve.c +++ b/vrt/vrtd/src/serve.c @@ -1966,6 +1966,12 @@ static uint16_t client_handle_request_buffer_open( return VRTD_RET_INVALID_ARGUMENT; } + if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) { + LOG(LOG_WARNING, "Received buffer open request with invalid mm_channel %u", + (unsigned int)req_body->mm_channel); + return VRTD_RET_INVALID_ARGUMENT; + } + struct device *d = client->state->devices.d[req_body->dev_number]; if (d == NULL || d->qdma == NULL || d->memory_map == NULL) { LOG(LOG_WARNING, "Received buffer open request for non-existent or non-functional device"); @@ -1992,6 +1998,7 @@ static uint16_t client_handle_request_buffer_open( req_body->size, req_body->alloc_arg, client_id, + req_body->mm_channel, NULL ); if (buf == NULL) { @@ -2104,6 +2111,12 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_INVALID_ARGUMENT; } + if (req_body->mm_channel > SLASH_QDMA_MM_CHANNEL_1) { + LOG(LOG_WARNING, "Received raw buffer open request with invalid mm_channel %u", + (unsigned int)req_body->mm_channel); + return VRTD_RET_INVALID_ARGUMENT; + } + struct device *d = client->state->devices.d[req_body->dev_number]; if (d == NULL || d->qdma == NULL) { LOG(LOG_WARNING, "Received raw buffer open request for non-existent or non-functional device"); @@ -2115,7 +2128,8 @@ static uint16_t client_handle_request_buffer_open_raw( d->qdma, req_body->phys_addr, req_body->size, - (enum vrtd_alloc_dir) req_body->alloc_dir + 
(enum vrtd_alloc_dir) req_body->alloc_dir, + req_body->mm_channel ); if (buf == NULL) { if (errno == EINVAL) { diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp index 078f5819..a3c96b70 100644 --- a/vrt/vrtd/tests/buffer_test.cpp +++ b/vrt/vrtd/tests/buffer_test.cpp @@ -43,7 +43,7 @@ TEST(BufferNullTest, NullQdma) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(nullptr, map, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); } @@ -53,7 +53,7 @@ TEST(BufferNullTest, NullMap) { ASSERT_NE(qdma, nullptr); struct buffer *buf = buffer_create(qdma, nullptr, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); slash_qdma_close(qdma); } @@ -65,7 +65,7 @@ TEST(BufferNullTest, ZeroSize) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - 0, 0, CLIENT_ID, nullptr); + 0, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); @@ -78,7 +78,7 @@ TEST(BufferNullTest, ZeroClientId) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_HOST_TO_DEVICE, - XFER_SIZE, 0, 0, nullptr); + XFER_SIZE, 0, 0, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); @@ -91,7 +91,7 @@ TEST(BufferNullTest, InvalidDirection) { ASSERT_NE(map, nullptr); struct buffer *buf = buffer_create(qdma, map, ALLOCATION_TYPE_DDR, static_cast(99), - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); EXPECT_EQ(buf, nullptr); device_memory_map_cleanup(map); slash_qdma_close(qdma); 
@@ -103,7 +103,7 @@ TEST(BufferNullTest, CleanupNull) { TEST(BufferNullTest, RawNullQdma) { struct buffer *buf = buffer_create_raw(nullptr, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); } @@ -112,7 +112,7 @@ TEST(BufferNullTest, RawZeroSize) { struct slash_qdma *qdma = slash_qdma_open("@mock"); ASSERT_NE(qdma, nullptr); struct buffer *buf = buffer_create_raw(qdma, DDR_START_ADDRESS, 0, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); slash_qdma_close(qdma); @@ -154,7 +154,7 @@ class BufferTest : public ::testing::TestWithParam { TEST_P(BufferTest, LifecycleBidirectional) { struct buffer *buf = buffer_create(qdma_, map_, ALLOCATION_TYPE_DDR, VRTD_ALLOC_DIR_BIDIRECTIONAL, - XFER_SIZE, 0, CLIENT_ID, nullptr); + XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); ASSERT_NE(buf, nullptr); EXPECT_GE(buf->fd, 0); @@ -175,7 +175,7 @@ TEST_P(BufferTest, LifecycleBidirectional) { TEST_P(BufferTest, RawCreateAndIO) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_BIDIRECTIONAL); + VRTD_ALLOC_DIR_BIDIRECTIONAL, SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); EXPECT_GE(buf->fd, 0); EXPECT_EQ(buf->addr, DDR_START_ADDRESS); @@ -208,14 +208,14 @@ TEST_P(BufferTest, QueueExhaustion) { for (int i = 0; i < MAX_QUEUES; ++i) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS + i * XFER_SIZE, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE); + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr) << "Expected success for queue " << i; bufs.push_back(buf); } /* 65th allocation must fail */ struct buffer *overflow = buffer_create_raw(qdma_, DDR_START_ADDRESS, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE); + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, 
SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(overflow, nullptr); EXPECT_EQ(errno, ENOSPC); diff --git a/vrt/vrtd/tests/device_test.cpp b/vrt/vrtd/tests/device_test.cpp index 36518c10..93dda772 100644 --- a/vrt/vrtd/tests/device_test.cpp +++ b/vrt/vrtd/tests/device_test.cpp @@ -149,7 +149,7 @@ TEST(DeviceCleanupTest, CleanupWithBuffers) { /* Allocate a raw buffer on the mock QDMA and hand ownership to d->buffers. */ struct buffer *buf = buffer_create_raw(d->qdma, DDR_START_ADDRESS, 4096, - VRTD_ALLOC_DIR_HOST_TO_DEVICE); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); int ret = buffer_ptr_array_push_move(&d->buffers, &buf); From c36c43cd5956e56fa886094df8b357527e50c3cf Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 15:44:54 +0100 Subject: [PATCH 18/23] driver+libslash: added transfer performance hint Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 17 +++++++++++-- driver/libslash/README.md | 5 +++- driver/libslash/include/slash/qdma.h | 7 ++++-- .../include/slash/uapi/slash_interface.h | 18 ++++++++++++- driver/libslash/src/qdma.c | 9 +++++-- driver/libslash/src/qdma_mock.c | 6 ++++- driver/libslash/src/qdma_mock.h | 3 ++- driver/libslash/tests/qdma_test.cpp | 12 ++++++--- driver/slash_qdma.c | 1 + driver/tests/test_slash_qdma.c | 25 +++++++++++++------ smi/src/validate.cpp | 2 +- 11 files changed, 83 insertions(+), 22 deletions(-) diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index d97eda1c..11fb9a0c 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -739,11 +739,22 @@ closed (including on process exit). 
__u64 user_addr; /* [in] Page-aligned host buffer base */ __u64 length; /* [in] Buffer length in bytes (page multiple) */ __u32 buf_id; /* [out] Kernel-assigned buffer handle */ - __u32 pad0; /* padding */ + __u32 transfer_hint; /* [out] enum slash_qdma_transfer_hint */ }; **Direction:** ``_IOWR`` — userspace writes ``flags``, ``user_addr``, ``length``; the kernel writes -back ``buf_id``. +back ``buf_id`` and ``transfer_hint``. + +``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be +best for this registered buffer on the current hardware. Current SLASH hardware returns +``SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR``; userspace may ignore this value. Known values are: + +.. code-block:: c + + enum slash_qdma_transfer_hint { + SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, + SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR = 2, + }; **Preconditions:** @@ -755,6 +766,8 @@ back ``buf_id``. **Postconditions:** - ``buf_id`` is filled with the client-scoped handle, used in ``SLASH_QDMA_QPAIR_IOCTL_TRANSFER``. +- ``transfer_hint`` is filled with an advisory transfer topology hint. Current SLASH hardware + returns ``SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR``. - The pages remain pinned and DMA-mapped until the buffer is unregistered or the owning control fd is closed. diff --git a/driver/libslash/README.md b/driver/libslash/README.md index e9915dd4..af2edec5 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -125,7 +125,10 @@ and DMA-mapping it) and then move data by handle, avoiding per-transfer pinning: ```c /* buf must be page-aligned and a whole number of pages */ uint32_t buf_id; -slash_qdma_buffer_register(qdma, buf, len, &buf_id); +enum slash_qdma_transfer_hint hint; +slash_qdma_buffer_register(qdma, buf, len, &buf_id, &hint); +/* Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR. + * Pass NULL instead of &hint if the application does not care. 
*/ int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index db9b95ad..24f463b8 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -165,15 +165,18 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); * @param qdma Open QDMA handle. * @param addr Page-aligned host buffer base. * @param length Buffer length in bytes (non-zero multiple of the page size). - * @param buf_id [out] Receives the kernel-assigned buffer handle. + * @param buf_id [out] Receives the kernel-assigned buffer handle. + * @param transfer_hint [out] Optional transfer-topology hint; pass NULL to ignore. * * The buffer is owned by @qdma and is automatically released when the * handle is closed. Pass the returned @buf_id to slash_qdma_transfer(). + * Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR. * * @return 0 on success, -1 on failure (errno set). */ int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id); + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint); /** * @brief Unregister a buffer previously registered with diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index a731610d..cc221bb0 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -250,6 +250,19 @@ enum slash_qdma_transfer_dir { SLASH_QDMA_XFER_C2H = 2, /**< Card-to-Host (read from device). */ }; +/** + * @brief Advisory transfer topology for a registered QDMA buffer. + * + * The kernel returns this hint when a buffer is registered so userspace can + * choose a suitable transfer strategy without hard-coding hardware-specific + * scheduling policy. 
The hint is advisory: transfers are still valid with any + * queue pair whose direction and ownership checks pass. + */ +enum slash_qdma_transfer_hint { + SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, /**< Prefer a single qpair. */ + SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR = 2, /**< Prefer two qpairs. */ +}; + /** * @brief Register a host buffer for DMA, pinning its pages once. * @@ -266,6 +279,9 @@ enum slash_qdma_transfer_dir { * Buffers are owned by the control-fd open instance they are registered * through, and are automatically unregistered when that fd is closed * (including on process exit) if userspace forgets to unregister them. + * + * The kernel also returns @transfer_hint. Current SLASH hardware returns + * SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; userspace may ignore this field. */ struct slash_qdma_buf_register { __u32 size; /**< Struct size for ABI versioning. */ @@ -277,7 +293,7 @@ struct slash_qdma_buf_register { /* Kernel to userspace */ __u32 buf_id; /**< [out] Kernel-assigned buffer handle. */ - __u32 pad0; /**< Padding for natural alignment. */ + __u32 transfer_hint; /**< [out] enum slash_qdma_transfer_hint. 
*/ }; /** diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index a95ec114..34791a63 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -250,7 +250,8 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags) } int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id) + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint) { struct slash_qdma_buf_register req; int ret; @@ -261,7 +262,8 @@ int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, } if (qdma->priv) { - return slash_qdma_mock_buffer_register(qdma, addr, length, buf_id); + return slash_qdma_mock_buffer_register(qdma, addr, length, buf_id, + transfer_hint); } memset(&req, 0, sizeof(req)); @@ -275,6 +277,9 @@ int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, } *buf_id = req.buf_id; + if (transfer_hint != NULL) { + *transfer_hint = req.transfer_hint; + } return 0; } diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c index 3194f43f..7cf32616 100644 --- a/driver/libslash/src/qdma_mock.c +++ b/driver/libslash/src/qdma_mock.c @@ -267,7 +267,8 @@ int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flag } int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id) + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint) { struct slash_qdma_mock *ctx; size_t i; @@ -295,6 +296,9 @@ int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, ctx->bufs[i].length = length; *buf_id = (uint32_t) i; + if (transfer_hint != NULL) { + *transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; + } return 0; } diff --git a/driver/libslash/src/qdma_mock.h b/driver/libslash/src/qdma_mock.h index 34f591ed..d30a7294 100644 --- a/driver/libslash/src/qdma_mock.h +++ b/driver/libslash/src/qdma_mock.h @@ -36,7 +36,8 @@ int 
slash_qdma_mock_qpair_stop(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_del(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id); + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint); int slash_qdma_mock_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); ssize_t slash_qdma_mock_transfer(struct slash_qdma *qdma, int qpair_fd, uint32_t buf_id, uint64_t buf_offset, diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index 98ae2c9e..a2bd03cf 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -105,13 +105,13 @@ TEST(QdmaNullTest, BufferRegister) { uint32_t buf_id = 0; uint8_t local = 0; errno = 0; - EXPECT_EQ(slash_qdma_buffer_register(nullptr, &local, 4096, &buf_id), -1); + EXPECT_EQ(slash_qdma_buffer_register(nullptr, &local, 4096, &buf_id, nullptr), -1); EXPECT_EQ(errno, EINVAL); struct slash_qdma fake{}; fake.fd = -1; errno = 0; - EXPECT_EQ(slash_qdma_buffer_register(&fake, nullptr, 4096, &buf_id), -1); + EXPECT_EQ(slash_qdma_buffer_register(&fake, nullptr, 4096, &buf_id, nullptr), -1); EXPECT_EQ(errno, EINVAL); } @@ -240,8 +240,12 @@ TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { uint32_t src_buf = 0; uint32_t dst_buf = 0; - ASSERT_EQ(slash_qdma_buffer_register(qdma_, src, XFER_SIZE, &src_buf), 0); - ASSERT_EQ(slash_qdma_buffer_register(qdma_, dst, XFER_SIZE, &dst_buf), 0); + enum slash_qdma_transfer_hint src_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; + enum slash_qdma_transfer_hint dst_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; + ASSERT_EQ(slash_qdma_buffer_register(qdma_, src, XFER_SIZE, &src_buf, &src_hint), 0); + ASSERT_EQ(slash_qdma_buffer_register(qdma_, dst, XFER_SIZE, &dst_buf, &dst_hint), 0); + EXPECT_EQ(src_hint, 
SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR); + EXPECT_EQ(dst_hint, SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR); // H2C: push the source buffer to the device. ssize_t written = slash_qdma_transfer(qdma_, queue_fd, src_buf, 0, diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index 9b120be7..d8a2b597 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -3121,6 +3121,7 @@ static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, /* Copy the assigned buf_id back to userspace. */ req.size = sizeof(req); req.buf_id = buf_id; + req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; copy_size = min_t(size_t, user_size, sizeof(req)); if (copy_to_user(uarg, &req, copy_size)) { xa_erase(&client->buffers, buf_id); diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index afe7bfe7..fd815143 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -827,7 +827,7 @@ TEST_F(qdma, hugepage_write_read_verify) /* Register a host buffer via the control fd; returns 0 or -errno. */ static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, - uint32_t *buf_id) + uint32_t *buf_id, uint32_t *transfer_hint) { struct slash_qdma_buf_register req; int ret; @@ -842,6 +842,8 @@ static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, return -errno; *buf_id = req.buf_id; + if (transfer_hint) + *transfer_hint = req.transfer_hint; return 0; } @@ -895,7 +897,7 @@ TEST_F(qdma, buf_register_zero_length_returns_einval) buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); - EXPECT_EQ(-EINVAL, qdma_buf_register(self->ctl_fd, buf, 0, &buf_id)); + EXPECT_EQ(-EINVAL, qdma_buf_register(self->ctl_fd, buf, 0, &buf_id, NULL)); free(buf); } @@ -910,7 +912,7 @@ TEST_F(qdma, buf_register_unaligned_returns_einval) /* Misaligned base address is rejected. 
*/ EXPECT_EQ(-EINVAL, - qdma_buf_register(self->ctl_fd, buf + 1, TRANSFER_SIZE, &buf_id)); + qdma_buf_register(self->ctl_fd, buf + 1, TRANSFER_SIZE, &buf_id, NULL)); free(buf); } @@ -944,6 +946,7 @@ TEST_F(qdma, transfer_wrong_direction_returns_enodev) { uint8_t *buf; uint32_t buf_id = 0; + uint32_t transfer_hint = 0; long ret; bring_up_qpair(_metadata, self, 0x1); /* H2C only */ @@ -952,7 +955,9 @@ TEST_F(qdma, transfer_wrong_direction_returns_enodev) MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ASSERT_NE(MAP_FAILED, buf); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, + &transfer_hint)); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, transfer_hint); /* C2H is not enabled on this qpair. */ ret = qdma_buf_transfer(self->io_fd, buf_id, 0, @@ -969,6 +974,7 @@ TEST_F(qdma, transfer_out_of_range_returns_einval) { uint8_t *buf; uint32_t buf_id = 0; + uint32_t transfer_hint = 0; long ret; bring_up_qpair(_metadata, self, 0x3); @@ -977,7 +983,9 @@ TEST_F(qdma, transfer_out_of_range_returns_einval) MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ASSERT_NE(MAP_FAILED, buf); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, + &transfer_hint)); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, transfer_hint); /* Slice extends past the registered length. 
*/ ret = qdma_buf_transfer(self->io_fd, buf_id, TRANSFER_SIZE, @@ -995,6 +1003,7 @@ TEST_F(qdma, registered_buffer_round_trip) const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages */ uint8_t *write_buf, *read_buf; uint32_t write_id = 0, read_id = 0; + uint32_t write_hint = 0, read_hint = 0; uint64_t dma_addr = get_dma_addr(); long ret; @@ -1011,9 +1020,11 @@ TEST_F(qdma, registered_buffer_round_trip) memset(read_buf, 0, xfer_size); ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, xfer_size, - &write_id)); + &write_id, &write_hint)); ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, - &read_id)); + &read_id, &read_hint)); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, write_hint); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, read_hint); ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, xfer_size, SLASH_QDMA_XFER_H2C); diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 188f505e..7cdcae06 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -713,7 +713,7 @@ class RawTransferBuffer { } void registerBuffer() { - if (slash_qdma_buffer_register(qdma_, data_, size_, &bufId_) != 0) { + if (slash_qdma_buffer_register(qdma_, data_, size_, &bufId_, nullptr) != 0) { throwSystemError("Failed to register raw transfer DMA buffer"); } bufRegistered_ = true; From 3707e38de32cccf366f67b4b1483a0dfdf069da2 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Fri, 12 Jun 2026 16:52:24 +0100 Subject: [PATCH 19/23] vrt/vrtd: use new performance buffer ioctl api Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 84 +++--- driver/libslash/README.md | 30 +-- driver/libslash/include/slash/qdma.h | 47 +++- .../include/slash/uapi/slash_interface.h | 8 +- driver/libslash/src/qdma.c | 186 +++++++++++++ driver/libslash/tests/qdma_test.cpp | 58 +++- driver/slash_qdma.c | 251 +++--------------- driver/tests/test_slash_qdma.c | 205 ++++++++------ vrt/src/qdma/qdma_intf.cpp | 120 +++------ 
vrt/vrtd/include/vrtd/wire.h | 13 +- vrt/vrtd/libvrtd/include/vrtd/vrtd.h | 13 +- vrt/vrtd/libvrtd/src/buffer.c | 154 ++++++++--- vrt/vrtd/libvrtd/src/requests.c | 171 +++++++++--- vrt/vrtd/libvrtdpp/src/buffer.cpp | 22 +- vrt/vrtd/src/buffer.c | 205 +++++++------- vrt/vrtd/src/buffer.h | 12 +- vrt/vrtd/src/serve.c | 61 +++-- vrt/vrtd/src/serve.h | 8 +- vrt/vrtd/tests/buffer_test.cpp | 52 ++-- 19 files changed, 1014 insertions(+), 686 deletions(-) diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index 11fb9a0c..8db53a2c 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -336,8 +336,8 @@ Memory transfers via QDMA: ``/dev/slash_qdma_ctl`` The QDMA device manages DMA queue pairs for bulk data movement between host memory and the card's on-board memory (HBM or DDR). Each queue pair is allocated with a mode (currently only MM) and a direction mask, then started before use. An anon-inode fd obtained from the queue pair serves as -the I/O channel: ``write()`` performs H2C transfers, ``read()`` performs C2H transfers, and the -file position encodes the device-side physical address. +the transfer channel: host buffers are registered once, and transfer ioctls name the registered +buffer, buffer offset, device-side physical address, length, and direction. - **Device file name:** ``/dev/slash_qdma_ctl`` (e.g. ``/dev/slash_qdma_ctl0``) - **Sysfs name:** ``slash_qdma_ctl_`` (e.g. ``/sys/class/misc/slash_qdma_ctl_0000:61:00.1``) @@ -353,9 +353,9 @@ Usage ----- In order to transfer data via QDMA, a queue pair must be added, started, and an I/O fd needs -to be created. The I/O fd treats the file position as the device-side physical address: -``write()`` performs an H2C (host-to-card) transfer, and ``read()`` performs a C2H (card-to-host) -transfer. Full lifecycle: +to be created. 
The I/O fd is ioctl-only for data movement: userspace registers a host buffer, +then issues transfer ioctls that name the registered buffer, buffer offset, device-side address, +length, and direction. Full lifecycle: .. code-block:: c @@ -381,27 +381,38 @@ transfer. Full lifecycle: }; int io_fd = ioctl(qdma_fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &fd_req); - /* Step 4: H2C transfer to device address 0x4000000000 */ - pwrite(io_fd, host_buf, nbytes, 0x4000000000LL); + /* Step 4: Register a page-aligned host buffer. */ + struct slash_qdma_buf_register reg = { + .size = sizeof(reg), .user_addr = (uintptr_t)host_buf, .length = nbytes + }; + ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &reg); + + /* Step 5: H2C transfer to device address 0x4000000000 */ + struct slash_qdma_transfer xfer = { + .size = sizeof(xfer), + .buf_id = reg.buf_id, + .buf_offset = 0, + .dev_addr = 0x4000000000LL, + .length = nbytes, + .direction = SLASH_QDMA_XFER_H2C, + }; + ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); - /* Step 5: C2H transfer from device address 0x4000000000 */ - pread(io_fd, host_buf, nbytes, 0x4000000000LL); + /* Step 6: C2H transfer from device address 0x4000000000 */ + xfer.direction = SLASH_QDMA_XFER_C2H; + ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); - /* Step 6: Teardown */ + /* Step 7: Teardown */ + struct slash_qdma_buf_unregister unreg = { + .size = sizeof(unreg), .buf_id = reg.buf_id + }; + ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &unreg); close(io_fd); op.op = 1; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* STOP */ op.op = 2; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* DEL */ -The file position can also be set explicitly with ``lseek`` before a plain ``read()``/``write()``: - -.. code-block:: c - - lseek(io_fd, 0x1000, SEEK_SET); - write(io_fd, src_buf, nbytes); - -``lseek`` supports all flags ``SEEK_SET``, ``SEEK_CUR``, and ``SEEK_END``, and both ``pread`` and -``pwrite`` are supported.
However, the fd does **not** support ``mmap``, ``poll``/``select``, or -``splice``. +The qpair fd does **not** support ``read``, ``write``, ``pread``, ``pwrite``, ``mmap``, +``poll``/``select``, or ``splice`` for data movement. All transfers are synchronous and block until the transfer completes or times out. The timeout is **10 seconds**; after expiry the call returns ``-ETIME``. Partial transfers are possible; the @@ -675,10 +686,11 @@ removed. ``SLASH_QDMA_IOCTL_QPAIR_GET_FD`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Creates a new file descriptor for data transfer on an existing queue pair. The returned fd supports -``read``, ``write``, ``pread``, ``pwrite``, and ``lseek``; it does **not** support ``mmap``, -``poll``/``select``, or ``splice``. Multiple fds can be obtained for the same qpair via multiple -calls. The fd is returned as the ``ioctl()`` return value. +Creates a new file descriptor for data transfer on an existing queue pair. The returned fd is +ioctl-only for data movement: it supports buffer register/unregister and transfer ioctls, but not +``read``, ``write``, ``pread``, ``pwrite``, ``mmap``, ``poll``/``select``, or ``splice``. Multiple +fds can be obtained for the same qpair via multiple calls. The fd is returned as the ``ioctl()`` +return value. **Interface:** @@ -705,8 +717,8 @@ as the ``ioctl()`` return value (not as a struct field). **Postconditions:** - The return value is a non-negative fd number on success. -- The fd holds a reference on both the qpair entry and the device; neither can be freed while - this fd is open. +- The fd holds a reference on the qpair entry, device, and the client context that owns registered + buffers; none of them can be freed while this fd is open. **Return values:** @@ -721,11 +733,12 @@ as the ``ioctl()`` return value (not as a struct field). ``SLASH_QDMA_IOCTL_BUF_REGISTER`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Registers a host buffer for DMA.
The kernel pins the backing pages, builds a scatter-gather list, -and DMA-maps it **once**. Subsequent transfers reference the buffer by ``buf_id`` and reuse the -cached, pre-DMA-mapped SGL instead of pinning and mapping per transfer. Registered buffers are owned -by the control-fd open instance they are registered through and are auto-released when that fd is -closed (including on process exit). +Registers a host buffer for DMA. The ioctl may be issued on either the QDMA control fd or a qpair +fd derived from that control fd; both resolve to the same client-scoped buffer table. The kernel +pins the backing pages, builds a scatter-gather list, and DMA-maps it **once**. Subsequent transfers +reference the buffer by ``buf_id`` and reuse the cached, pre-DMA-mapped SGL instead of pinning and +mapping per transfer. Registered buffers are owned by the shared client context and are +auto-released when the final fd referencing that context is closed (including on process exit). **Interface:** @@ -742,8 +755,8 @@ closed (including on process exit). __u32 transfer_hint; /* [out] enum slash_qdma_transfer_hint */ }; -**Direction:** ``_IOWR`` — userspace writes ``flags``, ``user_addr``, ``length``; the kernel writes -back ``buf_id`` and ``transfer_hint``. +**Direction:** ``_IOWR`` — issued on the control fd or a qpair fd. Userspace writes ``flags``, +``user_addr``, ``length``; the kernel writes back ``buf_id`` and ``transfer_hint``. ``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be best for this registered buffer on the current hardware. Current SLASH hardware returns @@ -784,8 +797,9 @@ best for this registered buffer on the current hardware. Current SLASH hardware ``SLASH_QDMA_IOCTL_BUF_UNREGISTER`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Removes a registered buffer from the owning client's table. The pages are unpinned and the DMA -mapping torn down once no in-flight transfer still references the buffer. 
+Removes a registered buffer from the owning client's table. This ioctl may be issued on the same +control fd used for registration or on any qpair fd derived from that client context. The pages are +unpinned and the DMA mapping torn down once no in-flight transfer still references the buffer. **Interface:** diff --git a/driver/libslash/README.md b/driver/libslash/README.md index af2edec5..874931c0 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -108,38 +108,28 @@ uint32_t qid = req.qid; slash_qdma_qpair_start(qdma, qid); -/* Get an fd for data transfer — read() = C2H, write() = H2C */ +/* Get an ioctl-only qpair fd for registered-buffer transfers. */ int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); -write(fd, buf, len); /* H2C */ -read(fd, buf, len); /* C2H */ -close(fd); - -slash_qdma_qpair_stop(qdma, qid); -slash_qdma_qpair_del(qdma, qid); -slash_qdma_close(qdma); -``` - -For high-throughput transfers, register a host buffer once (pinning its pages -and DMA-mapping it) and then move data by handle, avoiding per-transfer pinning: -```c /* buf must be page-aligned and a whole number of pages */ uint32_t buf_id; enum slash_qdma_transfer_hint hint; -slash_qdma_buffer_register(qdma, buf, len, &buf_id, &hint); +slash_qdma_qpair_buffer_register(fd, buf, len, &buf_id, &hint); /* Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR. * Pass NULL instead of &hint if the application does not care. 
*/ -int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); - /* H2C: host -> device at dev_addr */ -slash_qdma_transfer(qdma, fd, buf_id, /*buf_offset=*/0, dev_addr, len, - SLASH_QDMA_XFER_H2C); +slash_qdma_qpair_transfer(fd, buf_id, /*buf_offset=*/0, dev_addr, len, + SLASH_QDMA_XFER_H2C); /* C2H: device -> host */ -slash_qdma_transfer(qdma, fd, buf_id, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); +slash_qdma_qpair_transfer(fd, buf_id, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); +slash_qdma_qpair_buffer_unregister(fd, buf_id); close(fd); -slash_qdma_buffer_unregister(qdma, buf_id); + +slash_qdma_qpair_stop(qdma, qid); +slash_qdma_qpair_del(qdma, qid); +slash_qdma_close(qdma); ``` ### Hotplug — PCIe device lifecycle diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index 24f463b8..72398b88 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -31,10 +31,11 @@ * 6. slash_qdma_qpair_del() — destroy * 7. slash_qdma_close() — close the device * - * The fd from qpair_get_fd() supports read() for C2H (card-to-host) - * and write() for H2C (host-to-card) DMA transfers. Positional I/O - * via lseek()/pread()/pwrite() is also supported. splice(), mmap(), - * and poll() are not available. + * The fd from qpair_get_fd() is ioctl-only for data movement: register host + * buffers with slash_qdma_qpair_buffer_register() (or through the owning + * control fd), then transfer with slash_qdma_qpair_transfer() / + * slash_qdma_transfer(). read(), write(), mmap(), and poll() are not + * available for SLASH transfers. * * Registered buffers: * For high-throughput transfers, a host buffer can be registered once @@ -152,8 +153,8 @@ int slash_qdma_qpair_del(struct slash_qdma *qdma, uint32_t qid); * @param flags Only O_CLOEXEC is accepted; the kernel returns -EINVAL for * any other bits. * - * The returned fd supports read() (C2H) and write() (H2C). Positional - * I/O via lseek()/pread()/pwrite() is also available. 
+ * The returned fd supports transfer and buffer-registration ioctls. It does + * not support read/write data movement; use slash_qdma_qpair_transfer(). * * @return Non-negative fd on success, -1 on failure. */ @@ -189,6 +190,26 @@ int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, */ int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); +/** + * @brief Register a host buffer through a queue-pair fd. + * + * Same semantics as slash_qdma_buffer_register(), but issues the registration + * ioctl on @p qpair_fd. This is useful for clients that received only qpair + * fds via SCM_RIGHTS (for example libvrtd clients). + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_qpair_buffer_register(int qpair_fd, void *addr, + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint); + +/** + * @brief Unregister a buffer through a queue-pair fd. + * + * @return 0 on success, -1 on failure (errno set). + */ +int slash_qdma_qpair_buffer_unregister(int qpair_fd, uint32_t buf_id); + /** * @brief Perform a DMA transfer using a registered buffer. * @@ -208,6 +229,20 @@ ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, uint64_t dev_addr, uint64_t length, uint32_t direction); +/** + * @brief Perform a DMA transfer using only a queue-pair fd. + * + * Same transfer ioctl as slash_qdma_transfer(), but without a device handle. + * This is the preferred form for code that received a qpair fd over + * SCM_RIGHTS. + * + * @return Number of bytes transferred (>= 0) on success, -1 on failure + * (errno set). 
+ */ +ssize_t slash_qdma_qpair_transfer(int qpair_fd, uint32_t buf_id, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction); + #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index cc221bb0..977b8506 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -225,14 +225,14 @@ struct slash_qdma_qpair_op { /** * @brief Obtain a file descriptor for queue I/O. * - * The returned fd can be used for read/write (or mmap) to transfer data + * The returned fd can be used for registered-buffer ioctls to transfer data * through the queue pair. * * The fd is returned as the ioctl return value (same convention as * the BAR fd ioctl). A single fd is returned per queue pair; - * read() on the fd performs C2H transfers and write() performs H2C - * transfers, using whichever directions were enabled in \@dir_mask - * when the queue pair was added. + * data movement is issued via SLASH_QDMA_QPAIR_IOCTL_TRANSFER, using + * whichever directions were enabled in \@dir_mask when the queue pair was + * added. */ struct slash_qdma_qpair_fd_request { __u32 size; /**< Struct size for ABI versioning. */ diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index 34791a63..135b34af 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -41,6 +41,109 @@ #include +#define QPAIR_FALLBACK_MAX_BUFS 128 + +struct qpair_fallback_buf { + int in_use; + void *addr; + uint64_t length; +}; + +/* + * Small process-local fallback table used only when qpair-fd registration + * ioctls return ENOTTY (the memfd-backed @mock path). Real hardware qpair fds + * implement the ioctl in the kernel and never use this table.
+ */ +static struct qpair_fallback_buf qpair_fallback_bufs[QPAIR_FALLBACK_MAX_BUFS]; + +static int qpair_fallback_register(void *addr, uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint) +{ + uint32_t i; + + if (addr == NULL || length == 0 || buf_id == NULL) { + errno = EINVAL; + return -1; + } + + for (i = 0; i < QPAIR_FALLBACK_MAX_BUFS; ++i) { + if (!qpair_fallback_bufs[i].in_use) { + qpair_fallback_bufs[i].in_use = 1; + qpair_fallback_bufs[i].addr = addr; + qpair_fallback_bufs[i].length = length; + *buf_id = i; + if (transfer_hint != NULL) { + *transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; + } + return 0; + } + } + + errno = ENOSPC; + return -1; +} + +static int qpair_fallback_unregister(uint32_t buf_id) +{ + if (buf_id >= QPAIR_FALLBACK_MAX_BUFS || !qpair_fallback_bufs[buf_id].in_use) { + errno = ENOENT; + return -1; + } + + memset(&qpair_fallback_bufs[buf_id], 0, sizeof(qpair_fallback_bufs[buf_id])); + return 0; +} + +static ssize_t qpair_fallback_transfer(int qpair_fd, uint32_t buf_id, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction) +{ + struct qpair_fallback_buf *buf; + char *host; + uint64_t done = 0; + + if (qpair_fd < 0 || buf_id >= QPAIR_FALLBACK_MAX_BUFS || + !qpair_fallback_bufs[buf_id].in_use) { + errno = EINVAL; + return -1; + } + + buf = &qpair_fallback_bufs[buf_id]; + if (length == 0 || buf_offset > buf->length || length > buf->length - buf_offset) { + errno = EINVAL; + return -1; + } + + host = (char *)buf->addr + buf_offset; + while (done < length) { + ssize_t n; + + if (direction == SLASH_QDMA_XFER_H2C) { + n = pwrite(qpair_fd, host + done, (size_t)(length - done), + (off_t)(dev_addr + done)); + } else if (direction == SLASH_QDMA_XFER_C2H) { + n = pread(qpair_fd, host + done, (size_t)(length - done), + (off_t)(dev_addr + done)); + } else { + errno = EINVAL; + return -1; + } + + if (n < 0) { + if (errno == EINTR) { + continue; + } + return -1; + } + if (n == 0) { + break; + } 
+ done += (uint64_t)n; + } + + return (ssize_t)done; +} + struct slash_qdma *slash_qdma_open(const char *path) { struct slash_qdma *qdma; @@ -310,6 +413,64 @@ int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id) return 0; } +int slash_qdma_qpair_buffer_register(int qpair_fd, void *addr, + uint64_t length, uint32_t *buf_id, + enum slash_qdma_transfer_hint *transfer_hint) +{ + struct slash_qdma_buf_register req; + int ret; + + if (qpair_fd < 0 || addr == NULL || buf_id == NULL) { + errno = EINVAL; + return -1; + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.user_addr = (uint64_t)(uintptr_t)addr; + req.length = length; + + ret = ioctl(qpair_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); + if (ret < 0) { + if (errno == ENOTTY) { + return qpair_fallback_register(addr, length, buf_id, transfer_hint); + } + return -1; + } + + *buf_id = req.buf_id; + if (transfer_hint != NULL) { + *transfer_hint = req.transfer_hint; + } + + return 0; +} + +int slash_qdma_qpair_buffer_unregister(int qpair_fd, uint32_t buf_id) +{ + struct slash_qdma_buf_unregister req; + int ret; + + if (qpair_fd < 0) { + errno = EINVAL; + return -1; + } + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.buf_id = buf_id; + + ret = ioctl(qpair_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); + if (ret < 0) { + if (errno == ENOTTY) { + return qpair_fallback_unregister(buf_id); + } + return -1; + } + + return 0; +} + ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, uint32_t buf_id, uint64_t buf_offset, uint64_t dev_addr, uint64_t length, @@ -333,6 +494,27 @@ ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, dev_addr, length, direction); } + return slash_qdma_qpair_transfer(qpair_fd, buf_id, buf_offset, dev_addr, + length, direction); +} + +ssize_t slash_qdma_qpair_transfer(int qpair_fd, uint32_t buf_id, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction) +{ + struct slash_qdma_transfer req; + int 
ret; + + if (qpair_fd < 0) { + errno = EINVAL; + return -1; + } + + if (direction != SLASH_QDMA_XFER_H2C && direction != SLASH_QDMA_XFER_C2H) { + errno = EINVAL; + return -1; + } + memset(&req, 0, sizeof(req)); req.size = sizeof(req); req.buf_id = buf_id; @@ -343,6 +525,10 @@ ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); if (ret < 0) { + if (errno == ENOTTY) { + return qpair_fallback_transfer(qpair_fd, buf_id, buf_offset, + dev_addr, length, direction); + } return -1; } diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index a2bd03cf..b2ad3a70 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -192,20 +192,38 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) { int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); ASSERT_GE(queue_fd, 0); - // Write a known pattern to DDR (H2C). - uint8_t src[XFER_SIZE]; + // Write a known pattern to DDR (H2C) through the transfer-only ioctl path. + void *src_mem = nullptr; + void *dst_mem = nullptr; + ASSERT_EQ(posix_memalign(&src_mem, 4096, XFER_SIZE), 0); + ASSERT_EQ(posix_memalign(&dst_mem, 4096, XFER_SIZE), 0); + auto *src = static_cast(src_mem); + auto *dst = static_cast(dst_mem); for (size_t i = 0; i < XFER_SIZE; ++i) { src[i] = static_cast(i & 0xFF); } - ssize_t written = pwrite(queue_fd, src, XFER_SIZE, static_cast(DDR_BASE_ADDRESS)); + std::memset(dst, 0, XFER_SIZE); + + uint32_t src_buf = 0; + uint32_t dst_buf = 0; + ASSERT_EQ(slash_qdma_qpair_buffer_register(queue_fd, src, XFER_SIZE, &src_buf, nullptr), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_register(queue_fd, dst, XFER_SIZE, &dst_buf, nullptr), 0); + + ssize_t written = slash_qdma_qpair_transfer( + queue_fd, src_buf, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(written, static_cast(XFER_SIZE)); // Read back from DDR (C2H) and verify. 
- uint8_t dst[XFER_SIZE]{}; - ssize_t read_bytes = pread(queue_fd, dst, XFER_SIZE, static_cast(DDR_BASE_ADDRESS)); + ssize_t read_bytes = slash_qdma_qpair_transfer( + queue_fd, dst_buf, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + EXPECT_EQ(slash_qdma_qpair_buffer_unregister(queue_fd, src_buf), 0); + EXPECT_EQ(slash_qdma_qpair_buffer_unregister(queue_fd, dst_buf), 0); + free(src_mem); + free(dst_mem); + EXPECT_EQ(close(queue_fd), 0); EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); @@ -271,6 +289,36 @@ TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); } +TEST_P(ParametrizedQdmaTest, QueueFdReadWriteRejectedOnHardware) { + if (mock) { + GTEST_SKIP() << "mock qpair fds are memfds and still support read/write"; + } + + struct slash_qdma_qpair_add req{}; + req.mode = 0; + req.dir_mask = 0x3; + + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + uint32_t qid = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qid), 0); + + int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); + ASSERT_GE(queue_fd, 0); + + uint8_t byte = 0; + errno = 0; + EXPECT_EQ(write(queue_fd, &byte, sizeof(byte)), -1); + EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF); + + errno = 0; + EXPECT_EQ(read(queue_fd, &byte, sizeof(byte)), -1); + EXPECT_TRUE(errno == EINVAL || errno == EOPNOTSUPP || errno == EBADF); + + EXPECT_EQ(close(queue_fd), 0); + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); +} + TEST_P(ParametrizedQdmaTest, CloseSucceeds) { EXPECT_EQ(slash_qdma_close(qdma_), 0); qdma_ = nullptr; diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index d8a2b597..d199344f 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -789,10 +789,6 @@ static void slash_qdma_buf_put(struct slash_qdma_buf *buf); static void slash_qdma_client_release(struct kref 
*ref); static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg); -static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos); -static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos); static int slash_qdma_qpair_release(struct inode *inode, struct file *file); static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -800,20 +796,14 @@ static long slash_qdma_qpair_ioctl(struct file *file, /** * slash_qdma_qpair_fops - File operations for per-qpair anon_inode fds. * - * read() performs a C2H (card-to-host) DMA transfer. - * write() performs an H2C (host-to-card) DMA transfer. - * llseek uses default_llseek so that pread/pwrite can set the - * device-side address via the file position. - * ioctl is a stub that returns -ENOTTY (no per-fd ioctls defined yet). + * ioctl performs registered-buffer transfers and buffer registration + * operations that share the owning control fd's client context. * release drops the refs on the qpair entry and device. */ static const struct file_operations slash_qdma_qpair_fops = { .owner = THIS_MODULE, - .read = slash_qdma_qpair_read, - .write = slash_qdma_qpair_write, .unlocked_ioctl = slash_qdma_qpair_ioctl, .release = slash_qdma_qpair_release, - .llseek = default_llseek, }; @@ -1849,24 +1839,10 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) static int slash_qdma_fop_release(struct inode *inode, struct file *file) { struct slash_qdma_client *client = file->private_data; - struct slash_qdma_buf *buf; - unsigned long index; if (!client) return 0; - /* - * Auto-unregister any buffers the client forgot (or had no chance) to - * release. Remove each from the lookup table first so no new transfer - * can find it, then drop the table's reference. Buffers with an - * in-flight transfer stay alive until that transfer releases its ref. 
- */ - xa_for_each(&client->buffers, index, buf) { - xa_erase(&client->buffers, index); - slash_qdma_buf_put(buf); - } - xa_destroy(&client->buffers); - kref_put(&client->ref, slash_qdma_client_release); file->private_data = NULL; @@ -2980,7 +2956,20 @@ static void slash_qdma_client_release(struct kref *ref) { struct slash_qdma_client *client = container_of(ref, struct slash_qdma_client, ref); + struct slash_qdma_buf *buf; + unsigned long index; + /* + * Auto-unregister any buffers the client forgot (or had no chance) to + * release. This runs when the final fd referencing the shared client + * context closes (control fd or any derived qpair fd), so registrations + * remain usable after the control fd closes as long as a qpair fd is + * still alive. + */ + xa_for_each(&client->buffers, index, buf) { + xa_erase(&client->buffers, index); + slash_qdma_buf_put(buf); + } xa_destroy(&client->buffers); if (client->qdma_dev) kref_put(&client->qdma_dev->ref, slash_qdma_dev_release); @@ -3181,186 +3170,6 @@ static int slash_qdma_ioctl_buf_unregister_w(struct miscdevice *misc, return 0; } -/** - * slash_qdma_qpair_read_write() - Perform a DMA transfer via a qpair fd. - * @file: The anon_inode file for this queue pair. - * @buf: User-space buffer (source for write/H2C, destination for read/C2H). - * @count: Number of bytes to transfer. - * @ppos: File position — used as the device-side (endpoint) address. - * Updated on success to reflect the bytes transferred, enabling - * sequential positional I/O. - * @write: true for H2C (host-to-card write), false for C2H (card-to-host read). - * - * Transfer flow: - * 1. Validate context and check that the required direction (H2C or C2H) - * is enabled on this queue pair. - * 2. Pin user pages and build a scatter-gather list. - * 3. Populate a qdma_request: - * - ep_addr = *ppos: the device-side address (FPGA memory offset). - * - h2c_eot = 1: signals end-of-transfer to the FPGA, allowing it to - * process the complete data packet. 
- * - timeout_ms = 10000 (10 seconds): if the transfer doesn't complete - * in this time, qdma_request_submit returns an error. - * - fp_done = NULL: synchronous mode — the call blocks until completion. - * If fp_done were set, libqdma would call it asynchronously. - * - dma_mapped = 0: libqdma handles the DMA mapping internally. - * 4. Submit to libqdma via qdma_request_submit(). - * 5. On success, advance *ppos by the number of bytes transferred. - * 6. Unpin pages and free the SGL. - * - * Return: Number of bytes transferred (>= 0) on success, negative errno - * on failure. - */ -static ssize_t slash_qdma_qpair_read_write(struct file *file, char __user *buf, - size_t count, loff_t *ppos, - bool write) -{ - struct slash_qdma_qpair_file_ctx *ctx = file->private_data; - struct slash_qdma_dev *qdma_dev; - struct slash_qdma_qpair_entry *entry; - struct slash_qdma_io_cb iocb; - struct qdma_request *req; - unsigned long qhndl; - ssize_t res; - int rv; -#if SLASH_QDMA_TIMING - ktime_t t_start, t_mapped, t_submitted, t_done; -#endif - - if (!ctx) - return -EINVAL; - - qdma_dev = ctx->qdma_dev; - entry = ctx->entry; - - if (!qdma_dev || !entry) - return -ENODEV; - - /* Check device liveness and resolve the queue handle for the direction. */ - mutex_lock(&qdma_dev->lock); - if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } - - if (write) { - /* H2C: writing data from host to card */ - if (!(entry->dir_mask & SLASH_QDMA_DIR_H2C) || - !slash_qdma_qhndl_is_valid(entry->qhndl[Q_H2C])) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } - qhndl = entry->qhndl[Q_H2C]; - } else { - /* C2H: reading data from card to host */ - if (!(entry->dir_mask & SLASH_QDMA_DIR_C2H) || - !slash_qdma_qhndl_is_valid(entry->qhndl[Q_C2H])) { - mutex_unlock(&qdma_dev->lock); - return -ENODEV; - } - qhndl = entry->qhndl[Q_C2H]; - } - mutex_unlock(&qdma_dev->lock); - - /* Pin user pages and build the scatter-gather list. 
*/ -#if SLASH_QDMA_TIMING - t_start = ktime_get(); -#endif - memset(&iocb, 0, sizeof(iocb)); - iocb.buf = buf; - iocb.len = count; - rv = slash_qdma_map_user_buf_to_sgl(&iocb, write); - if (rv < 0) - return rv; -#if SLASH_QDMA_TIMING - t_mapped = ktime_get(); -#endif - - /* Populate the libqdma request structure. */ - req = &iocb.req; - req->sgcnt = iocb.pages_nr; /* Number of SGL entries */ - req->sgl = iocb.sgl; /* Scatter-gather list */ - req->write = write ? 1 : 0; /* Direction flag for libqdma */ - req->dma_mapped = 0; /* Let libqdma handle DMA mapping */ - req->udd_len = 0; /* No user-defined data */ - req->ep_addr = (u64)*ppos; /* Device-side (endpoint) address */ - req->count = count; /* Total byte count */ - req->timeout_ms = 10 * 1000; /* 10-second timeout */ - req->fp_done = NULL; /* Synchronous: block until complete */ - req->h2c_eot = 1; /* End-of-transfer marker for FPGA */ - - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_request_submit start: qid=%u qhndl=%lu write=%d count=%zu ep_addr=0x%llx\n", - ctx->qid, qhndl, req->write, req->count, - (unsigned long long)req->ep_addr); - res = qdma_request_submit(qdma_dev->qdma_handle, qhndl, req); - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "qdma_request_submit done: qid=%u qhndl=%lu res=%zd\n", - ctx->qid, qhndl, res); -#if SLASH_QDMA_TIMING - t_submitted = ktime_get(); -#endif - - /* Advance the file position by the number of bytes transferred. */ - if (res > 0) - *ppos += res; - - /* Unpin pages (marking dirty for C2H reads) and free the SGL. */ - slash_qdma_unmap_user_buf(&iocb, write); - slash_qdma_iocb_release(&iocb); - -#if SLASH_QDMA_TIMING - t_done = ktime_get(); - dev_info(&qdma_dev->pdev->dev, - "slash: qdma: timing qid=%u %s count=%zu sgcnt=%u ep=0x%llx res=%zd | map=%lld submit=%lld unmap=%lld total=%lld ns\n", - ctx->qid, write ? 
"H2C" : "C2H", count, req->sgcnt, - (unsigned long long)req->ep_addr, res, - ktime_to_ns(ktime_sub(t_mapped, t_start)), - ktime_to_ns(ktime_sub(t_submitted, t_mapped)), - ktime_to_ns(ktime_sub(t_done, t_submitted)), - ktime_to_ns(ktime_sub(t_done, t_start))); -#endif - - return res; -} - -/** - * slash_qdma_qpair_read() - Read (C2H) file operation for a qpair fd. - * @file: Anon_inode file for the queue pair. - * @buf: User-space destination buffer. - * @count: Number of bytes to read. - * @ppos: Device-side address to read from. - * - * Thin wrapper that delegates to slash_qdma_qpair_read_write() with - * write=false (C2H direction). - * - * Return: Bytes transferred or negative errno. - */ -static ssize_t slash_qdma_qpair_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - return slash_qdma_qpair_read_write(file, buf, count, ppos, false); -} - -/** - * slash_qdma_qpair_write() - Write (H2C) file operation for a qpair fd. - * @file: Anon_inode file for the queue pair. - * @buf: User-space source buffer. - * @count: Number of bytes to write. - * @ppos: Device-side address to write to. - * - * Thin wrapper that delegates to slash_qdma_qpair_read_write() with - * write=true (H2C direction). - * - * Return: Bytes transferred or negative errno. - */ -static ssize_t slash_qdma_qpair_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - return slash_qdma_qpair_read_write(file, (char __user *)buf, - count, ppos, true); -} - /** * slash_qdma_qpair_transfer() - Registered-buffer DMA transfer on a qpair fd. * @file: Anon_inode file for the queue pair. 
@@ -3519,7 +3328,20 @@ static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + + if (!ctx || !ctx->client || !ctx->qdma_dev) + return -ENODEV; + switch (cmd) { + case SLASH_QDMA_IOCTL_BUF_REGISTER: + return slash_qdma_ioctl_buf_register_w(&ctx->qdma_dev->misc, + ctx->client, + (void __user *)arg); + case SLASH_QDMA_IOCTL_BUF_UNREGISTER: + return slash_qdma_ioctl_buf_unregister_w(&ctx->qdma_dev->misc, + ctx->client, + (void __user *)arg); case SLASH_QDMA_QPAIR_IOCTL_TRANSFER: return slash_qdma_qpair_transfer(file, (void __user *)arg); default: @@ -3572,16 +3394,15 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file) * @qdma_dev: QDMA device. * @uarg: User-space pointer to a slash_qdma_qpair_fd_request struct. * - * Creates an anonymous inode file descriptor that userspace can use - * for read() (C2H) and write() (H2C) DMA transfers on the specified - * queue pair. The fd holds references to both the qpair entry and the - * device, preventing either from being freed while the fd is open. + * Creates an anonymous inode file descriptor that userspace can use for + * registered-buffer transfer ioctls on the specified queue pair. The fd + * holds references to the qpair entry, client context, and device, + * preventing any of them from being freed while the fd is open. * * The only supported flag is O_CLOEXEC (close-on-exec). * - * The file is created with FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE - * enabled, allowing pread/pwrite and lseek to set the device-side - * address for DMA transfers. + * The fd is ioctl-only for data movement; transfers pass the device-side + * address in struct slash_qdma_transfer. * * Error handling: on any failure after resources are acquired, all * refs and allocations are cleaned up before returning. 
@@ -3674,10 +3495,6 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, return err; } - /* Enable seek and positional read/write for device-address control. */ - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - - /* Allocate a file descriptor number. */ fd = get_unused_fd_flags(req.flags & O_CLOEXEC); if (fd < 0) { diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index fd815143..cd65ee40 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -45,6 +45,13 @@ static void fill_pattern(uint8_t *buf, size_t len) buf[i] = (uint8_t)(i & 0xff); } +static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, + uint32_t *buf_id, uint32_t *transfer_hint); +static int qdma_buf_unregister(int ctl_fd, uint32_t buf_id); +static long qdma_buf_transfer(int io_fd, uint32_t buf_id, uint64_t buf_offset, + uint64_t dev_addr, uint64_t length, + uint32_t direction); + /* ---------- fixture ---------- */ FIXTURE(qdma) @@ -138,7 +145,8 @@ TEST_F(qdma, write_read_verify) { uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - ssize_t ret; + uint32_t write_id = 0, read_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -150,14 +158,23 @@ TEST_F(qdma, write_read_verify) fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)dma_addr); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, + &write_id, NULL)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, + &read_id, NULL)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)dma_addr); + ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, 
TRANSFER_SIZE)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); free(write_buf); free(read_buf); } @@ -282,51 +299,63 @@ TEST_F(qdma, qpair_get_fd_unknown_qid) TEST_F(qdma, io_read_on_h2c_only_returns_enodev) { uint8_t *buf; - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x1); /* H2C only */ buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); - ret = pread(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); free(buf); } TEST_F(qdma, io_write_on_c2h_only_returns_enodev) { uint8_t *buf; - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x2); /* C2H only */ buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, (off_t)SLASH_TEST_HBM_BASE); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); free(buf); } TEST_F(qdma, io_zero_length_returns_einval) { uint8_t *buf; - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); - ret = pwrite(self->io_fd, buf, 0, (off_t)SLASH_TEST_HBM_BASE); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + 0, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(0, 
qdma_buf_unregister(self->ctl_fd, buf_id)); free(buf); } @@ -355,74 +384,37 @@ TEST_F(qdma, io_ioctl_returns_enotty) EXPECT_EQ(ENOTTY, errno); } -TEST_F(qdma, io_lseek_set_cur_end) +TEST_F(qdma, io_lseek_unsupported) { off_t pos; bring_up_qpair(_metadata, self, 0x3); pos = lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET); - EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos); - - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)SLASH_TEST_HBM_BASE, pos); - - pos = lseek(self->io_fd, 4096, SEEK_CUR); - EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + 4096), pos); - - /* - * SEEK_END semantics are driver-defined for this anon-inode; the - * contract is "doesn't error", not any specific value. - */ - pos = lseek(self->io_fd, 0, SEEK_END); - EXPECT_NE((off_t)-1, pos); + EXPECT_EQ((off_t)-1, pos); + EXPECT_EQ(ESPIPE, errno); } -TEST_F(qdma, io_write_advances_file_position) +TEST_F(qdma, io_read_write_unsupported) { uint8_t *buf; - off_t pos; - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); fill_pattern(buf, TRANSFER_SIZE); - ASSERT_EQ((off_t)SLASH_TEST_HBM_BASE, - lseek(self->io_fd, (off_t)SLASH_TEST_HBM_BASE, SEEK_SET)); - ret = write(self->io_fd, buf, TRANSFER_SIZE); - ASSERT_EQ(TRANSFER_SIZE, ret); - - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)(SLASH_TEST_HBM_BASE + TRANSFER_SIZE), pos); - - free(buf); -} - -TEST_F(qdma, io_pwrite_does_not_advance_file_position) -{ - uint8_t *buf; - off_t pos; - ssize_t ret; - - bring_up_qpair(_metadata, self, 0x3); - - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - fill_pattern(buf, TRANSFER_SIZE); - - ASSERT_EQ((off_t)0, lseek(self->io_fd, 0, SEEK_SET)); - - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); - ASSERT_EQ(TRANSFER_SIZE, ret); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, 
errno); - /* p* variants must not advance the file position. */ - pos = lseek(self->io_fd, 0, SEEK_CUR); - EXPECT_EQ((off_t)0, pos); + ret = read(self->io_fd, buf, TRANSFER_SIZE); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); free(buf); } @@ -430,8 +422,9 @@ TEST_F(qdma, io_pwrite_does_not_advance_file_position) TEST_F(qdma, io_multiple_fds_same_qpair) { uint8_t *write_buf, *read_buf; + uint32_t write_id = 0, read_id = 0; int io_fd_b; - ssize_t ret; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -446,16 +439,23 @@ TEST_F(qdma, io_multiple_fds_same_qpair) fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, + &write_id, NULL)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, + &read_id, NULL)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = pread(io_fd_b, read_buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(io_fd_b, read_id, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); close(io_fd_b); free(write_buf); free(read_buf); @@ -464,7 +464,8 @@ TEST_F(qdma, io_multiple_fds_same_qpair) TEST_F(qdma, io_fd_outlives_qpair_del) { uint8_t *buf; - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -476,16 +477,18 @@ TEST_F(qdma, io_fd_outlives_qpair_del) buf = aligned_alloc(4096, TRANSFER_SIZE); ASSERT_NE(NULL, buf); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); /* * fd is still valid but the qpair's HW queues are gone. 
The spec * (index.rst:613-616) does not name a specific errno, so we only * assert the call fails — not which errno it returns. */ - ret = pwrite(self->io_fd, buf, TRANSFER_SIZE, - (off_t)SLASH_TEST_HBM_BASE); + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); free(buf); /* close(io_fd) happens in fixture teardown — must not crash. */ } @@ -496,7 +499,8 @@ static void region_round_trip(struct __test_metadata *_metadata, FIXTURE_DATA(qdma) * self, uint64_t base) { uint8_t *write_buf, *read_buf; - ssize_t ret; + uint32_t write_id = 0, read_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -508,16 +512,25 @@ static void region_round_trip(struct __test_metadata *_metadata, fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE, (off_t)base); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, + &write_id, NULL)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, + &read_id, NULL)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, base, + TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret) - TH_LOG("pwrite to 0x%llx failed: %s", + TH_LOG("H2C transfer to 0x%llx failed: %s", (unsigned long long)base, strerror(errno)); - ret = pread(self->io_fd, read_buf, TRANSFER_SIZE, (off_t)base); + ret = qdma_buf_transfer(self->io_fd, read_id, 0, base, + TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); free(write_buf); free(read_buf); } @@ -711,8 +724,7 @@ TEST_F(qdma, qpair_get_fd_oversized_struct_zeros_tail) TEST_F(qdma, reject_unaligned_4k_transfer) { uint8_t *write_buf; - uint64_t dma_addr = 
get_dma_addr(); - ssize_t ret; + uint32_t buf_id = 0; bring_up_qpair(_metadata, self, 0x3); @@ -720,10 +732,9 @@ TEST_F(qdma, reject_unaligned_4k_transfer) ASSERT_NE(NULL, write_buf); fill_pattern(write_buf, TRANSFER_SIZE * 2); - errno = 0; - ret = pwrite(self->io_fd, write_buf + 1, TRANSFER_SIZE, (off_t)dma_addr); - ASSERT_EQ(-1, ret); - ASSERT_EQ(EINVAL, errno); + EXPECT_EQ(-EINVAL, + qdma_buf_register(self->ctl_fd, write_buf + 1, TRANSFER_SIZE, + &buf_id, NULL)); free(write_buf); } @@ -732,7 +743,8 @@ TEST_F(qdma, reject_partial_4k_transfer) { uint8_t *write_buf; uint64_t dma_addr = get_dma_addr(); - ssize_t ret; + uint32_t buf_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -740,11 +752,14 @@ TEST_F(qdma, reject_partial_4k_transfer) ASSERT_NE(NULL, write_buf); fill_pattern(write_buf, TRANSFER_SIZE); - errno = 0; - ret = pwrite(self->io_fd, write_buf, TRANSFER_SIZE / 2, (off_t)dma_addr); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, + &buf_id, NULL)); + ret = qdma_buf_transfer(self->io_fd, buf_id, 0, dma_addr, + TRANSFER_SIZE / 2, SLASH_QDMA_XFER_H2C); ASSERT_EQ(-1, ret); ASSERT_EQ(EINVAL, errno); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); free(write_buf); } @@ -753,7 +768,8 @@ TEST_F(qdma, multipage_4k_write_read_verify) const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages, one request */ uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - ssize_t ret; + uint32_t write_id = 0, read_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -774,14 +790,23 @@ TEST_F(qdma, multipage_4k_write_read_verify) fill_pattern(write_buf, xfer_size); memset(read_buf, 0, xfer_size); - ret = pwrite(self->io_fd, write_buf, xfer_size, (off_t)dma_addr); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, xfer_size, + &write_id, NULL)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, + &read_id, NULL)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, 
xfer_size, + SLASH_QDMA_XFER_H2C); ASSERT_EQ((ssize_t)xfer_size, ret); - ret = pread(self->io_fd, read_buf, xfer_size, (off_t)dma_addr); + ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, xfer_size, + SLASH_QDMA_XFER_C2H); ASSERT_EQ((ssize_t)xfer_size, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); munmap(write_buf, xfer_size); munmap(read_buf, xfer_size); } @@ -790,7 +815,8 @@ TEST_F(qdma, hugepage_write_read_verify) { uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - ssize_t ret; + uint32_t write_id = 0, read_id = 0; + long ret; bring_up_qpair(_metadata, self, 0x3); @@ -811,14 +837,23 @@ TEST_F(qdma, hugepage_write_read_verify) fill_pattern(write_buf, HUGE_TRANSFER_SIZE); memset(read_buf, 0, HUGE_TRANSFER_SIZE); - ret = pwrite(self->io_fd, write_buf, HUGE_TRANSFER_SIZE, (off_t)dma_addr); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, HUGE_TRANSFER_SIZE, + &write_id, NULL)); + ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, HUGE_TRANSFER_SIZE, + &read_id, NULL)); + + ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, HUGE_TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); - ret = pread(self->io_fd, read_buf, HUGE_TRANSFER_SIZE, (off_t)dma_addr); + ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, HUGE_TRANSFER_SIZE, + SLASH_QDMA_XFER_C2H); ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, HUGE_TRANSFER_SIZE)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); + EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); munmap(write_buf, HUGE_TRANSFER_SIZE); munmap(read_buf, HUGE_TRANSFER_SIZE); } diff --git a/vrt/src/qdma/qdma_intf.cpp b/vrt/src/qdma/qdma_intf.cpp index 780fb766..e2e2e17c 100644 --- a/vrt/src/qdma/qdma_intf.cpp +++ b/vrt/src/qdma/qdma_intf.cpp @@ -62,55 +62,25 @@ ssize_t 
QdmaIntf::write_from_buffer(const char* fname, char* buffer, uint64_t si "QDMA streaming not initialized"); return -EIO; } - int fd = qpairFd; - ssize_t rc; - uint64_t count = 0; - char* buf = buffer; - off_t offset = base; - - do { /* Support zero byte transfer */ - uint64_t bytes = size - count; - - if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE; - - if (offset) { - rc = lseek(fd, offset, SEEK_SET); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not write to {}", fname); - return -EIO; - } - if (rc != offset) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not write to {}", fname); - return -EIO; - } - } - - /* write data to file from memory buffer */ - rc = write(fd, buf, bytes); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", - fname); - return -EIO; - } - if (rc != bytes) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", - fname); - return -EIO; - } - - count += bytes; - buf += bytes; - offset += bytes; - } while (count < size); - - if (count != size) { + if (size == 0) { + return 0; + } + + uint32_t bufId = 0; + if (slash_qdma_qpair_buffer_register(qpairFd, buffer, size, &bufId, nullptr) != 0) { + utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, + "Could not register QDMA write buffer for {}", fname); + return -EIO; + } + + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, bufId, 0, base, size, SLASH_QDMA_XFER_H2C); + (void)slash_qdma_qpair_buffer_unregister(qpairFd, bufId); + if (rc != (ssize_t)size) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", fname); return -EIO; } - return count; + return rc; } ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) { @@ -119,55 +89,25 @@ ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, "QDMA streaming not initialized"); 
return -EIO; } - int fd = qpairFd; - ssize_t rc; - uint64_t count = 0; - char* buf = buffer; - off_t offset = base; - - do { /* Support zero byte transfer */ - uint64_t bytes = size - count; - - if (bytes > RW_MAX_SIZE) bytes = RW_MAX_SIZE; - - if (offset) { - rc = lseek(fd, offset, SEEK_SET); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - if (rc != offset) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - } - - /* read data from file into memory buffer */ - rc = read(fd, buf, bytes); - if (rc < 0) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - if (rc != bytes) { - utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not read from {}", fname); - return -EIO; - } - - count += bytes; - buf += bytes; - offset += bytes; - } while (count < size); - - if (count != size) { + if (size == 0) { + return 0; + } + + uint32_t bufId = 0; + if (slash_qdma_qpair_buffer_register(qpairFd, buffer, size, &bufId, nullptr) != 0) { + utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, + "Could not register QDMA read buffer for {}", fname); + return -EIO; + } + + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, bufId, 0, base, size, SLASH_QDMA_XFER_C2H); + (void)slash_qdma_qpair_buffer_unregister(qpairFd, bufId); + if (rc != (ssize_t)size) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not read from {}", fname); return -EIO; } - return count; + return rc; } void QdmaIntf::write_buff(char* buffer, uint64_t start_addr, uint64_t size) { diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h index c464ce81..1bad765b 100644 --- a/vrt/vrtd/include/vrtd/wire.h +++ b/vrt/vrtd/include/vrtd/wire.h @@ -276,8 +276,9 @@ struct vrtd_resp_qdma_qpair_op { /** * @brief Request a read/write 
file descriptor for a QDMA qpair. * - * The qpair FD is sent out-of-band via SCM_RIGHTS when - * @ref vrtd_resp_header::ret == VRTD_RET_OK. + * One or more qpair FDs are sent out-of-band via SCM_RIGHTS when + * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports the + * number of descriptors attached. */ struct vrtd_req_qdma_qpair_get_fd { uint32_t dev_number; ///< Device index (0-based). @@ -307,6 +308,7 @@ struct vrtd_req_buffer_open { struct vrtd_resp_buffer_open { uint64_t size; ///< Allocated size in bytes (rounded up to subregion). uint64_t phys_addr; ///< Device physical address of the allocation. + uint32_t qpair_fd_count; ///< Number of qpair FDs sent via SCM_RIGHTS (1 or 2). } __attribute__((packed)); /** @@ -328,8 +330,9 @@ struct vrtd_resp_buffer_close { * Bypasses the allocator entirely — the caller is responsible for ensuring the * address is valid and not in use. Requires the @c raw-mem-access permission. * - * The qpair FD is sent out-of-band via SCM_RIGHTS when - * @ref vrtd_resp_header::ret == VRTD_RET_OK. + * One or more qpair FDs are sent out-of-band via SCM_RIGHTS when + * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports the + * number of descriptors attached. */ struct vrtd_req_buffer_open_raw { uint32_t dev_number; ///< Device index (0-based). @@ -340,7 +343,7 @@ struct vrtd_req_buffer_open_raw { } __attribute__((packed)); struct vrtd_resp_buffer_open_raw { - uint8_t zero; ///< Placeholder; all data is carried via SCM_RIGHTS. + uint32_t qpair_fd_count; ///< Number of qpair FDs sent via SCM_RIGHTS (1 or 2). } __attribute__((packed)); /** diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index 09c34a35..c8dffc67 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -316,9 +316,10 @@ enum vrtd_ret vrtd_qdma_qpair_del( ); /** - * @brief Obtain a read/write file descriptor for a QDMA qpair. 
+ * @brief Obtain an ioctl-only file descriptor for a QDMA qpair. * - * The descriptor can be used with read()/write() for C2H/H2C data transfer. + * The descriptor can be used with registered-buffer transfer ioctls for + * C2H/H2C data transfer. * * @param fd Connected vrtd socket file descriptor. * @param dev Device index (0‑based). @@ -547,7 +548,10 @@ struct vrtd_buffer { uint64_t size; uint64_t phys_addr; - int qpair_fd; + int qpair_fds[2]; + uint32_t qpair_fd_count; + uint32_t buf_id; + enum slash_qdma_transfer_hint transfer_hint; void *buf; /* Internal DMA granule for the local host mapping: 4096 or 2 MiB. */ uint64_t transfer_step_size; @@ -561,7 +565,8 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t alloc_arg, uint64_t size, uint64_t phys_addr, - int qpair_fd, + const int *qpair_fds, + uint32_t qpair_fd_count, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index cd3babd0..54ba151b 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -77,7 +77,7 @@ * * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with * -DSLASH_QDMA_TIMING=1), the sync_to/from_device paths log the wall-clock - * cost of each pwrite/pread syscall plus the aggregate per-sync time and + * cost of each transfer ioctl plus the aggregate per-sync time and * effective bandwidth. This is the userspace counterpart to the kernel's * SLASH_QDMA_TIMING and libqdma's QDMA_TIMING breakdowns. 
*/ @@ -131,9 +131,11 @@ static int vrtd_mmap_regular_base_pages(uint64_t size, void **addr_out) { return 0; } -static int vrtd_transfer_pages( - int fd, - void *buf, +static int vrtd_transfer_registered( + const int *qpair_fds, + uint32_t qpair_fd_count, + enum slash_qdma_transfer_hint transfer_hint, + uint32_t buf_id, uint64_t phys_addr, uint64_t offset, uint64_t size, @@ -142,11 +144,16 @@ static int vrtd_transfer_pages( ) { uint64_t max_chunk; uint64_t transferred = 0; + uint32_t direction = to_device ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; if (size == 0) { return 0; } + if (qpair_fds == NULL || qpair_fd_count == 0 || qpair_fds[0] < 0) { + return -EINVAL; + } + if (step == 0 || (offset % step) != 0 || (size % step) != 0) { return -EINVAL; } @@ -165,24 +172,39 @@ static int vrtd_transfer_pages( } while (done < chunk) { - size_t remaining = (size_t)(chunk - done); - off_t dev_offset = (off_t)(phys_addr + offset + transferred + done); - uint8_t *ptr = (uint8_t *)buf + offset + transferred + done; + uint64_t remaining = chunk - done; + uint64_t xfer_offset = offset + transferred + done; + uint64_t dev_offset = phys_addr + xfer_offset; + uint64_t xfer_size = remaining; + uint32_t fd_index = 0; ssize_t ret; - if (to_device) { - ret = pwrite(fd, ptr, remaining, dev_offset); - } else { - ret = pread(fd, ptr, remaining, dev_offset); + if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR && + qpair_fd_count > 1 && qpair_fds[1] >= 0 && + remaining > step) { + /* + * Use the second qpair for the upper half of each large chunk. + * This keeps ranges disjoint while preserving alignment. 
+ */ + uint64_t half = (chunk / 2) - ((chunk / 2) % step); + if (half != 0 && done < half) { + xfer_size = half - done; + } else if (half != 0) { + fd_index = 1; + } } - if (ret < 0 && errno == EINTR) { - continue; + ret = slash_qdma_qpair_transfer( + qpair_fds[fd_index], buf_id, xfer_offset, + dev_offset, xfer_size, direction); + + if (ret < 0) { + return -EIO; } - if (ret <= 0) { + if (ret == 0) { return -EIO; } - done += (uint64_t)ret; + done += (uint64_t) ret; } transferred += chunk; @@ -191,6 +213,34 @@ static int vrtd_transfer_pages( return 0; } +static int vrtd_transfer_temporary_mapping( + const struct vrtd_buffer *buffer, + void *mapping, + uint64_t phys_addr, + uint64_t size, + bool to_device +) { + uint32_t buf_id = 0; + enum slash_qdma_transfer_hint hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; + int ret; + + if (buffer == NULL || mapping == NULL || buffer->qpair_fd_count == 0) { + return -EINVAL; + } + + if (slash_qdma_qpair_buffer_register(buffer->qpair_fds[0], mapping, size, + &buf_id, &hint) != 0) { + return -EIO; + } + + ret = vrtd_transfer_registered(buffer->qpair_fds, buffer->qpair_fd_count, + hint, buf_id, phys_addr, 0, size, + BASE_TRANSFER_STEP_SIZE, to_device); + + (void)slash_qdma_qpair_buffer_unregister(buffer->qpair_fds[0], buf_id); + return ret; +} + enum vrtd_ret vrtd_buffer_create_raw( int sock_fd, uint32_t dev, @@ -199,7 +249,8 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t alloc_arg, uint64_t size, uint64_t phys_addr, - int qpair_fd, + const int *qpair_fds, + uint32_t qpair_fd_count, enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { @@ -214,6 +265,16 @@ enum vrtd_ret vrtd_buffer_create_raw( buffer->buf = MAP_FAILED; buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; + buffer->qpair_fds[0] = -1; + buffer->qpair_fds[1] = -1; + buffer->qpair_fd_count = 0; + buffer->buf_id = 0; + buffer->transfer_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; + + if (qpair_fds == NULL || qpair_fd_count == 0 || qpair_fd_count > 2 
|| qpair_fds[0] < 0) { + free(buffer); + return VRTD_RET_BAD_LIB_CALL; + } if (page_mode == VRTD_HOST_PAGE_2M) { /* @@ -290,7 +351,18 @@ enum vrtd_ret vrtd_buffer_create_raw( buffer->alloc_arg = alloc_arg; buffer->size = size; buffer->phys_addr = phys_addr; - buffer->qpair_fd = qpair_fd; + buffer->qpair_fd_count = qpair_fd_count; + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + buffer->qpair_fds[i] = qpair_fds[i]; + } + + if (slash_qdma_qpair_buffer_register( + buffer->qpair_fds[0], buffer->buf, buffer->size, + &buffer->buf_id, &buffer->transfer_hint) != 0) { + (void) munmap(buffer->buf, buffer->size); + free(buffer); + return VRTD_RET_INTERNAL_ERROR; + } *buffer_out = buffer; @@ -359,8 +431,18 @@ enum vrtd_ret vrtd_buffer_destroy( return VRTD_RET_BAD_LIB_CALL; } - if (buffer->qpair_fd >= 0) { - (void) close(buffer->qpair_fd); + for (uint32_t i = 0; i < buffer->qpair_fd_count && i < 2; ++i) { + if (buffer->qpair_fds[i] >= 0) { + (void) slash_qdma_qpair_buffer_unregister(buffer->qpair_fds[i], buffer->buf_id); + break; + } + } + + for (uint32_t i = 0; i < buffer->qpair_fd_count && i < 2; ++i) { + if (buffer->qpair_fds[i] >= 0) { + (void) close(buffer->qpair_fds[i]); + buffer->qpair_fds[i] = -1; + } } if (buffer->buf != NULL) { @@ -418,7 +500,8 @@ enum vrtd_ret vrtd_buffer_sync_to_device( return VRTD_RET_INVALID_ARGUMENT; } - assert(buffer->qpair_fd >= 0); + assert(buffer->qpair_fd_count > 0); + assert(buffer->qpair_fds[0] >= 0); assert(buffer->buf != NULL); uint64_t aligned_offset = 0; uint64_t aligned_size = 0; @@ -445,18 +528,18 @@ enum vrtd_ret vrtd_buffer_sync_to_device( return VRTD_RET_INTERNAL_ERROR; } - transfer_ret = vrtd_transfer_pages( - buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, - 0, aligned_size, BASE_TRANSFER_STEP_SIZE, false); + transfer_ret = vrtd_transfer_temporary_mapping( + buffer, bounce, buffer->phys_addr + aligned_offset, + aligned_size, false); if (transfer_ret == 0) { memcpy( (uint8_t *)bounce + (offset - aligned_offset), 
(uint8_t *)buffer->buf + offset, size ); - transfer_ret = vrtd_transfer_pages( - buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, - 0, aligned_size, BASE_TRANSFER_STEP_SIZE, true); + transfer_ret = vrtd_transfer_temporary_mapping( + buffer, bounce, buffer->phys_addr + aligned_offset, + aligned_size, true); } (void) munmap(bounce, aligned_size); } else { @@ -465,8 +548,9 @@ enum vrtd_ret vrtd_buffer_sync_to_device( * granule for a read-modify-write, so keep the historical behavior: * expand partial syncs to the backing DMA granule. */ - transfer_ret = vrtd_transfer_pages( - buffer->qpair_fd, buffer->buf, buffer->phys_addr, + transfer_ret = vrtd_transfer_registered( + buffer->qpair_fds, buffer->qpair_fd_count, buffer->transfer_hint, + buffer->buf_id, buffer->phys_addr, aligned_offset, aligned_size, step, true); } if (transfer_ret != 0) { @@ -503,7 +587,8 @@ enum vrtd_ret vrtd_buffer_sync_from_device( return VRTD_RET_INVALID_ARGUMENT; } - assert(buffer->qpair_fd >= 0); + assert(buffer->qpair_fd_count > 0); + assert(buffer->qpair_fds[0] >= 0); assert(buffer->buf != NULL); uint64_t aligned_offset = 0; uint64_t aligned_size = 0; @@ -530,9 +615,9 @@ enum vrtd_ret vrtd_buffer_sync_from_device( return VRTD_RET_INTERNAL_ERROR; } - transfer_ret = vrtd_transfer_pages( - buffer->qpair_fd, bounce, buffer->phys_addr + aligned_offset, - 0, aligned_size, BASE_TRANSFER_STEP_SIZE, false); + transfer_ret = vrtd_transfer_temporary_mapping( + buffer, bounce, buffer->phys_addr + aligned_offset, + aligned_size, false); if (transfer_ret == 0) { memcpy( (uint8_t *)buffer->buf + offset, @@ -542,8 +627,9 @@ enum vrtd_ret vrtd_buffer_sync_from_device( } (void) munmap(bounce, aligned_size); } else { - transfer_ret = vrtd_transfer_pages( - buffer->qpair_fd, buffer->buf, buffer->phys_addr, + transfer_ret = vrtd_transfer_registered( + buffer->qpair_fds, buffer->qpair_fd_count, buffer->transfer_hint, + buffer->buf_id, buffer->phys_addr, aligned_offset, aligned_size, step, false); } if 
(transfer_ret != 0) { diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index cad284b5..8a1b1b09 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -56,13 +56,13 @@ #include /** - * vrtd_recv_response() - Receive a response message from the daemon. + * vrtd_recv_response_fds() - Receive a response message from the daemon. * @fd: Connection socket. * @resp_body_buf: Buffer for the response body (may be NULL if no body expected). * @resp_bufsz: Size of @resp_body_buf. - * @resp_fd: If non-NULL, receives an out-of-band file descriptor - * sent by the daemon via SCM_RIGHTS (e.g. a BAR fd or - * QDMA qpair fd). Set to -1 if no fd was received. + * @resp_fds: Optional array receiving out-of-band file descriptors. + * @max_resp_fds: Capacity of @resp_fds. + * @resp_fd_count: Optional output count of received fds. * * Uses recvmsg() with scatter-gather I/O: the header and body are read * into separate buffers in a single system call. MSG_CMSG_CLOEXEC @@ -70,11 +70,13 @@ * * Return: VRTD_RET_OK on success, or an error code. */ -static enum vrtd_ret vrtd_recv_response( +static enum vrtd_ret vrtd_recv_response_fds( int fd, void *resp_body_buf, size_t resp_bufsz, - int *resp_fd + int *resp_fds, + uint32_t max_resp_fds, + uint32_t *resp_fd_count ) { struct vrtd_resp_header rh = {0}; @@ -85,16 +87,21 @@ static enum vrtd_ret vrtd_recv_response( riov[1].iov_base = resp_body_buf; riov[1].iov_len = resp_bufsz; - char cbuf[CMSG_SPACE(sizeof(int))]; + char cbuf[CMSG_SPACE(2 * sizeof(int))]; struct msghdr rmsg = { .msg_iov = riov, .msg_iovlen = resp_bufsz ? 2 : 1, - .msg_control = resp_fd ? cbuf : NULL, - .msg_controllen = resp_fd ? sizeof(cbuf) : 0, + .msg_control = resp_fds ? cbuf : NULL, + .msg_controllen = resp_fds ? 
sizeof(cbuf) : 0, }; - if (resp_fd) { - *resp_fd = -1; + if (resp_fd_count) { + *resp_fd_count = 0; + } + if (resp_fds) { + for (uint32_t i = 0; i < max_resp_fds; ++i) { + resp_fds[i] = -1; + } } ssize_t rn = recvmsg(fd, &rmsg, MSG_CMSG_CLOEXEC); @@ -118,11 +125,19 @@ static enum vrtd_ret vrtd_recv_response( return VRTD_RET_BAD_CONN; } - /* Extract file descriptor from SCM_RIGHTS ancillary data, if any. */ + /* Extract file descriptors from SCM_RIGHTS ancillary data, if any. */ for (struct cmsghdr *c = CMSG_FIRSTHDR(&rmsg); c != NULL; c = CMSG_NXTHDR(&rmsg, c)) { if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_RIGHTS && c->cmsg_len >= CMSG_LEN(sizeof(int))) { - assert(resp_fd != NULL); - memcpy(resp_fd, CMSG_DATA(c), sizeof(int)); + assert(resp_fds != NULL); + size_t payload = c->cmsg_len - CMSG_LEN(0); + uint32_t n = (uint32_t)(payload / sizeof(int)); + if (n > max_resp_fds) { + n = max_resp_fds; + } + memcpy(resp_fds, CMSG_DATA(c), n * sizeof(int)); + if (resp_fd_count) { + *resp_fd_count = n; + } break; } } @@ -130,6 +145,22 @@ static enum vrtd_ret vrtd_recv_response( return (enum vrtd_ret) rh.ret; } +static enum vrtd_ret vrtd_recv_response( + int fd, + void *resp_body_buf, + size_t resp_bufsz, + int *resp_fd +) +{ + uint32_t count = 0; + enum vrtd_ret ret = vrtd_recv_response_fds( + fd, resp_body_buf, resp_bufsz, resp_fd, resp_fd ? 
1u : 0u, &count); + if (resp_fd && count == 0) { + *resp_fd = -1; + } + return ret; +} + int vrtd_connect(const char *path) { if (path == NULL) { @@ -232,6 +263,60 @@ enum vrtd_ret vrtd_raw_request( return vrtd_recv_response(fd, resp_body_buf, resp_bufsz, resp_fd); } +static enum vrtd_ret vrtd_raw_request_fds( + int fd, + uint16_t opcode, + const void *req_body, uint16_t req_size, + void *resp_body_buf, size_t resp_bufsz, + int *resp_fds, uint32_t max_resp_fds, uint32_t *resp_fd_count, + const int *req_fd +) +{ + if (req_size > VRTD_MSG_MAX_SIZE - sizeof(struct vrtd_req_header)) { errno = EMSGSIZE; return -1; } + + struct vrtd_req_header h = { + .size = req_size, + .opcode= opcode, + .seqno = 1, + }; + + struct iovec siov[2]; + siov[0].iov_base = &h; + siov[0].iov_len = sizeof(h); + siov[1].iov_base = (void*) req_body; + siov[1].iov_len = req_size; + + char cbuf[CMSG_SPACE(sizeof(int))]; + struct msghdr smsg = { + .msg_iov = siov, + .msg_iovlen = req_size ? 2 : 1, + .msg_control = NULL, + .msg_controllen = 0, + }; + + if (req_fd && *req_fd >= 0) { + smsg.msg_control = cbuf; + smsg.msg_controllen = sizeof(cbuf); + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&smsg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), req_fd, sizeof(int)); + } + + ssize_t sn = sendmsg(fd, &smsg, MSG_NOSIGNAL); + if (sn == -1) { + return VRTD_RET_BAD_CONN; + } + if ((size_t) sn != sizeof(h) + req_size) { + return VRTD_RET_BAD_CONN; + } + + return vrtd_recv_response_fds(fd, resp_body_buf, resp_bufsz, + resp_fds, max_resp_fds, resp_fd_count); +} + enum vrtd_ret vrtd_get_num_devices(int fd, uint32_t *out) { @@ -488,16 +573,24 @@ enum vrtd_ret vrtd_buffer_open( }; struct vrtd_resp_buffer_open resp = {0}; - int qpair_fd = -1; - int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN, - &req, sizeof(req), - &resp, sizeof(resp), - &qpair_fd, NULL); + int qpair_fds[2] = {-1, -1}; + uint32_t qpair_fd_count = 0; + int ret = 
vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN, + &req, sizeof(req), + &resp, sizeof(resp), + qpair_fds, 2, &qpair_fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd < 0) { + if (qpair_fd_count == 0 || qpair_fd_count > 2 || + resp.qpair_fd_count == 0 || resp.qpair_fd_count > qpair_fd_count || + qpair_fds[0] < 0) { + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + if (qpair_fds[i] >= 0) { + (void) close(qpair_fds[i]); + } + } return VRTD_RET_INTERNAL_ERROR; } @@ -509,12 +602,17 @@ enum vrtd_ret vrtd_buffer_open( alloc_arg, resp.size, resp.phys_addr, - qpair_fd, + qpair_fds, + resp.qpair_fd_count, page_mode, buffer_out ); if (ret != VRTD_RET_OK) { - (void) close(qpair_fd); + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + if (qpair_fds[i] >= 0) { + (void) close(qpair_fds[i]); + } + } return ret; } @@ -546,16 +644,24 @@ enum vrtd_ret vrtd_buffer_open_raw( }; struct vrtd_resp_buffer_open_raw resp = {0}; - int qpair_fd = -1; - int ret = vrtd_raw_request(fd, VRTD_REQ_BUFFER_OPEN_RAW, - &req, sizeof(req), - &resp, sizeof(resp), - &qpair_fd, NULL); + int qpair_fds[2] = {-1, -1}; + uint32_t qpair_fd_count = 0; + int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN_RAW, + &req, sizeof(req), + &resp, sizeof(resp), + qpair_fds, 2, &qpair_fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd < 0) { + if (qpair_fd_count == 0 || qpair_fd_count > 2 || + resp.qpair_fd_count == 0 || resp.qpair_fd_count > qpair_fd_count || + qpair_fds[0] < 0) { + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + if (qpair_fds[i] >= 0) { + (void) close(qpair_fds[i]); + } + } return VRTD_RET_INTERNAL_ERROR; } @@ -567,12 +673,17 @@ enum vrtd_ret vrtd_buffer_open_raw( 0, /* alloc_arg: not used for raw buffers */ size, phys_addr, - qpair_fd, + qpair_fds, + resp.qpair_fd_count, page_mode, buffer_out ); if (ret != VRTD_RET_OK) { - (void) close(qpair_fd); + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + if (qpair_fds[i] >= 0) { + (void) close(qpair_fds[i]); + } 
+ } return ret; } diff --git a/vrt/vrtd/libvrtdpp/src/buffer.cpp b/vrt/vrtd/libvrtdpp/src/buffer.cpp index 0c68ff22..9242c6b2 100644 --- a/vrt/vrtd/libvrtdpp/src/buffer.cpp +++ b/vrt/vrtd/libvrtdpp/src/buffer.cpp @@ -123,7 +123,7 @@ void *Buffer::data() noexcept int Buffer::getFd() const noexcept { - return buffer ? buffer->qpair_fd : -1; + return buffer ? buffer->qpair_fds[0] : -1; } int Buffer::releaseFd() noexcept @@ -131,8 +131,8 @@ int Buffer::releaseFd() noexcept if (buffer == nullptr) { return -1; } - int ret = buffer->qpair_fd; - buffer->qpair_fd = -1; + int ret = buffer->qpair_fds[0]; + buffer->qpair_fds[0] = -1; return ret; } @@ -151,24 +151,12 @@ bool Buffer::isClosed() const noexcept std::fstream Buffer::fstream(std::ios_base::openmode mode) const { + (void)mode; if (isClosed()) { throw std::runtime_error("Buffer is closed"); } - int fd = getFd(); - if (fd < 0) { - throw std::runtime_error("Buffer FD is invalid"); - } - - std::string path = "/proc/self/fd/" + std::to_string(fd); - - std::fstream stream; - stream.open(path, mode); - if (!stream.is_open()) { - throw std::runtime_error("Failed to open fstream for buffer"); - } - - return stream; + throw std::runtime_error("Buffer qpair fds are ioctl-only; use syncToDevice/syncFromDevice"); } void Buffer::syncToDevice(uint64_t offset, uint64_t size) diff --git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c index 9bfd6c52..b65fa185 100644 --- a/vrt/vrtd/src/buffer.c +++ b/vrt/vrtd/src/buffer.c @@ -52,7 +52,12 @@ #define VRTD_QDMA_Q_MODE_MM 0u /* Memory-mapped (MM) mode */ #define VRTD_QDMA_DIR_H2C (1u << 0) /* Host-to-Card direction */ #define VRTD_QDMA_DIR_C2H (1u << 1) /* Card-to-Host direction */ -#define VRTD_QDMA_RING_SZ_IDX 0u /* Default ring size index */ +/* + * TODO: make this a vrtd.conf setting. Index 15 is the largest QDMA descriptor + * ring and gives the best sustained transfer speed, but it consumes more + * host-side DMA-coherent memory per queue. 
+ */ +#define VRTD_QDMA_RING_SZ_IDX 15u /* Default ring size index */ /** * Initialise a buffer: allocate device memory, create a QDMA queue pair, @@ -101,8 +106,9 @@ static int buffer_init(struct buffer *buf, .client_id = client_id, .addr = 0, .size = 0, - .qid = 0, - .fd = -1, + .qpair_count = 0, + .qids = {0}, + .fds = {-1, -1}, .allocation_valid = false, .qpair_created = false, }; @@ -172,47 +178,49 @@ static int buffer_init(struct buffer *buf, buf->size = alloc_size; buf->allocation_valid = true; - /* Step 2: Configure and create a QDMA queue pair. If the caller - * supplied custom qpair parameters (e.g. streaming mode), use those; - * otherwise default to memory-mapped mode with the smallest ring size. */ - struct slash_qdma_qpair_add qpair = {0}; - if (qpair_params != NULL) { - qpair = *qpair_params; - } else { - qpair.mode = VRTD_QDMA_Q_MODE_MM; - qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; - } - qpair.dir_mask = dir_mask; - qpair.mm_channel = mm_channel; - qpair.size = sizeof(qpair); + /* Steps 2-4: create/start queue pairs and obtain their fds. Current + * SLASH hardware benefits from two qpairs per registered buffer; future + * backends may choose to send only one. 
*/ + for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + struct slash_qdma_qpair_add qpair = {0}; + + if (qpair_params != NULL) { + qpair = *qpair_params; + } else { + qpair.mode = VRTD_QDMA_Q_MODE_MM; + qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; + } + qpair.dir_mask = dir_mask; + qpair.mm_channel = mm_channel; + qpair.size = sizeof(qpair); - if (slash_qdma_qpair_add(qdma, &qpair) != 0) { - LOG(LOG_ERR, "Failed to add buffer qpair: %m"); - goto fail; - } + if (slash_qdma_qpair_add(qdma, &qpair) != 0) { + LOG(LOG_ERR, "Failed to add buffer qpair %u: %m", (unsigned int)i); + goto fail; + } - buf->qid = qpair.qid; - buf->qpair_created = true; + buf->qids[i] = qpair.qid; + buf->qpair_count = i + 1; + buf->qpair_created = true; - /* Step 3: Start the queue pair so DMA transfers can be issued. */ - if (slash_qdma_qpair_start(qdma, buf->qid) != 0) { - LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", buf->qid); - goto fail; - } + if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) { + LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", qpair.qid); + goto fail; + } - /* Step 4: Obtain a file descriptor for the queue. The client will use - * this fd (passed over the Unix socket via SCM_RIGHTS) to perform - * read/write/mmap against the QDMA queue. 
*/ - int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "Failed to get fd for buffer qpair %u: %m", buf->qid); - goto fail; + int fd = slash_qdma_qpair_get_fd(qdma, qpair.qid, O_CLOEXEC); + if (fd < 0) { + LOG(LOG_ERR, "Failed to get fd for buffer qpair %u: %m", qpair.qid); + goto fail; + } + buf->fds[i] = fd; } - buf->fd = fd; - LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid); + LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)buf->addr, (unsigned long long)buf->size, + (unsigned int)buf->qpair_count); return 0; fail: @@ -306,46 +314,52 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, .client_id = 0, .addr = phys_addr, .size = size, - .qid = 0, - .fd = -1, + .qpair_count = 0, + .qids = {0}, + .fds = {-1, -1}, .allocation_valid = false, /* no allocator reservation to free */ .qpair_created = false, }; - struct slash_qdma_qpair_add qpair = {0}; - qpair.mode = VRTD_QDMA_Q_MODE_MM; - qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; - qpair.dir_mask = dir_mask; - qpair.mm_channel = mm_channel; - qpair.size = sizeof(qpair); - - if (slash_qdma_qpair_add(qdma, &qpair) != 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to add qpair: %m"); - free(buf); - return NULL; - } + for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + struct slash_qdma_qpair_add qpair = {0}; - buf->qid = qpair.qid; - buf->qpair_created = true; + qpair.mode = VRTD_QDMA_Q_MODE_MM; + qpair.h2c_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; + qpair.dir_mask = dir_mask; + qpair.mm_channel = mm_channel; + qpair.size = sizeof(qpair); + + if (slash_qdma_qpair_add(qdma, &qpair) != 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to add qpair %u: %m", (unsigned 
int)i); + cleanup_buffer(buf); + return NULL; + } - if (slash_qdma_qpair_start(qdma, buf->qid) != 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", buf->qid); - cleanup_buffer(buf); - return NULL; - } + buf->qids[i] = qpair.qid; + buf->qpair_count = i + 1; + buf->qpair_created = true; - int fd = slash_qdma_qpair_get_fd(qdma, buf->qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to get fd for qpair %u: %m", buf->qid); - cleanup_buffer(buf); - return NULL; + if (slash_qdma_qpair_start(qdma, qpair.qid) != 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to start qpair %u: %m", qpair.qid); + cleanup_buffer(buf); + return NULL; + } + + int fd = slash_qdma_qpair_get_fd(qdma, qpair.qid, O_CLOEXEC); + if (fd < 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to get fd for qpair %u: %m", qpair.qid); + cleanup_buffer(buf); + return NULL; + } + buf->fds[i] = fd; } - buf->fd = fd; - LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qid=%u", - (unsigned long long)phys_addr, (unsigned long long)size, buf->qid); + LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)phys_addr, (unsigned long long)size, + (unsigned int)buf->qpair_count); return buf; } @@ -353,12 +367,12 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, * Tear down a buffer and release all associated resources. * * Resources are released in reverse acquisition order: - * 1. Close the file descriptor (if open). - * 2. Stop and delete the QDMA queue pair (if created). + * 1. Close the file descriptors (if open). + * 2. Stop and delete the QDMA queue pairs (if created). * 3. Free the device memory allocation (if valid). * 4. Zero all fields and free the struct. * - * Each step is guarded by its corresponding flag (fd >= 0, + * Each step is guarded by its corresponding flag (fds[] >= 0, * qpair_created, allocation_valid) so this function is safe to call * after partial initialisation. NULL-safe. 
*/ @@ -368,31 +382,33 @@ void cleanup_buffer(struct buffer *buf) return; } - LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qid=%u", (unsigned long long)buf->addr, (unsigned long long)buf->size, buf->qid); + LOG(LOG_DEBUG, "Freeing buffer addr=0x%llx size=%llu qpairs=%u", + (unsigned long long)buf->addr, (unsigned long long)buf->size, + (unsigned int)buf->qpair_count); - /* Close the QDMA queue fd first, before stopping the queue. */ - if (buf->fd >= 0) { - (void) close(buf->fd); - buf->fd = -1; + /* Close the QDMA queue fds first, before stopping the queues. */ + for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + if (buf->fds[i] >= 0) { + (void) close(buf->fds[i]); + buf->fds[i] = -1; + } } - /* Stop and delete the QDMA queue pair. Errors are logged but + /* Stop and delete the QDMA queue pairs. Errors are logged but * otherwise ignored -- we are on the teardown path and must continue * releasing remaining resources. */ if (buf->qpair_created && buf->qdma != NULL) { - if (slash_qdma_qpair_stop(buf->qdma, buf->qid) != 0) { - LOG( - LOG_WARNING, - "Error stopping buffer qpair %u: %m (ignored)", - buf->qid - ); - } - if (slash_qdma_qpair_del(buf->qdma, buf->qid) != 0) { - LOG( - LOG_WARNING, - "Error deleting buffer qpair %u: %m (ignored)", - buf->qid - ); + for (uint32_t i = 0; i < buf->qpair_count; ++i) { + if (slash_qdma_qpair_stop(buf->qdma, buf->qids[i]) != 0) { + LOG(LOG_WARNING, + "Error stopping buffer qpair %u: %m (ignored)", + buf->qids[i]); + } + if (slash_qdma_qpair_del(buf->qdma, buf->qids[i]) != 0) { + LOG(LOG_WARNING, + "Error deleting buffer qpair %u: %m (ignored)", + buf->qids[i]); + } } } @@ -422,8 +438,11 @@ void cleanup_buffer(struct buffer *buf) buf->allocation_valid = false; buf->addr = 0; buf->size = 0; - buf->qid = 0; - buf->fd = -1; + buf->qpair_count = 0; + memset(buf->qids, 0, sizeof(buf->qids)); + for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + buf->fds[i] = -1; + } free(buf); } diff --git 
a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h index 5a49e733..4f59759f 100644 --- a/vrt/vrtd/src/buffer.h +++ b/vrt/vrtd/src/buffer.h @@ -48,6 +48,8 @@ #include "array.h" #include "vrtd/wire.h" +#define VRTD_BUFFER_MAX_QPAIR_FDS 2u + /** * @brief A single DMA buffer allocated on a SLASH FPGA device. * @@ -72,11 +74,13 @@ struct buffer { uint64_t addr; /** @brief Size of the allocated memory region in bytes (rounded up to subregion granularity). */ uint64_t size; - /** @brief QDMA queue ID assigned to this buffer's queue pair. */ - uint32_t qid; - /** @brief File descriptor for the QDMA queue pair character device. + /** @brief Number of QDMA queue pairs created for this buffer. */ + uint32_t qpair_count; + /** @brief QDMA queue IDs assigned to this buffer's queue pairs. */ + uint32_t qids[VRTD_BUFFER_MAX_QPAIR_FDS]; + /** @brief File descriptors for the QDMA queue pairs. * Passed to the client via SCM_RIGHTS for direct data transfer. */ - int fd; + int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; /** @brief True if the address-space allocation in the memory map is valid and must be freed. */ bool allocation_valid; /** @brief True if the QDMA queue pair has been created and must be torn down on cleanup. */ diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c index 299c72ff..5839efe7 100644 --- a/vrt/vrtd/src/serve.c +++ b/vrt/vrtd/src/serve.c @@ -761,7 +761,7 @@ static int client_handle_in(struct client *client) * Allocate a cmsg buffer large enough for one fd. * CMSG_SPACE includes alignment padding required by the kernel. */ - char cbuf[CMSG_SPACE(sizeof(int))]; + char cbuf[CMSG_SPACE(2 * sizeof(int))]; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -895,17 +895,24 @@ static int client_handle_out(struct client *client) * The cbuf is zeroed to satisfy kernel expectations about padding. */ if (client->have_out_fd) { + uint32_t fd_count = client->out_fd_count ? 
client->out_fd_count : 1; + + if (fd_count > 2) { + LOG(LOG_ERR, "Invalid outbound fd count %u", (unsigned int)fd_count); + return -1; + } + memset(cbuf, 0, sizeof cbuf); msg.msg_control = cbuf; - msg.msg_controllen = sizeof cbuf; + msg.msg_controllen = CMSG_SPACE(fd_count * sizeof(int)); struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + cmsg->cmsg_len = CMSG_LEN(fd_count * sizeof(int)); - memcpy(CMSG_DATA(cmsg), &client->out_fd, sizeof(int)); + memcpy(CMSG_DATA(cmsg), client->out_fds, fd_count * sizeof(int)); } ssize_t n; @@ -937,6 +944,7 @@ static int client_handle_out(struct client *client) /* Response sent -- clear state so the client can send a new request. */ client->have_response = false; client->have_out_fd = false; + client->out_fd_count = 0; return 0; } @@ -1040,7 +1048,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_get_bar_fd), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1082,7 +1090,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_qdma_qpair_get_fd), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1094,7 +1102,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -1106,7 +1114,7 @@ static int client_handle_request(struct client *client) req_header->size, CLIENT_OUT_BODY(*client, vrtd_resp_buffer_open_raw), &size, - &client->out_fd, + &client->out_fds[0], &client->have_out_fd ); break; @@ -2016,14 +2024,19 @@ static uint16_t client_handle_request_buffer_open( return VRTD_RET_INTERNAL_ERROR; } - if (buf->fd < 0) { - LOG(LOG_ERR, "Buffer created without valid fd"); + if (buf->qpair_count == 0 || 
buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || + buf->fds[0] < 0) { + LOG(LOG_ERR, "Buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } uint64_t real_size = buf->size; - int fd = buf->fd; uint64_t phys_addr = buf->addr; + uint32_t qpair_fd_count = buf->qpair_count; + int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + fds[i] = buf->fds[i]; + } /* * Transfer ownership of the buffer into the device's buffer list. @@ -2037,7 +2050,12 @@ static uint16_t client_handle_request_buffer_open( resp_body->size = real_size; resp_body->phys_addr = phys_addr; - *out_fd = fd; + resp_body->qpair_fd_count = qpair_fd_count; + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + client->out_fds[i] = fds[i]; + } + client->out_fd_count = qpair_fd_count; + *out_fd = fds[0]; *have_out_fd = true; *resp_size = sizeof(*resp_body); @@ -2140,20 +2158,29 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_INTERNAL_ERROR; } - if (buf->fd < 0) { - LOG(LOG_ERR, "Raw buffer created without valid fd"); + if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || + buf->fds[0] < 0) { + LOG(LOG_ERR, "Raw buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } - int fd = buf->fd; + uint32_t qpair_fd_count = buf->qpair_count; + int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + fds[i] = buf->fds[i]; + } if (buffer_ptr_array_push_move(&d->buffers, &buf) != 0) { LOG(LOG_ERR, "Failed to add raw buffer to device buffer list"); return VRTD_RET_INTERNAL_ERROR; } - resp_body->zero = 0; - *out_fd = fd; + resp_body->qpair_fd_count = qpair_fd_count; + for (uint32_t i = 0; i < qpair_fd_count; ++i) { + client->out_fds[i] = fds[i]; + } + client->out_fd_count = qpair_fd_count; + *out_fd = fds[0]; *have_out_fd = true; *resp_size = sizeof(*resp_body); diff --git a/vrt/vrtd/src/serve.h b/vrt/vrtd/src/serve.h index 55cdd9ba..8d4e728c 100644 --- a/vrt/vrtd/src/serve.h 
+++ b/vrt/vrtd/src/serve.h @@ -73,9 +73,11 @@ struct client { /** @brief True when @c in_fd contains a valid received file descriptor. */ bool have_in_fd; - /** @brief File descriptor to send back to the client via SCM_RIGHTS ancillary data. */ - int out_fd; - /** @brief True when @c out_fd contains a valid file descriptor to transmit. */ + /** @brief File descriptors to send back to the client via SCM_RIGHTS ancillary data. */ + int out_fds[2]; + /** @brief Number of valid descriptors in @c out_fds. */ + uint32_t out_fd_count; + /** @brief True when @c out_fds contains at least one valid file descriptor to transmit. */ bool have_out_fd; /** @brief True when a complete request has been read into @c inb and is awaiting dispatch. */ diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp index a3c96b70..e2fbfd59 100644 --- a/vrt/vrtd/tests/buffer_test.cpp +++ b/vrt/vrtd/tests/buffer_test.cpp @@ -36,6 +36,30 @@ static constexpr const char *REAL_QDMA_PATH = "/dev/slash_qdma_ctl0"; static constexpr uint64_t XFER_SIZE = 4096; static constexpr uint64_t CLIENT_ID = 42; +static void qpair_fd_round_trip(int fd, uint64_t addr, const uint8_t *src, uint8_t *dst) +{ + uint8_t write_buf[XFER_SIZE]; + uint8_t read_buf[XFER_SIZE]{}; + std::memcpy(write_buf, src, XFER_SIZE); + + uint32_t write_id = 0; + uint32_t read_id = 0; + ASSERT_EQ(slash_qdma_qpair_buffer_register(fd, write_buf, XFER_SIZE, &write_id, nullptr), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_register(fd, read_buf, XFER_SIZE, &read_id, nullptr), 0); + + ssize_t written = slash_qdma_qpair_transfer( + fd, write_id, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_H2C); + EXPECT_EQ(written, static_cast(XFER_SIZE)); + + ssize_t read_bytes = slash_qdma_qpair_transfer( + fd, read_id, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_C2H); + EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + std::memcpy(dst, read_buf, XFER_SIZE); + + EXPECT_EQ(slash_qdma_qpair_buffer_unregister(fd, write_id), 0); + 
EXPECT_EQ(slash_qdma_qpair_buffer_unregister(fd, read_id), 0); +} + // ─── Null / argument validation (no hardware needed, always run) ────────────── TEST(BufferNullTest, NullQdma) { @@ -156,18 +180,15 @@ TEST_P(BufferTest, LifecycleBidirectional) { VRTD_ALLOC_DIR_BIDIRECTIONAL, XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); ASSERT_NE(buf, nullptr); - EXPECT_GE(buf->fd, 0); + ASSERT_GE(buf->qpair_count, 1u); + EXPECT_GE(buf->fds[0], 0); uint8_t src[XFER_SIZE]; for (size_t i = 0; i < XFER_SIZE; ++i) src[i] = static_cast(i & 0xFF); - ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast(buf->addr)); - EXPECT_EQ(written, static_cast(XFER_SIZE)); - uint8_t dst[XFER_SIZE]{}; - ssize_t read_bytes = pread(buf->fd, dst, XFER_SIZE, static_cast(buf->addr)); - EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); + qpair_fd_round_trip(buf->fds[0], buf->addr, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); @@ -177,18 +198,15 @@ TEST_P(BufferTest, RawCreateAndIO) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE, VRTD_ALLOC_DIR_BIDIRECTIONAL, SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); - EXPECT_GE(buf->fd, 0); + ASSERT_GE(buf->qpair_count, 1u); + EXPECT_GE(buf->fds[0], 0); EXPECT_EQ(buf->addr, DDR_START_ADDRESS); EXPECT_FALSE(buf->allocation_valid); uint8_t src[XFER_SIZE]; std::memset(src, 0xCD, sizeof(src)); - ssize_t written = pwrite(buf->fd, src, XFER_SIZE, static_cast(DDR_START_ADDRESS)); - EXPECT_EQ(written, static_cast(XFER_SIZE)); - uint8_t dst[XFER_SIZE]{}; - ssize_t n = pread(buf->fd, dst, XFER_SIZE, static_cast(DDR_START_ADDRESS)); - EXPECT_EQ(n, static_cast(XFER_SIZE)); + qpair_fd_round_trip(buf->fds[0], DDR_START_ADDRESS, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); @@ -202,18 +220,18 @@ TEST_P(BufferTest, QueueExhaustion) { GTEST_SKIP() << "Queue exhaustion test is mock-only"; } - static constexpr int MAX_QUEUES = 64; + static constexpr int 
MAX_BUFFERS = 32; /* two mock queues per buffer */ std::vector bufs; - bufs.reserve(MAX_QUEUES); + bufs.reserve(MAX_BUFFERS); - for (int i = 0; i < MAX_QUEUES; ++i) { + for (int i = 0; i < MAX_BUFFERS; ++i) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS + i * XFER_SIZE, XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); - ASSERT_NE(buf, nullptr) << "Expected success for queue " << i; + ASSERT_NE(buf, nullptr) << "Expected success for buffer " << i; bufs.push_back(buf); } - /* 65th allocation must fail */ + /* 33rd allocation needs queues 65/66 and must fail. */ struct buffer *overflow = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(overflow, nullptr); From 37cdd755938ab8f5d905facc8d30c3b72e09e0fe Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 15 Jun 2026 10:59:38 +0100 Subject: [PATCH 20/23] qdma stack: change policy from dual-channel to the more complex v80 policy Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 13 +- driver/libslash/README.md | 2 +- driver/libslash/include/slash/qdma.h | 2 +- .../include/slash/uapi/slash_interface.h | 6 +- driver/libslash/src/qdma.c | 2 +- driver/libslash/src/qdma_mock.c | 2 +- driver/libslash/tests/qdma_test.cpp | 4 +- driver/slash_qdma.c | 2 +- driver/tests/test_slash_qdma.c | 8 +- scripts/package-ami.sh | 29 +++- vrt/vrtd/include/vrtd/wire.h | 16 +- vrt/vrtd/libvrtd/src/buffer.c | 112 ++++++++------ vrt/vrtd/libvrtd/src/v80_policy.h | 143 ++++++++++++++++++ vrt/vrtd/src/buffer.c | 47 +++++- vrt/vrtd/tests/CMakeLists.txt | 1 + vrt/vrtd/tests/v80_policy_test.cpp | 124 +++++++++++++++ 16 files changed, 433 insertions(+), 80 deletions(-) create mode 100644 vrt/vrtd/libvrtd/src/v80_policy.h create mode 100644 vrt/vrtd/tests/v80_policy_test.cpp diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index 8db53a2c..f018d401 100644 --- 
a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -760,15 +760,22 @@ auto-released when the final fd referencing that context is closed (including on ``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be best for this registered buffer on the current hardware. Current SLASH hardware returns -``SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR``; userspace may ignore this value. Known values are: +``SLASH_QDMA_TRANSFER_HINT_V80``; userspace may ignore this value. Known values are: .. code-block:: c enum slash_qdma_transfer_hint { SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, - SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR = 2, + SLASH_QDMA_TRANSFER_HINT_V80 = 2, }; +``SLASH_QDMA_TRANSFER_HINT_V80`` asks userspace to apply the V80 placement-aware channel policy: +spread a transfer across both AXI-MM channels so each NoC ingress master (NMU) drives an +independent memory endpoint (NSU). The marker is opaque; the client computes the actual split from +the buffer's device address (DDR ranges are halved across the two channels, while HBM ranges are +routed by the 16 GiB half-memory boundary). ``SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR`` keeps all +traffic on a single queue. + **Preconditions:** - ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL`` @@ -780,7 +787,7 @@ best for this registered buffer on the current hardware. Current SLASH hardware - ``buf_id`` is filled with the client-scoped handle, used in ``SLASH_QDMA_QPAIR_IOCTL_TRANSFER``. - ``transfer_hint`` is filled with an advisory transfer topology hint. Current SLASH hardware - returns ``SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR``. + returns ``SLASH_QDMA_TRANSFER_HINT_V80``. - The pages remain pinned and DMA-mapped until the buffer is unregistered or the owning control fd is closed. 
diff --git a/driver/libslash/README.md b/driver/libslash/README.md index 874931c0..2315691c 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -115,7 +115,7 @@ int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); uint32_t buf_id; enum slash_qdma_transfer_hint hint; slash_qdma_qpair_buffer_register(fd, buf, len, &buf_id, &hint); -/* Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR. +/* Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. * Pass NULL instead of &hint if the application does not care. */ /* H2C: host -> device at dev_addr */ diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index 72398b88..b7ef1531 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -171,7 +171,7 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); * * The buffer is owned by @qdma and is automatically released when the * handle is closed. Pass the returned @buf_id to slash_qdma_transfer(). - * Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR. + * Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. * * @return 0 on success, -1 on failure (errno set). */ diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index 977b8506..fd173909 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -259,8 +259,8 @@ enum slash_qdma_transfer_dir { * queue pair whose direction and ownership checks pass. */ enum slash_qdma_transfer_hint { - SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, /**< Prefer a single qpair. */ - SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR = 2, /**< Prefer two qpairs. */ + SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR = 1, /**< Prefer a single qpair (all traffic on one channel). */ + SLASH_QDMA_TRANSFER_HINT_V80 = 2, /**< Apply the V80 placement-aware channel policy. 
*/ }; /** @@ -281,7 +281,7 @@ enum slash_qdma_transfer_hint { * (including on process exit) if userspace forgets to unregister them. * * The kernel also returns @transfer_hint. Current SLASH hardware returns - * SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; userspace may ignore this field. + * SLASH_QDMA_TRANSFER_HINT_V80; userspace may ignore this field. */ struct slash_qdma_buf_register { __u32 size; /**< Struct size for ABI versioning. */ diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index 135b34af..6a0606ea 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -73,7 +73,7 @@ static int qpair_fallback_register(void *addr, uint64_t length, uint32_t *buf_id qpair_fallback_bufs[i].length = length; *buf_id = i; if (transfer_hint != NULL) { - *transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; + *transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; } return 0; } diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c index 7cf32616..394b7487 100644 --- a/driver/libslash/src/qdma_mock.c +++ b/driver/libslash/src/qdma_mock.c @@ -297,7 +297,7 @@ int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, *buf_id = (uint32_t) i; if (transfer_hint != NULL) { - *transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; + *transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; } return 0; diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index b2ad3a70..20ea4221 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -262,8 +262,8 @@ TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { enum slash_qdma_transfer_hint dst_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; ASSERT_EQ(slash_qdma_buffer_register(qdma_, src, XFER_SIZE, &src_buf, &src_hint), 0); ASSERT_EQ(slash_qdma_buffer_register(qdma_, dst, XFER_SIZE, &dst_buf, &dst_hint), 0); - EXPECT_EQ(src_hint, SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR); - EXPECT_EQ(dst_hint, 
SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR); + EXPECT_EQ(src_hint, SLASH_QDMA_TRANSFER_HINT_V80); + EXPECT_EQ(dst_hint, SLASH_QDMA_TRANSFER_HINT_V80); // H2C: push the source buffer to the device. ssize_t written = slash_qdma_transfer(qdma_, queue_fd, src_buf, 0, diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index d199344f..d0bbf760 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -3110,7 +3110,7 @@ static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, /* Copy the assigned buf_id back to userspace. */ req.size = sizeof(req); req.buf_id = buf_id; - req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR; + req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; copy_size = min_t(size_t, user_size, sizeof(req)); if (copy_to_user(uarg, &req, copy_size)) { xa_erase(&client->buffers, buf_id); diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index cd65ee40..2da35e7e 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -992,7 +992,7 @@ TEST_F(qdma, transfer_wrong_direction_returns_enodev) ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, &transfer_hint)); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, transfer_hint); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint); /* C2H is not enabled on this qpair. */ ret = qdma_buf_transfer(self->io_fd, buf_id, 0, @@ -1020,7 +1020,7 @@ TEST_F(qdma, transfer_out_of_range_returns_einval) ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, &transfer_hint)); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, transfer_hint); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint); /* Slice extends past the registered length. 
*/ ret = qdma_buf_transfer(self->io_fd, buf_id, TRANSFER_SIZE, @@ -1058,8 +1058,8 @@ TEST_F(qdma, registered_buffer_round_trip) &write_id, &write_hint)); ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, &read_id, &read_hint)); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, write_hint); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR, read_hint); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, write_hint); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, read_hint); ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, xfer_size, SLASH_QDMA_XFER_H2C); diff --git a/scripts/package-ami.sh b/scripts/package-ami.sh index f12cc6f6..66ff2833 100755 --- a/scripts/package-ami.sh +++ b/scripts/package-ami.sh @@ -33,14 +33,23 @@ ARTIFACTS_DIR="${ARTIFACTS_DIR:-$(pwd)/ami}" AMI_BUILD_DIR="$(pwd)/ami-build" AVED_DIR="$(pwd)/submodules/AVED" AMI_DIR="${AVED_DIR}/sw/AMI" -PKG_PY="${AMI_DIR}/scripts/package_data/pkg.py" -GEN_PKG_PY="${AMI_DIR}/scripts/gen_package.py" +AMI_SRC_DIR="${AMI_BUILD_DIR}/src/AMI" +AMI_OUTPUT_DIR="${AMI_BUILD_DIR}/pkg" +PKG_PY="${AMI_SRC_DIR}/scripts/package_data/pkg.py" +GEN_PKG_PY="${AMI_SRC_DIR}/scripts/gen_package.py" rm -rf "${AMI_BUILD_DIR}" mkdir -p "${ARTIFACTS_DIR}" +mkdir -p "$(dirname "${AMI_SRC_DIR}")" +cp -a "${AMI_DIR}" "${AMI_SRC_DIR}" -# Restore submodule files and clean up build directory on exit -trap 'git -C "${AVED_DIR}" checkout -- sw/AMI/scripts/package_data/pkg.py sw/AMI/scripts/gen_package.py; rm -rf "${AMI_BUILD_DIR}"' EXIT +# Clean up build directory on exit. Packaging patches a disposable AMI copy so +# this also works from source trees copied without usable submodule gitdirs. +trap 'rm -rf "${AMI_BUILD_DIR}"' EXIT + +# Avoid stale generated headers from copied build trees. gen_package.py will +# otherwise prefer api/build/ami_version.h over the checked-in version header. 
+rm -f "${AMI_SRC_DIR}/api/build/ami_version.h" # Patch in Rocky Linux support (RHEL-compatible, RPM-based) sed -i "/^DIST_ID_RHEL /a DIST_ID_ROCKY = 'rocky'" "${PKG_PY}" @@ -48,13 +57,17 @@ sed -i "/^ DIST_ID_RHEL,$/a\\ DIST_ID_ROCKY," "${PKG_PY}" sed -i "s/DIST_RPM = \[DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL\]/DIST_RPM = [DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_SLES, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${PKG_PY}" sed -i "s/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL\]/DIST_ID_CENTOS, DIST_ID_REDHAT, DIST_ID_REDHAT2, DIST_ID_RHEL, DIST_ID_ROCKY]/" "${GEN_PKG_PY}" -cd "${AMI_DIR}" +cd "${AMI_SRC_DIR}" # --no_driver skips a pre-flight driver compilation check (build+clean) only; # it does NOT affect which files are included in the package. # We skip it here so the packaging can run in environments (eg. containers) # that may not have linux-headers available to compile the driver. -python3 scripts/gen_package.py --no_driver -o "${AMI_BUILD_DIR}" +# +# --no_gen_version skips AVED's git-based version regeneration. This wrapper is +# often run from copied worktrees where the submodule .git file points back to a +# non-existent source checkout, causing an empty hash and an invalid RPM Release. 
+python3 scripts/gen_package.py --no_driver --no_gen_version -o "${AMI_OUTPUT_DIR}" # Copy only the package files to the artifacts directory -cp "${AMI_BUILD_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \ -cp "${AMI_BUILD_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true +cp "${AMI_OUTPUT_DIR}"/*.rpm "${ARTIFACTS_DIR}/" 2>/dev/null || \ +cp "${AMI_OUTPUT_DIR}"/*.deb "${ARTIFACTS_DIR}/" 2>/dev/null || true diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h index 1bad765b..f5295975 100644 --- a/vrt/vrtd/include/vrtd/wire.h +++ b/vrt/vrtd/include/vrtd/wire.h @@ -308,7 +308,14 @@ struct vrtd_req_buffer_open { struct vrtd_resp_buffer_open { uint64_t size; ///< Allocated size in bytes (rounded up to subregion). uint64_t phys_addr; ///< Device physical address of the allocation. - uint32_t qpair_fd_count; ///< Number of qpair FDs sent via SCM_RIGHTS (1 or 2). + /** + * Number of qpair FDs sent via SCM_RIGHTS (1 or 2). When two FDs are + * sent (an mm_channel == AUTO request), the ordering is fixed: FD[0] is + * pinned to AXI-MM channel 0 and FD[1] to channel 1, so the client can + * apply the V80 placement policy deterministically. A single FD is + * pinned to the explicitly requested channel. + */ + uint32_t qpair_fd_count; } __attribute__((packed)); /** @@ -343,7 +350,12 @@ struct vrtd_req_buffer_open_raw { } __attribute__((packed)); struct vrtd_resp_buffer_open_raw { - uint32_t qpair_fd_count; ///< Number of qpair FDs sent via SCM_RIGHTS (1 or 2). + /** + * Number of qpair FDs sent via SCM_RIGHTS (1 or 2). Same fd-to-channel + * ordering as @ref vrtd_resp_buffer_open: FD[0] -> channel 0, FD[1] -> + * channel 1 for an AUTO request; a single FD pins the requested channel. 
+ */ + uint32_t qpair_fd_count; } __attribute__((packed)); /** diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index 54ba151b..c25697b6 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -46,6 +46,8 @@ #include +#include "v80_policy.h" + #include #include #include @@ -131,6 +133,44 @@ static int vrtd_mmap_regular_base_pages(uint64_t size, void **addr_out) { return 0; } +/* + * Issue a single contiguous transfer of [buf_offset, buf_offset + size) on one + * qpair fd. The QDMA transfer ioctl operates on signed ssize_t lengths, so the + * range is chunked to stay within SSIZE_MAX while preserving step alignment. + */ +static int vrtd_transfer_segment( + int qpair_fd, + uint32_t buf_id, + uint64_t buf_offset, + uint64_t phys_addr, + uint64_t size, + uint64_t step, + uint32_t direction +) { + uint64_t max_chunk = (uint64_t)SSIZE_MAX - ((uint64_t)SSIZE_MAX % step); + uint64_t done = 0; + + if (max_chunk == 0) { + return -EINVAL; + } + + while (done < size) { + uint64_t remaining = size - done; + uint64_t chunk = remaining > max_chunk ? max_chunk : remaining; + uint64_t xfer_offset = buf_offset + done; + uint64_t dev_offset = phys_addr + xfer_offset; + ssize_t ret = slash_qdma_qpair_transfer( + qpair_fd, buf_id, xfer_offset, dev_offset, chunk, direction); + + if (ret <= 0) { + return -EIO; + } + done += (uint64_t) ret; + } + + return 0; +} + static int vrtd_transfer_registered( const int *qpair_fds, uint32_t qpair_fd_count, @@ -142,8 +182,6 @@ static int vrtd_transfer_registered( uint64_t step, bool to_device ) { - uint64_t max_chunk; - uint64_t transferred = 0; uint32_t direction = to_device ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; if (size == 0) { @@ -158,56 +196,38 @@ static int vrtd_transfer_registered( return -EINVAL; } - max_chunk = (uint64_t)SSIZE_MAX - ((uint64_t)SSIZE_MAX % step); - if (max_chunk == 0) { - return -EINVAL; + /* + * Decide how the transfer maps onto the available queues. 
V80 applies the + * placement-aware policy (DDR halved, HBM routed by the half-memory + * boundary); any other hint keeps everything on the primary qpair. + */ + struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t nseg; + + if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_V80) { + nseg = vrtd_plan_v80(phys_addr, offset, size, step, qpair_fd_count, segs); + } else { + segs[0].fd_index = 0; + segs[0].offset = offset; + segs[0].size = size; + nseg = 1; } - while (transferred < size) { - uint64_t chunk = size - transferred; - uint64_t done = 0; + for (uint32_t i = 0; i < nseg; ++i) { + uint32_t fd_index = segs[i].fd_index; - if (chunk > max_chunk) { - chunk = max_chunk; + /* The plan only references fds[0]/fds[1]; fall back to the primary + * qpair if a planned fd is somehow unavailable. */ + if (fd_index >= qpair_fd_count || qpair_fds[fd_index] < 0) { + fd_index = 0; } - while (done < chunk) { - uint64_t remaining = chunk - done; - uint64_t xfer_offset = offset + transferred + done; - uint64_t dev_offset = phys_addr + xfer_offset; - uint64_t xfer_size = remaining; - uint32_t fd_index = 0; - ssize_t ret; - - if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_DUAL_QPAIR && - qpair_fd_count > 1 && qpair_fds[1] >= 0 && - remaining > step) { - /* - * Use the second qpair for the upper half of each large chunk. - * This keeps ranges disjoint while preserving alignment. 
- */ - uint64_t half = (chunk / 2) - ((chunk / 2) % step); - if (half != 0 && done < half) { - xfer_size = half - done; - } else if (half != 0) { - fd_index = 1; - } - } - - ret = slash_qdma_qpair_transfer( - qpair_fds[fd_index], buf_id, xfer_offset, - dev_offset, xfer_size, direction); - - if (ret < 0) { - return -EIO; - } - if (ret == 0) { - return -EIO; - } - done += (uint64_t) ret; + int ret = vrtd_transfer_segment( + qpair_fds[fd_index], buf_id, segs[i].offset, + phys_addr, segs[i].size, step, direction); + if (ret != 0) { + return ret; } - - transferred += chunk; } return 0; diff --git a/vrt/vrtd/libvrtd/src/v80_policy.h b/vrt/vrtd/libvrtd/src/v80_policy.h new file mode 100644 index 00000000..8d70b85e --- /dev/null +++ b/vrt/vrtd/libvrtd/src/v80_policy.h @@ -0,0 +1,143 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
/**
 * @file v80_policy.h
 * @brief Client-side V80 placement-aware channel policy for QDMA transfers.
 *
 * The kernel returns the opaque SLASH_QDMA_TRANSFER_HINT_V80 marker on buffer
 * registration; the actual decision of how to spread a transfer across the
 * available QDMA queues lives here, where the buffer's device address is known.
 *
 * On the V80 a transfer takes two independent NoC paths: the host-side ingress
 * master (NMU) is chosen by the queue's mm-channel, while the memory-side
 * egress endpoint (NSU / HBM pseudo-channel) is chosen by the device address.
 * Sustaining both NMUs requires also spreading across two NSUs. The policy:
 *
 *   - DDR (single NSU): split the range in half so both NMUs stay busy.
 *   - HBM below the 16 GiB half-boundary: channel 0 only.
 *   - HBM at/above the half-boundary: channel 1 only.
 *   - HBM spanning the boundary: split there (below -> ch0, above -> ch1).
 *
 * The fd-to-channel mapping is the wire contract from vrtd: fds[0] is pinned to
 * channel 0 and fds[1] to channel 1 (see vrtd_resp_buffer_open).
 */

#ifndef VRTD_V80_POLICY_H
#define VRTD_V80_POLICY_H

#include <stdbool.h>
#include <stdint.h>

/*
 * V80 device-memory geometry (mirrors vrt/vrtd/src/allocator.h and the
 * memory-model docs). HBM and DDR are each 64 x 512 MiB = 32 GiB; the HBM
 * half-boundary at +16 GiB separates the two NoC slave-unit (NSU) regions.
 */
#define VRTD_V80_HBM_BASE 0x4000000000ULL
#define VRTD_V80_HBM_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL)
#define VRTD_V80_HBM_HALF (VRTD_V80_HBM_SIZE / 2ULL)
#define VRTD_V80_DDR_BASE 0x60000000000ULL
#define VRTD_V80_DDR_SIZE (64ULL * 512ULL * 1024ULL * 1024ULL)

/** @brief Maximum segments a transfer is split into (one per mm-channel). */
#define VRTD_V80_MAX_SEGS 2u

/** @brief One contiguous sub-transfer routed to a specific qpair fd. */
struct vrtd_xfer_seg {
    uint32_t fd_index; /**< Index into the qpair_fds array. */
    uint64_t offset;   /**< Buffer-relative byte offset. */
    uint64_t size;     /**< Byte count. */
};

/** @brief Emit a one-segment plan that keeps the whole range on @p fd_index. */
static inline uint32_t vrtd_v80_plan_single(struct vrtd_xfer_seg *segs,
                                            uint32_t fd_index,
                                            uint64_t offset,
                                            uint64_t size)
{
    segs[0].fd_index = fd_index;
    segs[0].offset = offset;
    segs[0].size = size;
    return 1u;
}

/**
 * @brief Compute the V80 transfer plan for a buffer range.
 *
 * Plans the transfer of [@p offset, @p offset + @p size) within a buffer based
 * at device address @p phys_addr across @p qpair_fd_count available queues
 * (fds[0] == channel 0, fds[1] == channel 1). Split points are aligned down to
 * @p step so the first segment is always a whole number of granules. With
 * fewer than two queues (or a zero step) the whole range is assigned to fds[0].
 *
 * A buffer whose base lies inside the DDR window is halved; every other
 * address is routed by the HBM half-boundary. An HBM range that lies entirely
 * on one side of the boundary is never split, whatever its size. When a range
 * straddles the boundary but starts less than one granule below it (only
 * possible with an unaligned start), no aligned split exists and the whole
 * range goes to the channel that owns the larger share of its bytes.
 *
 * @param phys_addr      Device base address of the buffer.
 * @param offset         Buffer-relative start of the transfer.
 * @param size           Transfer length in bytes (normally a multiple of step).
 * @param step           Transfer/page granule used to align split points.
 * @param qpair_fd_count Number of available qpair fds (1 or 2).
 * @param segs           [out] Receives up to VRTD_V80_MAX_SEGS segments.
 * @return Number of segments written to @p segs (1 or 2).
 */
static inline uint32_t vrtd_plan_v80(uint64_t phys_addr,
                                     uint64_t offset,
                                     uint64_t size,
                                     uint64_t step,
                                     uint32_t qpair_fd_count,
                                     struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS])
{
    if (qpair_fd_count < 2u || step == 0u) {
        return vrtd_v80_plan_single(segs, 0u, offset, size);
    }

    uint64_t start = phys_addr + offset;
    uint64_t end = start + size;
    bool is_ddr = (phys_addr >= VRTD_V80_DDR_BASE &&
                   phys_addr < VRTD_V80_DDR_BASE + VRTD_V80_DDR_SIZE);

    uint64_t lo_len;
    if (is_ddr) {
        /* Single DDR NSU: just split the range to drive both NMUs. */
        lo_len = size / 2u;
    } else {
        /* HBM: route by the 16 GiB half-memory boundary (NSU split). */
        uint64_t boundary = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF;

        if (end <= boundary) {
            /* Entirely in the lower half -> ch0, even for an odd size. */
            return vrtd_v80_plan_single(segs, 0u, offset, size);
        }
        if (start >= boundary) {
            /* Entirely in the upper half -> ch1. */
            return vrtd_v80_plan_single(segs, 1u, offset, size);
        }

        lo_len = boundary - start; /* spans the boundary */
        if (lo_len < step) {
            /* No step-aligned split point below the boundary: keep the range
             * whole and send it to whichever half holds most of it. */
            uint32_t owner = (size - lo_len > lo_len) ? 1u : 0u;
            return vrtd_v80_plan_single(segs, owner, offset, size);
        }
    }

    lo_len -= lo_len % step; /* keep the split point granule-aligned */

    if (lo_len == 0u || lo_len >= size) {
        return vrtd_v80_plan_single(segs, 0u, offset, size);
    }

    segs[0].fd_index = 0u;
    segs[0].offset = offset;
    segs[0].size = lo_len;
    segs[1].fd_index = 1u;
    segs[1].offset = offset + lo_len;
    segs[1].size = size - lo_len;
    return 2u;
}

#endif /* VRTD_V80_POLICY_H */
+ */ +static uint32_t buffer_plan_qpair_channels( + uint32_t mm_channel, + uint32_t channels[VRTD_BUFFER_MAX_QPAIR_FDS]) +{ + if (mm_channel == SLASH_QDMA_MM_CHANNEL_AUTO) { + channels[0] = SLASH_QDMA_MM_CHANNEL_0; + channels[1] = SLASH_QDMA_MM_CHANNEL_1; + return VRTD_BUFFER_MAX_QPAIR_FDS; + } + + channels[0] = mm_channel; + return 1u; +} + /** * Initialise a buffer: allocate device memory, create a QDMA queue pair, * start the queue, and obtain a file descriptor for host-side access. @@ -178,10 +204,14 @@ static int buffer_init(struct buffer *buf, buf->size = alloc_size; buf->allocation_valid = true; - /* Steps 2-4: create/start queue pairs and obtain their fds. Current - * SLASH hardware benefits from two qpairs per registered buffer; future - * backends may choose to send only one. */ - for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + /* Steps 2-4: create/start queue pairs and obtain their fds. An AUTO + * request yields two qpairs -- fds[0] on channel 0, fds[1] on channel 1 -- + * so the client's V80 placement policy has a deterministic fd-to-channel + * mapping; an explicit channel pins a single qpair. 
*/ + uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS]; + uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels); + + for (uint32_t i = 0; i < num_qpairs; ++i) { struct slash_qdma_qpair_add qpair = {0}; if (qpair_params != NULL) { @@ -193,7 +223,7 @@ static int buffer_init(struct buffer *buf, qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; } qpair.dir_mask = dir_mask; - qpair.mm_channel = mm_channel; + qpair.mm_channel = qpair_channels[i]; qpair.size = sizeof(qpair); if (slash_qdma_qpair_add(qdma, &qpair) != 0) { @@ -321,7 +351,10 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, .qpair_created = false, }; - for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { + uint32_t qpair_channels[VRTD_BUFFER_MAX_QPAIR_FDS]; + uint32_t num_qpairs = buffer_plan_qpair_channels(mm_channel, qpair_channels); + + for (uint32_t i = 0; i < num_qpairs; ++i) { struct slash_qdma_qpair_add qpair = {0}; qpair.mode = VRTD_QDMA_Q_MODE_MM; @@ -329,7 +362,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, qpair.c2h_ring_sz = VRTD_QDMA_RING_SZ_IDX; qpair.cmpt_ring_sz = VRTD_QDMA_RING_SZ_IDX; qpair.dir_mask = dir_mask; - qpair.mm_channel = mm_channel; + qpair.mm_channel = qpair_channels[i]; qpair.size = sizeof(qpair); if (slash_qdma_qpair_add(qdma, &qpair) != 0) { diff --git a/vrt/vrtd/tests/CMakeLists.txt b/vrt/vrtd/tests/CMakeLists.txt index f5197f45..241f01d4 100644 --- a/vrt/vrtd/tests/CMakeLists.txt +++ b/vrt/vrtd/tests/CMakeLists.txt @@ -29,5 +29,6 @@ add_vrtd_test(hotplug_test hotplug_test.cpp) add_vrtd_test(config_test config_test.cpp) add_vrtd_test(auth_test auth_test.cpp) add_vrtd_test(buffer_test buffer_test.cpp) +add_vrtd_test(v80_policy_test v80_policy_test.cpp) add_vrtd_test(design_writer_test design_writer_test.cpp) add_vrtd_test(device_test device_test.cpp) diff --git a/vrt/vrtd/tests/v80_policy_test.cpp b/vrt/vrtd/tests/v80_policy_test.cpp new file mode 100644 index 00000000..f9050a83 --- /dev/null +++ 
b/vrt/vrtd/tests/v80_policy_test.cpp @@ -0,0 +1,124 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software + * and associated documentation files (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, publish, distribute, + * sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or + * substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT + * NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include + +#include + +#include "../libvrtd/src/v80_policy.h" + +namespace { + +constexpr uint64_t STEP = 4096; +constexpr uint64_t MiB = 1024ULL * 1024ULL; +constexpr uint64_t GiB = 1024ULL * MiB; + +// A single available queue always carries the whole transfer on fds[0]. +TEST(V80Plan, SingleQueueIsWhole) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 1, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// DDR has a single NSU, so the range is split in half across both channels. 
+TEST(V80Plan, DdrSplitsInHalf) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 256 * MiB); + EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].offset, 256 * MiB); + EXPECT_EQ(segs[1].size, 256 * MiB); +} + +// A DDR transfer too small to halve along the step boundary stays on fds[0]. +TEST(V80Plan, DdrTinyTransferStaysOnPrimary) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, STEP, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].size, STEP); +} + +// An HBM buffer entirely below the half-boundary uses channel 0 only. +TEST(V80Plan, HbmLowerHalfChannel0) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// An HBM buffer entirely at/above the half-boundary uses channel 1 only. +TEST(V80Plan, HbmUpperHalfChannel1) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF + 4 * GiB; + uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].fd_index, 1u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 512 * MiB); +} + +// A buffer sitting exactly on the boundary belongs to the upper half. +TEST(V80Plan, HbmOnBoundaryIsUpperHalf) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF; + uint32_t n = vrtd_plan_v80(base, 0, 256 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 1u); + EXPECT_EQ(segs[0].fd_index, 1u); +} + +// An HBM range straddling the boundary splits exactly at it. 
+TEST(V80Plan, HbmSpanningSplitsAtBoundary) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF - 256 * MiB; + uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].offset, 0u); + EXPECT_EQ(segs[0].size, 256 * MiB); + EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].offset, 256 * MiB); + EXPECT_EQ(segs[1].size, 256 * MiB); +} + +// The split point is computed from the absolute device address, so a non-zero +// buffer offset that crosses the boundary is honoured. +TEST(V80Plan, HbmSpanningWithOffset) { + vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; + uint64_t offset = VRTD_V80_HBM_HALF - STEP; // crosses boundary STEP into the range + uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, offset, 2 * STEP, STEP, 2, segs); + ASSERT_EQ(n, 2u); + EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].offset, offset); + EXPECT_EQ(segs[0].size, STEP); + EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].offset, offset + STEP); + EXPECT_EQ(segs[1].size, STEP); +} + +} // namespace From 5f7fe3da339aa3647144160c99432a5f8fe372c5 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Mon, 15 Jun 2026 13:40:33 +0100 Subject: [PATCH 21/23] driver+vrt+smi: drop libqdma sg/channel patches, make transfers 4 KiB-only Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 13 +- docs/reference/smi/commands.rst | 16 +- driver/Makefile | 5 +- .../include/slash/uapi/slash_interface.h | 7 +- driver/patches/0001-libqdma-sg-mapping.patch | 172 ------------ .../patches/0002-libqdma-versal-channel.patch | 32 --- driver/slash_qdma.c | 250 ++++-------------- driver/tests/test_slash_qdma.c | 57 ---- smi/README.md | 9 +- smi/src/qdma_driver_backend.cpp | 4 +- smi/src/qdma_driver_backend.hpp | 2 +- smi/src/raw_transfer.hpp | 72 ++--- smi/src/smi.cpp | 10 - smi/src/validate.cpp | 125 +++------ smi/src/validate.hpp | 12 - vrt/vrtd/libvrtd/include/vrtd/vrtd.h 
| 20 +- vrt/vrtd/libvrtd/src/buffer.c | 104 ++------ vrt/vrtd/libvrtd/src/requests.c | 4 - vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp | 11 - vrt/vrtd/libvrtdpp/include/vrtd/device.hpp | 31 +-- vrt/vrtd/libvrtdpp/include/vrtd/session.hpp | 8 +- vrt/vrtd/libvrtdpp/src/device.cpp | 14 +- vrt/vrtd/libvrtdpp/src/session.cpp | 24 +- 23 files changed, 172 insertions(+), 830 deletions(-) delete mode 100644 driver/patches/0001-libqdma-sg-mapping.patch delete mode 100644 driver/patches/0002-libqdma-versal-channel.patch diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index f018d401..b4bd151a 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -419,12 +419,11 @@ All transfers are synchronous and block until the transfer completes or times ou return value is the number of bytes transferred, and the file position is advanced accordingly. The userspace buffer address and ``count`` must be page-aligned: the address -must be 4 KiB-aligned and ``count`` must be a non-zero multiple of 4 KiB. A -2 MiB-aligned, 2 MiB-multiple transfer backed by 2 MiB hugetlb pages uses one -descriptor per hugepage; all other accepted transfers use one descriptor per -4 KiB base page. Transparent hugepages are not accepted on the 4 KiB path, so -callers using anonymous mappings should apply ``MADV_NOHUGEPAGE`` before -faulting pages when they need deterministic base-page transfers. +must be 4 KiB-aligned and ``count`` must be a non-zero multiple of 4 KiB. The +transfer is backed by 4 KiB base pages, one descriptor per page. Transparent +hugepages are not accepted, so callers using anonymous mappings should apply +``MADV_NOHUGEPAGE`` before faulting pages when they need deterministic +base-page transfers. Multiple fds can be obtained for the same qpair via multiple ``QPAIR_GET_FD`` calls, including from different processes. 
Concurrent ``read()``/``write()`` calls on the same qpair (from any @@ -781,7 +780,7 @@ traffic on a single queue. - ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL`` - ``flags`` must be 0 - ``user_addr`` must be page-aligned; ``length`` must be a non-zero multiple of the page size -- The buffer must be backed by a single page granule (all 4 KiB base pages or all 2 MiB hugepages) +- The buffer must be backed by 4 KiB base pages **Postconditions:** diff --git a/docs/reference/smi/commands.rst b/docs/reference/smi/commands.rst index a9af357e..2fd1ccb9 100644 --- a/docs/reference/smi/commands.rst +++ b/docs/reference/smi/commands.rst @@ -160,7 +160,7 @@ phase is skipped when ``--ddr-only`` or ``--hbm-only`` is given. .. code-block:: text - v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--page-size <4k|2m>] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] + v80-smi validate -d [-j|--threads ] [-R|--no-reset] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] Requirements by mode: @@ -213,24 +213,12 @@ permission. The largest phase maps up to ``4 * N * buffer-size`` of host buffers when both HBM and DDR are enabled, or ``2 * N * buffer-size`` with ``--ddr-only`` or ``--hbm-only``; the command fails early if that exceeds currently available -host memory. With ``--page-size 2m`` that footprint is checked against the free -2 MiB hugepage pool instead of general RAM. +host memory. .. option:: -R, --no-reset Skip the device reset step before running memory tests. -.. 
option:: --page-size <4k|2m> - - Host staging-buffer page granule used for DMA transfers in every mode - (default VRTD, ``--raw-transfer-test`` and ``--use-qdma-driver``). ``4k`` - (the default) maps the host buffers with regular 4 KiB base pages; ``2m`` - maps them with 2 MiB hugepages. There is no fallback: ``2m`` requires - reserved 2 MiB hugepages and that ``--buffer-size``, ``--offset``, - ``--starting-offset`` (and ``--channel-region-stride`` in paired mode) all be - 2 MiB-aligned, otherwise ``validate`` fails early. Reserve hugepages with, - e.g., ``echo | sudo tee /proc/sys/vm/nr_hugepages``. - .. option:: --mm-channel AXI-MM / NoC channel selection for each buffer's QDMA queue pair, in every diff --git a/driver/Makefile b/driver/Makefile index ab48039b..07d36965 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -52,10 +52,8 @@ LIBQDMA_PATCH_DIR := patches SLASH_QDMA_OP_DEBUG ?= 0 # Per-transfer timing instrumentation. Set to 1 to emit one dmesg line per -# DMA transfer breaking down the kernel phases (SLASH_QDMA_TIMING) and the -# libqdma submit sub-phases (QDMA_TIMING). Default off (zero overhead). +# DMA transfer breaking down the kernel phases. Default off (zero overhead). SLASH_QDMA_TIMING ?= 0 -QDMA_TIMING ?= 0 # Kcompat feature flags. Defaults are "n"; the all: recipe runs # driver/kcompat/probe.sh against $(KDIR) to detect the actual values @@ -86,7 +84,6 @@ ccflags-y += \ -DTANDEM_BOOT_SUPPORTED=1 \ -DSLASH_QDMA_OP_DEBUG=$(SLASH_QDMA_OP_DEBUG) \ -DSLASH_QDMA_TIMING=$(SLASH_QDMA_TIMING) \ - -DQDMA_TIMING=$(QDMA_TIMING) \ -DSLASH_VERSION_STR=\"$(SLASH_VERSION)\" ifeq ($(SLASH_HAVE_VM_FLAGS_SET),y) diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index fd173909..7a5dfe55 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -272,9 +272,8 @@ enum slash_qdma_transfer_hint { * re-mapping per transfer. 
* * \@user_addr must be page-aligned and \@length a non-zero multiple of - * the host page size. The buffer must be backed by a single page - * granule (all 4 KiB base pages or all 2 MiB hugepages), matching the - * transfer data path. + * the host page size. The buffer is backed by 4 KiB base pages, matching + * the transfer data path. * * Buffers are owned by the control-fd open instance they are registered * through, and are automatically unregistered when that fd is closed @@ -317,7 +316,7 @@ struct slash_qdma_buf_unregister { * of bytes transferred is returned as the ioctl return value. * * \@buf_offset and \@length must be aligned to the registered buffer's - * page granule, and \@buf_offset + \@length must not exceed the + * 4 KiB page granule, and \@buf_offset + \@length must not exceed the * registered length. \@direction must be one of enum slash_qdma_transfer_dir * and must be enabled on the queue pair. */ diff --git a/driver/patches/0001-libqdma-sg-mapping.patch b/driver/patches/0001-libqdma-sg-mapping.patch deleted file mode 100644 index c94866ef..00000000 --- a/driver/patches/0001-libqdma-sg-mapping.patch +++ /dev/null @@ -1,172 +0,0 @@ -SLASH local modification to the pinned QDMA submodule (libqdma). - -libqdma: length/offset-aware SG mapping + optional submit timing - -The stock libqdma sgl_map()/sgl_unmap() always DMA-map a fixed PAGE_SIZE per -scatter-gather entry and ignore sg->len / sg->offset. The SLASH driver -(driver/slash_qdma.c) builds SG lists with variable-length entries (a single -base page or one 2 MiB hugetlb page per entry), so the mapping must honour -sg->offset and sg->len. This also documents sg->len in libqdma_export.h and -adds the optional, compile-time gated (QDMA_TIMING) per-request submit timing -that pairs with SLASH_QDMA_TIMING in the SLASH driver. - -Generated against qdma_drv @ 03ac7f3 (pinned submodule commit). -Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). 
-diff --git a/libqdma_export.c b/libqdma_export.c -index f0524d8..bff6161 100755 ---- a/libqdma_export.c -+++ b/libqdma_export.c -@@ -39,6 +39,8 @@ - #include "qdma_mbox.h" - #include "qdma_platform.h" - -+#include -+ - #ifdef DEBUGFS - #include "qdma_debugfs_queue.h" - -@@ -50,6 +52,27 @@ static bool qdma_debufs_cleanup = true; - - #define QDMA_Q_PEND_LIST_COMPLETION_TIMEOUT 1000 /* msec */ - -+/* -+ * Per-request timing instrumentation for the synchronous MM submit path. -+ * -+ * When QDMA_TIMING is non-zero (compile-time flag, e.g. built with -+ * -DQDMA_TIMING=1), qdma_request_submit() emits one line per request that -+ * splits the submit cost into: -+ * -+ * - sgl_map: DMA-mapping the scatter-gather list (dma_map_page per entry; -+ * this is where IOMMU programming/IOTLB flushes show up). -+ * - proc: qdma_descq_proc_sgt_request() -- descriptor-ring fill plus -+ * the PIDX doorbell MMIO write that kicks the hardware. -+ * - wait: qdma_request_wait_for_cmpl() -- the blocking wait covering -+ * the actual HW data movement and poll-mode completion spin. -+ * -+ * Pairs with SLASH_QDMA_TIMING in the SLASH driver, whose "submit" phase is -+ * exactly the sum of these three. -+ */ -+#ifndef QDMA_TIMING -+#define QDMA_TIMING 0 -+#endif -+ - struct drv_mode_name mode_name_list[] = { - { AUTO_MODE, "auto"}, - { POLL_MODE, "poll"}, -@@ -2324,8 +2347,8 @@ void sgl_unmap(struct pci_dev *pdev, struct qdma_sw_sg *sg, unsigned int sgcnt, - if (!sg->pg) - break; - if (sg->dma_addr) { -- dma_unmap_page(&pdev->dev, sg->dma_addr - sg->offset, -- PAGE_SIZE, dir); -+ dma_unmap_page(&pdev->dev, sg->dma_addr, sg->len, -+ dir); - sg->dma_addr = 0UL; - } - } -@@ -2351,20 +2374,21 @@ int sgl_map(struct pci_dev *pdev, struct qdma_sw_sg *sgl, unsigned int sgcnt, - int i; - struct qdma_sw_sg *sg = sgl; - -- /** Map the sg list onto a dma pages where -- * each page has max of PAGE_SIZE i.e 4K -- */ - for (i = 0; i < sgcnt; i++, sg++) { -- /* !! TODO page size !! 
*/ -- sg->dma_addr = dma_map_page(&pdev->dev, sg->pg, 0, PAGE_SIZE, -- dir); -+ if (!sg->len) { -+ pr_err("map sgl failed, sg %d has zero length.\n", i); -+ if (i) -+ sgl_unmap(pdev, sgl, i, dir); -+ return -EINVAL; -+ } -+ sg->dma_addr = dma_map_page(&pdev->dev, sg->pg, sg->offset, -+ sg->len, dir); - if (unlikely(dma_mapping_error(&pdev->dev, sg->dma_addr))) { - pr_err("map sgl failed, sg %d, %u.\n", i, sg->len); - if (i) - sgl_unmap(pdev, sgl, i, dir); - return -EIO; - } -- sg->dma_addr += sg->offset; - } - - return 0; -@@ -2393,6 +2417,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, - enum dma_data_direction dir; - int wait = 0; - int rv = 0; -+#if QDMA_TIMING -+ ktime_t t_start, t_mapped, t_proc, t_wait; -+#endif - - /** make sure that the dev_hndl passed is Valid */ - if (!xdev) { -@@ -2459,6 +2486,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, - if (descq->conf.st && (descq->conf.q_type == Q_C2H)) - return qdma_request_submit_st_c2h(xdev, descq, req); - -+#if QDMA_TIMING -+ t_start = ktime_get(); -+#endif - if (!req->dma_mapped) { - rv = sgl_map(xdev->conf.pdev, req->sgl, req->sgcnt, dir); - if (rv < 0) { -@@ -2468,6 +2498,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, - } - cb->unmap_needed = 1; - } -+#if QDMA_TIMING -+ t_mapped = ktime_get(); -+#endif - - lock_descq(descq); - /** if the descq is already in online state*/ -@@ -2484,6 +2517,9 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, - pr_debug("%s: cb 0x%p submitted.\n", descq->conf.name, cb); - - qdma_descq_proc_sgt_request(descq); -+#if QDMA_TIMING -+ t_proc = ktime_get(); -+#endif - - if (!wait) - return 0; -@@ -2492,6 +2528,18 @@ ssize_t qdma_request_submit(unsigned long dev_hndl, unsigned long id, - if (rv < 0) - goto unmap_sgl; - -+#if QDMA_TIMING -+ t_wait = ktime_get(); -+ pr_info("qdma: timing %s %s count=%u sgcnt=%u ep=0x%llx off=%u | sgl_map=%lld proc=%lld wait=%lld total=%lld ns\n", 
-+ descq->conf.name, req->write ? "H2C" : "C2H", -+ req->count, req->sgcnt, -+ (unsigned long long)req->ep_addr, cb->offset, -+ ktime_to_ns(ktime_sub(t_mapped, t_start)), -+ ktime_to_ns(ktime_sub(t_proc, t_mapped)), -+ ktime_to_ns(ktime_sub(t_wait, t_proc)), -+ ktime_to_ns(ktime_sub(t_wait, t_start))); -+#endif -+ - return cb->offset; - - unmap_sgl: -diff --git a/libqdma_export.h b/libqdma_export.h -index baeb78e..9bd60ee 100755 ---- a/libqdma_export.h -+++ b/libqdma_export.h -@@ -558,7 +558,12 @@ struct qdma_sw_sg { - struct page *pg; - /** offset in current page */ - unsigned int offset; -- /** length of the page */ -+ /** -+ * Length of this scatter-gather entry. The DMA mapping helpers map -+ * exactly this many bytes starting at @offset, so callers must set this -+ * to the full backing granule they intend to expose (for example 4 KiB -+ * for base pages or 2 MiB for huge pages). -+ */ - unsigned int len; - /** dma address of the allocated page */ - dma_addr_t dma_addr; diff --git a/driver/patches/0002-libqdma-versal-channel.patch b/driver/patches/0002-libqdma-versal-channel.patch deleted file mode 100644 index 89bddd43..00000000 --- a/driver/patches/0002-libqdma-versal-channel.patch +++ /dev/null @@ -1,32 +0,0 @@ -SLASH local modification to the pinned QDMA submodule (libqdma). - -libqdma: set descq->channel on the initial queue-add path (Versal) - -qdma_descq_config() only mirrored qconf->mm_channel into descq->channel on the -reconfig path. qdma_queue_add() calls it with reconfig=0, so on Versal hard IP -(QDMA_VERSAL_HARD_IP) the SW-context mm_chn/host_id stayed 0. Mirror mm_channel -into descq->channel on the initial add path too. - -Generated against qdma_drv @ 03ac7f3 (pinned submodule commit). -Applied automatically by driver/Makefile (libqdma-patches target, patch -p1). 
-diff --git a/qdma_descq.c b/qdma_descq.c -index c2f19d1..b432737 100755 ---- a/qdma_descq.c -+++ b/qdma_descq.c -@@ -1261,6 +1261,16 @@ void qdma_descq_config(struct qdma_descq *descq, struct qdma_queue_conf *qconf, - descq->conf.st = qconf->st; - descq->conf.q_type = qconf->q_type; - -+ /* Below check is applicable only for Versal family. -+ * Mirror mm_channel into descq->channel on the initial add path -+ * too; qdma_queue_add() only calls this with reconfig=0, so -+ * without this the SW-context mm_chn/host_id would always be 0 -+ * (the reconfig-only assignment below is reached solely via -+ * qdma_queue_config()). -+ */ -+ if (descq->xdev->version_info.ip_type == QDMA_VERSAL_HARD_IP) -+ descq->channel = qconf->mm_channel; -+ - } else { - descq->conf.desc_rng_sz_idx = qconf->desc_rng_sz_idx; - descq->conf.cmpl_rng_sz_idx = qconf->cmpl_rng_sz_idx; diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index d0bbf760..a61a978d 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -142,62 +141,12 @@ #define SLASH_QDMA_HP_POLL_US 1000 /* busy-wait budget in microseconds */ /* - * The qpair fd data path accepts either a span of 4 KiB base pages or a span - * of 2 MiB hugetlb pages. Every scatter-gather entry within one request uses - * the same granule, which keeps the DMA mapping semantics unambiguous; the two - * granules are never mixed in a single request. A whole transfer (of either - * granule) is submitted to libqdma as a single multi-descriptor request, and - * libqdma refills the descriptor ring as needed -- so the transfer size is not - * bounded by the ring depth. + * The qpair fd data path operates on spans of 4 KiB base pages. 
Each + * scatter-gather entry is exactly one base page, so a whole transfer is + * submitted to libqdma as a single multi-descriptor request and libqdma + * refills the descriptor ring as needed -- the transfer size is not bounded + * by the ring depth. */ -#define SLASH_QDMA_HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) - -/* - * qdma_huge_desc_size - Experimental descriptor granularity for hugetlb-backed - * raw qpair transfers. - * - * The userspace raw-transfer path prefers 2 MiB hugetlb pages so the host page - * size stays large and stable. By default, each pinned 2 MiB page becomes one - * SGL entry / QDMA descriptor. Reducing this value keeps the same pinned - * hugetlb page but emits several descriptors with increasing offsets inside - * that page, allowing us to test whether descriptor pressure (rather than host - * page size) is what makes dma-perf faster. - * - * Must be a page-aligned divisor of 2 MiB. Examples: - * 2097152 -> current behaviour (1 descriptor per huge page) - * 65536 -> 32 descriptors per huge page - * 4096 -> 512 descriptors per huge page - */ -static unsigned int qdma_huge_desc_size = SLASH_QDMA_HUGEPAGE_SIZE; - -static int slash_qdma_huge_desc_size_set(const char *val, - const struct kernel_param *kp) -{ - unsigned int parsed; - int err; - - err = kstrtouint(val, 0, &parsed); - if (err) - return err; - - if (parsed < PAGE_SIZE || - parsed > SLASH_QDMA_HUGEPAGE_SIZE || - !IS_ALIGNED(parsed, PAGE_SIZE) || - (SLASH_QDMA_HUGEPAGE_SIZE % parsed) != 0) - return -EINVAL; - - return param_set_uint(val, kp); -} - -static const struct kernel_param_ops slash_qdma_huge_desc_size_ops = { - .set = slash_qdma_huge_desc_size_set, - .get = param_get_uint, -}; - -module_param_cb(qdma_huge_desc_size, &slash_qdma_huge_desc_size_ops, - &qdma_huge_desc_size, 0644); -MODULE_PARM_DESC(qdma_huge_desc_size, - "Descriptor size for 2 MiB hugetlb raw transfers; page-aligned divisor of 2 MiB (default 2097152)"); /** * SLASH_QDMA_QTYPE_COUNT - Number of queue types 
tracked per queue pair. @@ -282,8 +231,7 @@ MODULE_PARM_DESC(qdma_huge_desc_size, * - submit: the whole libqdma qdma_request_submit() call, which covers * SGL DMA-mapping (IOMMU), descriptor-ring fill, the PIDX * doorbell, and the synchronous completion wait (HW transfer + - * poll-mode spin). libqdma can be built with QDMA_TIMING=1 for - * a finer breakdown of this phase. + * poll-mode spin). * - unmap: unpin pages (mark dirty for C2H) and free the SGL. * * Timestamps use ktime_get() (CLOCK_MONOTONIC); the reads are cheap, but @@ -694,9 +642,9 @@ struct slash_qdma_io_cb { * owning client holds the device reference). * @buf_id: Client-scoped handle returned to userspace. * @length: Registered length in bytes. - * @granule: Bytes per SGL entry (PAGE_SIZE for base pages, or the - * hugepage descriptor size). Uniform across all entries, so - * transfer slices can be computed by simple division. + * @granule: Bytes per SGL entry (PAGE_SIZE; one 4 KiB base page each). + * Uniform across all entries, so transfer slices can be computed + * by simple division. * @iocb: Pinned pages and prebuilt scatter-gather list. Each entry's * dma_addr is filled in once at registration so transfers can * submit with req->dma_mapped = 1. @@ -2255,6 +2203,38 @@ static int slash_qdma_ioctl_qpair_add_q(struct miscdevice *misc, "qdma_queue_add done: qid=%u type=%u qhndl=%lu\n", req->qid, qtype, qhndl); + /* + * Reconfigure the queue immediately after adding it. + * + * qdma_queue_add() runs qdma_descq_config(..., reconfig=0), which on + * Versal hard IP does NOT mirror qconf.mm_channel into descq->channel -- + * only the reconfig=1 branch does. descq->channel feeds the SW-context + * mm_chn/host_id programmed when the queue is started; without this step + * it would stay 0 and collapse both queues onto NoC channel 0, defeating + * mm-channel selection. 
Calling qdma_queue_config() here (the queue is in + * Q_STATE_ENABLED, before start) replays the same qconf through the + * reconfig=1 path, setting descq->channel. This replaces the former + * 0002-libqdma-versal-channel.patch without modifying libqdma. + */ + err = qdma_queue_config(qdma_dev->qdma_handle, qhndl, &qconf, + errbuf, sizeof(errbuf)); + if (err) { + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "qdma_queue_config failed: qid=%u type=%u err=%d (%s)\n", + req->qid, qtype, err, errbuf); + dev_err(&qdma_dev->pdev->dev, + "qdma: queue config failed (qid=%u, type=%u): %d (%s)\n", + req->qid, qtype, err, errbuf); + /* + * The queue was added but is not yet tracked in @entry, so the + * caller's rollback (keyed on its local added[] array) will not + * reach it. Remove it here to avoid leaking the libqdma queue. + */ + slash_qdma_queue_remove_safe(qdma_dev->qdma_handle, qhndl, + errbuf, sizeof(errbuf)); + return err; + } + /* Record the handle and mark this direction as active. */ entry->qhndl[qtype] = qhndl; entry->dir_mask |= dir_bit; @@ -2615,19 +2595,6 @@ static bool slash_qdma_page_is_base_page(struct page *page) return !PageCompound(page); } -static bool slash_qdma_page_is_2m_hugetlb_head(struct page *page) -{ -#ifdef CONFIG_HUGETLB_PAGE - struct page *head = compound_head(page); - - return page == head && - PageHuge(head) && - compound_order(head) == get_order(SLASH_QDMA_HUGEPAGE_SIZE); -#else - return false; -#endif -} - static int slash_qdma_map_user_base_pages_to_sgl(struct slash_qdma_io_cb *iocb, bool write) { @@ -2698,111 +2665,15 @@ static int slash_qdma_map_user_base_pages_to_sgl(struct slash_qdma_io_cb *iocb, return rv; } -static int slash_qdma_map_user_huge_page_to_sgl(struct slash_qdma_io_cb *iocb, - bool write) -{ - unsigned long addr = (unsigned long)iocb->buf; - size_t huge_pages = iocb->len / SLASH_QDMA_HUGEPAGE_SIZE; - unsigned int desc_size = READ_ONCE(qdma_huge_desc_size); - unsigned int descs_per_page; - size_t entries; - unsigned int 
i; - unsigned int sg_idx = 0; - int rv; - - if ((iocb->len % SLASH_QDMA_HUGEPAGE_SIZE) != 0 || - huge_pages == 0 || huge_pages > UINT_MAX) - return -EINVAL; - - if (desc_size < PAGE_SIZE || - desc_size > SLASH_QDMA_HUGEPAGE_SIZE || - !IS_ALIGNED(desc_size, PAGE_SIZE) || - (SLASH_QDMA_HUGEPAGE_SIZE % desc_size) != 0) - return -EINVAL; - - descs_per_page = SLASH_QDMA_HUGEPAGE_SIZE / desc_size; - if (huge_pages > UINT_MAX / descs_per_page) - return -EINVAL; - entries = huge_pages * descs_per_page; - - rv = slash_qdma_iocb_alloc_sgl(iocb, (unsigned int)entries); - if (rv) - return rv; - - for (i = 0; i < huge_pages; i++) { - unsigned long curr_addr = addr + (i * SLASH_QDMA_HUGEPAGE_SIZE); - struct page *page = NULL; - unsigned int j; - - rv = get_user_pages_fast(curr_addr, 1, 1 /* write */, &page); - if (rv != 1) { - pr_err("slash: qdma: unable to pin 2 MiB user page %u/%zu, %d\n", - i, huge_pages, rv); - rv = rv < 0 ? rv : -EFAULT; - goto err_out; - } - - if (!slash_qdma_page_is_2m_hugetlb_head(page)) { - pr_err("slash: qdma: 2 MiB transfer page %u/%zu is not backed by a 2 MiB hugetlb head page\n", - i, huge_pages); - put_page(page); - rv = -EINVAL; - goto err_out; - } - - flush_dcache_page(page); - - for (j = 0; j < descs_per_page; j++, sg_idx++) { - struct qdma_sw_sg *sg = &iocb->sgl[sg_idx]; - - /* - * The first segment consumes the GUP reference. Additional - * descriptors over the same hugetlb page take explicit references - * so slash_qdma_unmap_user_buf() can release one page ref per SGL - * entry without special casing repeated pages. - */ - if (j != 0) - get_page(page); - - iocb->pages[sg_idx] = page; - iocb->pages_nr = sg_idx + 1; - - sg->next = (sg_idx + 1 < entries) ? 
&iocb->sgl[sg_idx + 1] : NULL; - sg->pg = page; - sg->offset = j * desc_size; - sg->len = desc_size; - sg->dma_addr = 0UL; - } - } - - SLASH_QDMA_OP_LOG("user transfer path=hugetlb-2m addr=0x%lx len=%zu pages=%zu desc_size=%u descs=%zu write=%d\n", - addr, iocb->len, huge_pages, desc_size, entries, write); - - return 0; - -err_out: - slash_qdma_unmap_user_buf(iocb, write); - slash_qdma_iocb_release(iocb); - return rv; -} - /** * slash_qdma_map_user_buf_to_sgl() - Pin a user buffer and build its SGL. * @iocb: I/O control block. @iocb->buf and @iocb->len must be set. * @write: Transfer direction (true = H2C write, false = C2H read). * - * The buffer must be page-aligned and a whole number of 4 KiB pages. It is - * mapped as either: - * - a span of 2 MiB hugetlb pages (when it is 2 MiB-aligned, a multiple of - * 2 MiB, and actually backed by hugetlb pages), or - * - a span of 4 KiB base pages (every other accepted case). - * - * Each page becomes one SGL entry / one DMA descriptor, and the whole span is - * submitted to libqdma as a single request. - * - * The hugetlb-vs-base decision is made by probing the first page rather than by - * length/alignment alone: a large anonymous (base-page) mapping can happen to - * be 2 MiB-aligned, and must not be mistaken for a hugetlb buffer. + * The buffer must be page-aligned and a whole number of 4 KiB pages, and is + * mapped as a span of 4 KiB base pages: each page becomes one SGL entry / one + * DMA descriptor, and the whole span is submitted to libqdma as a single + * request. * * Return: 0 on success, negative errno on failure. */ @@ -2811,7 +2682,6 @@ static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, { unsigned long addr = (unsigned long)iocb->buf; size_t len = iocb->len; - bool huge = false; iocb->pages_nr = 0; @@ -2824,30 +2694,6 @@ static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, return -EINVAL; } - /* - * Only a 2 MiB-aligned, 2 MiB-multiple span can be hugetlb-backed. 
Probe - * the first page to confirm it actually is a hugetlb page before committing - * to the huge path; otherwise fall through to the base-page path. - */ - if (IS_ALIGNED(addr, SLASH_QDMA_HUGEPAGE_SIZE) && - (len % SLASH_QDMA_HUGEPAGE_SIZE) == 0) { - struct page *probe = NULL; - int probe_ret; - - probe_ret = get_user_pages_fast(addr, 1, 1 /* write */, &probe); - if (probe_ret < 0) - return probe_ret; - if (probe_ret == 0) - return -EFAULT; - if (probe_ret == 1) { - huge = slash_qdma_page_is_2m_hugetlb_head(probe); - put_page(probe); - } - } - - if (huge) - return slash_qdma_map_user_huge_page_to_sgl(iocb, write); - return slash_qdma_map_user_base_pages_to_sgl(iocb, write); } @@ -3010,9 +2856,9 @@ slash_qdma_buf_lookup_get(struct slash_qdma_client *client, u32 buf_id) * @uarg: User pointer to a struct slash_qdma_buf_register. * * Pins the pages backing the user buffer, builds a scatter-gather list - * (reusing the same 4 KiB / 2 MiB granule detection as the per-transfer - * path), DMA-maps every entry once, and inserts the resulting buffer into - * the client's table under a freshly allocated buf_id. + * (one 4 KiB base page per entry), DMA-maps every entry once, and inserts + * the resulting buffer into the client's table under a freshly allocated + * buf_id. * * Return: 0 on success, negative errno on failure. 
*/ diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index 2da35e7e..f9b0eb7e 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -15,16 +15,6 @@ #include #define TRANSFER_SIZE 4096 -#define HUGE_PAGE_SIZE (2 * 1024 * 1024) -#define HUGE_TRANSFER_SIZE (2 * HUGE_PAGE_SIZE) - -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_2MB -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#endif /* ---------- helpers ---------- */ @@ -811,53 +801,6 @@ TEST_F(qdma, multipage_4k_write_read_verify) munmap(read_buf, xfer_size); } -TEST_F(qdma, hugepage_write_read_verify) -{ - uint8_t *write_buf, *read_buf; - uint64_t dma_addr = get_dma_addr(); - uint32_t write_id = 0, read_id = 0; - long ret; - - bring_up_qpair(_metadata, self, 0x3); - - write_buf = mmap(NULL, HUGE_TRANSFER_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB, - -1, 0); - if (write_buf == MAP_FAILED) - SKIP(return, "2 MiB hugepage write mmap failed (errno=%d)", errno); - - read_buf = mmap(NULL, HUGE_TRANSFER_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB, - -1, 0); - if (read_buf == MAP_FAILED) { - munmap(write_buf, HUGE_TRANSFER_SIZE); - SKIP(return, "2 MiB hugepage read mmap failed (errno=%d)", errno); - } - - fill_pattern(write_buf, HUGE_TRANSFER_SIZE); - memset(read_buf, 0, HUGE_TRANSFER_SIZE); - - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, HUGE_TRANSFER_SIZE, - &write_id, NULL)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, HUGE_TRANSFER_SIZE, - &read_id, NULL)); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, HUGE_TRANSFER_SIZE, - SLASH_QDMA_XFER_H2C); - ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); - - ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, HUGE_TRANSFER_SIZE, - SLASH_QDMA_XFER_C2H); - ASSERT_EQ(HUGE_TRANSFER_SIZE, ret); - - EXPECT_EQ(0, memcmp(write_buf, read_buf, HUGE_TRANSFER_SIZE)); - - 
EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); - munmap(write_buf, HUGE_TRANSFER_SIZE); - munmap(read_buf, HUGE_TRANSFER_SIZE); -} - /* ---------- registered buffers ---------- */ /* Register a host buffer via the control fd; returns 0 or -errno. */ diff --git a/smi/README.md b/smi/README.md index 7f3dae56..a7fef4fe 100644 --- a/smi/README.md +++ b/smi/README.md @@ -183,7 +183,7 @@ bandwidth. Raw transfer modes skip reset and bypass the default VRTD buffer path for data movement. ``` -v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] +v80-smi validate -d [-j ] [-R] [--mm-channel ] [--buffer-size ] [--offset ] [--starting-offset ] [--raw-transfer-test | --use-qdma-driver] [--ddr-only | --hbm-only] [--channel-allocation ] [--channel-region-stride ] [--ring-size-index <0-15>] [--bandwidth-iterations ] [--bandwidth-duration ] ``` | Flag | Description | @@ -191,7 +191,6 @@ v80-smi validate -d [-j ] [-R] [--page-size <4k|2m>] [--mm-channe | `-d,--device` | Board address (required), e.g. `03:00` or `0000:03:00` | | `-j,--threads` | Parallel buffers/threads, 1-64 (default 8). Bidirectional phases use `2 * threads` logical positions in each enabled memory space. | | `-R,--no-reset` | Skip the device reset step before running memory tests | -| `--page-size` | Host staging-buffer page granule for all backends: `4k` (default; 4 KiB base pages) or `2m` (2 MiB hugepages). No fallback: `2m` needs reserved 2 MiB hugepages and 2 MiB-aligned `--buffer-size`/`--offset`/`--starting-offset` (and `--channel-region-stride` in paired mode). 
| | `--mm-channel` | AXI-MM/NoC channel per buffer queue: `auto` (default; driver stripes by `qid&1`), `0`, or `1`, or a comma-separated list with exactly one entry per buffer position (`2 x --threads` entries, e.g. `-j 1` -> `0,1`); no repeating, wrong length errors. Independent of `--channel-allocation`; also honored by `--use-qdma-driver`. | | `--buffer-size` | Size of each test buffer, accepting bytes or `k`/`K`/`m`/`M` suffixes (default `512M`, maximum `512M`) | | `--offset` | Distance between logical buffer positions (default `512M`) | @@ -221,8 +220,8 @@ the SLASH QDMA driver node must be present. Buffers are placed at `memory_base + starting-offset + position * offset`. The position sequence is `0..N-1` for single-direction phases and `0..2N-1` for bidirectional phases (reads on even positions, writes on odd positions). -`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned (or -2 MiB-aligned with `--page-size 2m`), `--offset` must be at least +`--buffer-size`, `--offset`, and `--starting-offset` must be 4 KiB-aligned, +`--offset` must be at least `--buffer-size`, and the highest buffer must fit within the 64 x 512 MB DDR/HBM address space. If any placement option is specified in default VRTD mode, `validate` uses raw VRTD buffers so the exact @@ -234,7 +233,7 @@ when HBM and DDR are both enabled, or `2 x x ` with currently available host memory. Raw transfer modes can repeat the bandwidth phases without changing buffer -placement or page size. `--bandwidth-iterations` repeats each whole-buffer +placement. `--bandwidth-iterations` repeats each whole-buffer transfer a fixed number of times, while `--bandwidth-duration` runs each bandwidth phase for a wall-clock duration and counts completed whole-buffer transfers. Integrity checks remain one-shot. 
diff --git a/smi/src/qdma_driver_backend.cpp b/smi/src/qdma_driver_backend.cpp index fd3faf50..da47d6aa 100644 --- a/smi/src/qdma_driver_backend.cpp +++ b/smi/src/qdma_driver_backend.cpp @@ -483,10 +483,10 @@ std::string QdmaDriverDevice::charDevPath(uint32_t qid) const { QdmaDriverBuffer::QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size, - raw::PageSize pageSize, int mmChannel) + int mmChannel) : device_(&device), qid_(qid), physAddr_(physAddr) { try { - mapping_ = raw::createHostMapping(size, physAddr, pageSize); + mapping_ = raw::createHostMapping(size, physAddr); // mmChannel < 0 means auto: spread the queue across channels by qid. const uint32_t channel = (mmChannel < 0) diff --git a/smi/src/qdma_driver_backend.hpp b/smi/src/qdma_driver_backend.hpp index 7694fd1b..e0d94fa8 100644 --- a/smi/src/qdma_driver_backend.hpp +++ b/smi/src/qdma_driver_backend.hpp @@ -113,7 +113,7 @@ class QdmaDriverBuffer { /// @param mmChannel Concrete MM channel to pin to, or -1 to spread the /// queue across channels by qid % channel-count. QdmaDriverBuffer(QdmaDriverDevice& device, uint32_t qid, uint64_t physAddr, uint64_t size, - raw::PageSize pageSize, int mmChannel); + int mmChannel); QdmaDriverBuffer(const QdmaDriverBuffer&) = delete; QdmaDriverBuffer& operator=(const QdmaDriverBuffer&) = delete; diff --git a/smi/src/raw_transfer.hpp b/smi/src/raw_transfer.hpp index d9edb3c1..41988d31 100644 --- a/smi/src/raw_transfer.hpp +++ b/smi/src/raw_transfer.hpp @@ -55,30 +55,15 @@ /// -DSLASH_QDMA_TIMING=1), the raw-transfer path logs the wall-clock cost of /// each pwrite/pread syscall plus the aggregate per-transfer time and /// effective bandwidth. This is the userspace counterpart to the kernel's -/// SLASH_QDMA_TIMING and libqdma's QDMA_TIMING breakdowns. +/// SLASH_QDMA_TIMING breakdown. 
#ifndef SLASH_QDMA_TIMING #define SLASH_QDMA_TIMING 0 #endif -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_2MB -#define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT) -#endif - namespace smi::raw { /// Host transfer sizes mirror libvrtd's QDMA staging policy. static constexpr uint64_t BASE_TRANSFER_STEP_SIZE = 4ULL * 1024ULL; -static constexpr uint64_t HUGE_TRANSFER_STEP_SIZE = 2ULL * 1024ULL * 1024ULL; - -/// Host staging-buffer page granule selection for raw transfers. -enum class PageSize { - Base4K, ///< Regular 4 KiB base pages. - Huge2M, ///< 2 MiB hugetlb pages; a mapping failure is fatal (no fallback). -}; [[noreturn]] inline void throwSystemError(const std::string& message) { throw std::runtime_error(message + ": " + std::strerror(errno)); @@ -86,9 +71,8 @@ enum class PageSize { /// A host staging buffer plus the DMA granule it is backed by. /// -/// `step` is HUGE_TRANSFER_STEP_SIZE when a 2 MiB hugetlb mapping succeeded, -/// otherwise BASE_TRANSFER_STEP_SIZE (4 KiB base pages). It is used only for -/// range/alignment validation: either way the whole range is transferred in a +/// `step` is always BASE_TRANSFER_STEP_SIZE (4 KiB base pages). It is used +/// only for range/alignment validation: the whole range is transferred in a /// single syscall and the kernel builds one DMA descriptor per page. struct HostMapping { void* data = nullptr; @@ -96,46 +80,22 @@ struct HostMapping { uint64_t step = 0; }; -/// Create a host staging buffer for raw transfers using the requested page -/// granule. @p pageSize selects 4 KiB base pages or 2 MiB hugetlb pages; there -/// is no fallback, so a 2 MiB request fails (throws) when hugepages cannot be -/// mapped. @p physAddr is the device address this buffer backs and is only used -/// to make error messages actionable. -inline HostMapping createHostMapping(uint64_t size, uint64_t physAddr, PageSize pageSize) { +/// Create a host staging buffer of 4 KiB base pages for raw transfers. 
@p +/// physAddr is the device address this buffer backs and is only used to make +/// error messages actionable. +inline HostMapping createHostMapping(uint64_t size, uint64_t physAddr) { HostMapping mapping; mapping.size = size; - if (pageSize == PageSize::Huge2M) { - if ((size % HUGE_TRANSFER_STEP_SIZE) != 0) { - throw std::invalid_argument( - "Raw transfer buffer size must be a multiple of 2 MiB to use 2 MiB pages"); - } - - mapping.data = mmap(nullptr, - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, - -1, - 0); - if (mapping.data == MAP_FAILED) { - char where[64]; - std::snprintf(where, sizeof(where), " at device 0x%llx", - static_cast(physAddr)); - throwSystemError(std::string("Failed to map 2 MiB hugetlb raw transfer host buffer") + - where + " (reserve 2 MiB hugepages or use --page-size 4k)"); - } - mapping.step = HUGE_TRANSFER_STEP_SIZE; - return mapping; - } - - // PageSize::Base4K: map regular base pages. MAP_POPULATE is deliberately - // omitted: it would pre-fault the whole buffer during mmap(), i.e. before - // the MADV_NOHUGEPAGE below can take effect. On hosts with transparent - // hugepages set to "always", those early faults hand back 2 MiB THP compound - // pages, and MADV_NOHUGEPAGE does not split pages that are already faulted - // in. The driver's strict 4 KiB base-page path - // (slash_qdma_map_user_base_page_to_sgl) then rejects every transfer with - // -EINVAL ("4 KiB transfer is not backed by a base page"). + // Map regular base pages. MAP_POPULATE is deliberately omitted: it would + // pre-fault the whole buffer during mmap(), i.e. before the MADV_NOHUGEPAGE + // below can take effect. On hosts with transparent hugepages set to + // "always", those early faults hand back 2 MiB THP compound pages, and + // MADV_NOHUGEPAGE does not split pages that are already faulted in. 
The + // driver's strict 4 KiB base-page path (slash_qdma_map_user_base_pages_to_sgl) + // then rejects every transfer with -EINVAL ("4 KiB transfer is not backed by + // a base page"). + (void)physAddr; mapping.data = mmap(nullptr, size, PROT_READ | PROT_WRITE, diff --git a/smi/src/smi.cpp b/smi/src/smi.cpp index bba4c3e4..e9da5e3d 100644 --- a/smi/src/smi.cpp +++ b/smi/src/smi.cpp @@ -129,16 +129,6 @@ static int smiMain(int argc, char **argv) { "Number of parallel buffers/threads (1-64)")->default_val(8)->check(CLI::Range(1u, 64u)); validateCommand->add_flag("-R,--no-reset", validateOptions.noReset, "Skip the device reset step before running memory tests"); - const std::map pageSizeMap{ - {"4k", Validate::Options::PageSize::Base4K}, - {"2m", Validate::Options::PageSize::Huge2M}, - }; - validateCommand->add_option("--page-size", validateOptions.pageSize, - "Host staging-buffer page granule for all backends: 4k (4 KiB base pages; default) " - "or 2m (2 MiB hugepages). 2m requires reserved 2 MiB hugepages and 2 MiB-aligned " - "buffer-size/offsets; the allocation fails with no fallback otherwise.") - ->transform(CLI::CheckedTransformer(pageSizeMap, CLI::ignore_case)) - ->default_str("4k"); validateCommand->add_option_function("--mm-channel", [&validateOptions](const std::string& value) { try { diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 7cdcae06..67fb5c2b 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -94,29 +94,11 @@ static constexpr uint32_t QDMA_DIR_H2C = 0x1; static constexpr uint32_t QDMA_DIR_C2H = 0x2; static constexpr uint32_t QDMA_RING_SZ_IDX = 0; -static constexpr uint64_t HUGE_PAGE_SIZE = 2ULL * 1024ULL * 1024ULL; - -/// Map the validate page-size option to the raw-transfer host mapping mode. -static smi::raw::PageSize rawPageSize(const Validate::Options& options) { - return options.pageSize == Validate::Options::PageSize::Huge2M - ? 
smi::raw::PageSize::Huge2M - : smi::raw::PageSize::Base4K; -} - -/// Map the validate page-size option to the vrtd host page mode. -static vrtd::HostPageSize vrtdPageSize(const Validate::Options& options) { - return options.pageSize == Validate::Options::PageSize::Huge2M - ? vrtd::HostPageSize::Huge2M - : vrtd::HostPageSize::Base4K; -} - -/// Required alignment for placement sizes/offsets given the selected page -/// granule: 2 MiB when hugepages are requested, otherwise the QDMA transfer -/// alignment (4 KiB). +/// Required alignment for placement sizes/offsets: the QDMA transfer alignment +/// (4 KiB base pages). static uint64_t requiredAlignment(const Validate::Options& options) { - return options.pageSize == Validate::Options::PageSize::Huge2M - ? HUGE_PAGE_SIZE - : TRANSFER_ALIGNMENT; + (void)options; + return TRANSFER_ALIGNMENT; } /// Per-buffer AXI-MM channel selection. A single-element list applies to every @@ -427,15 +409,6 @@ static void printChannelAllocation(const Validate::Options& options) { } } -/// Print which host staging-buffer page granule is in effect. -static void printPageSize(const Validate::Options& options) { - std::cout << "Host page size: " - << (options.pageSize == Validate::Options::PageSize::Huge2M - ? "2 MiB hugepages" - : "4 KiB base pages") - << std::endl; -} - /// Print the raw-transfer queue ring-size override, when one was requested. static void printRingSizeIndex(const Validate::Options& options) { if (options.ringSizeIndex.has_value()) { @@ -467,29 +440,6 @@ static bool checkHostMemoryBudget(const Validate::Options& options) { : 2ULL * options.threads; const uint64_t requiredBytes = maxConcurrentBuffers * options.bufferSize; - if (options.pageSize == Validate::Options::PageSize::Huge2M) { - // 2 MiB hugepages are reserved separately from general RAM, so check - // the hugetlb pool rather than _SC_AVPHYS_PAGES. 
- const uint64_t needed = (requiredBytes + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE; - std::ifstream freeFile("/sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages"); - uint64_t freePages = 0; - if (!freeFile.is_open() || !(freeFile >> freePages)) { - std::cerr << "Warning: unable to read 2 MiB hugepage availability; " - << "validate needs about " << needed << " free 2 MiB hugepages." - << std::endl; - return true; - } - if (freePages < needed) { - std::cerr << "validate: --page-size 2m needs about " << needed - << " free 2 MiB hugepages, but only " << freePages - << " are free. Reserve more (e.g. 'echo " << needed - << " | sudo tee /proc/sys/vm/nr_hugepages') or reduce --threads/--buffer-size." - << std::endl; - return false; - } - return true; - } - const long pageSize = sysconf(_SC_PAGESIZE); const long availablePages = sysconf(_SC_AVPHYS_PAGES); @@ -624,9 +574,9 @@ class RawQdmaDevice { class RawTransferBuffer { public: RawTransferBuffer(slash_qdma* qdma, uint64_t physAddr, uint64_t size, - smi::raw::PageSize pageSize, slash_qdma_mm_channel mmChannel, + slash_qdma_mm_channel mmChannel, uint32_t ringSizeIndex) - : qdma_{qdma}, physAddr_{physAddr}, size_{size}, pageSize_{pageSize}, + : qdma_{qdma}, physAddr_{physAddr}, size_{size}, mmChannel_{mmChannel}, ringSizeIndex_{ringSizeIndex} { try { createHostMapping(); @@ -686,7 +636,6 @@ class RawTransferBuffer { physAddr_ = other.physAddr_; size_ = other.size_; transferStepSize_ = other.transferStepSize_; - pageSize_ = other.pageSize_; mmChannel_ = other.mmChannel_; ringSizeIndex_ = other.ringSizeIndex_; bufId_ = other.bufId_; @@ -707,7 +656,7 @@ class RawTransferBuffer { } void createHostMapping() { - smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_, pageSize_); + smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_); data_ = mapping.data; transferStepSize_ = mapping.step; } @@ -799,7 +748,6 @@ class RawTransferBuffer { uint64_t physAddr_ = 0; uint64_t size_ = 0; 
uint64_t transferStepSize_ = 0; - smi::raw::PageSize pageSize_ = smi::raw::PageSize::Base4K; slash_qdma_mm_channel mmChannel_ = SLASH_QDMA_MM_CHANNEL_AUTO; uint32_t ringSizeIndex_ = QDMA_RING_SZ_IDX; uint32_t bufId_ = 0; @@ -1178,12 +1126,12 @@ static vrtd::Buffer openValidateHbmBuffer(const vrtd::Device& device, if (options.placementExplicit) { return device.openRawBuffer(addressFor(HBM_BASE, options, position), options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdMmChannel(options, position), vrtdPageSize(options)); + vrtdMmChannel(options, position)); } return device.openHbmBuffer(static_cast(position), options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdMmChannel(options, position), vrtdPageSize(options)); + vrtdMmChannel(options, position)); } static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, @@ -1192,11 +1140,11 @@ static vrtd::Buffer openValidateDdrBuffer(const vrtd::Device& device, if (options.placementExplicit) { return device.openRawBuffer(addressFor(DDR_BASE, options, position), options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdMmChannel(options, position), vrtdPageSize(options)); + vrtdMmChannel(options, position)); } return device.openDdrBuffer(options.bufferSize, vrtd::BufferAllocDir::Bidirectional, - vrtdMmChannel(options, position), vrtdPageSize(options)); + vrtdMmChannel(options, position)); } static int runRawTransferTest(const std::string& bdf, const Validate::Options& options) { @@ -1211,7 +1159,6 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o const std::string qdmaPath = resolveQdmaDevicePath(bdf); std::cout << "Using raw QDMA device " << qdmaPath << "..." 
<< std::endl; printChannelAllocation(options); - printPageSize(options); printMmChannel(options); printRingSizeIndex(options); printBandwidthRepeatMode(repeat); @@ -1226,7 +1173,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, i), ringSizeIndex); } @@ -1247,11 +1194,11 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i), ringSizeIndex); hbmWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i + 1), ringSizeIndex); } @@ -1266,7 +1213,7 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, i), ringSizeIndex); } @@ -1287,11 +1234,11 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i), ringSizeIndex); ddrWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i + 1), ringSizeIndex); } @@ -1305,12 +1252,12 @@ static int runRawTransferTest(const 
std::string& bdf, const Validate::Options& o parBuffers.reserve(2 * N); for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, i), ringSizeIndex); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, i), ringSizeIndex); } @@ -1328,21 +1275,21 @@ static int runRawTransferTest(const std::string& bdf, const Validate::Options& o for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i), ringSizeIndex); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i + 1), ringSizeIndex); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i), ringSizeIndex); parWriteBuffers.emplace_back(qdma.get(), rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, slashMmChannel(options, 2 * i + 1), ringSizeIndex); } @@ -1377,7 +1324,6 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op std::cout << "Using off-the-shelf Xilinx QDMA driver for board " << bdf << "..." 
<< std::endl; printChannelAllocation(options); - printPageSize(options); printMmChannel(options); printRingSizeIndex(options); printBandwidthRepeatMode(repeat); @@ -1400,7 +1346,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op hbmBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { hbmBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, i)); } @@ -1419,11 +1365,11 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { hbmReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i)); hbmWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i + 1)); } @@ -1438,7 +1384,7 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op ddrBuffers.reserve(N); for (unsigned i = 0; i < N; ++i) { ddrBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, i)); } @@ -1457,11 +1403,11 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { ddrReadBuffers.emplace_back(qdma, i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i)); ddrWriteBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i + 1)); } @@ -1475,12 +1421,12 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op parBuffers.reserve(2 * N); 
for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, i)); } for (unsigned i = 0; i < N; ++i) { parBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, i)); } @@ -1494,21 +1440,21 @@ static int runQdmaDriverTest(const std::string& bdf, const Validate::Options& op for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, i, rawAddressFor(HBM_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i)); parWriteBuffers.emplace_back(qdma, 2 * N + i, rawAddressFor(HBM_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i + 1)); } for (unsigned i = 0; i < N; ++i) { parReadBuffers.emplace_back(qdma, N + i, rawAddressFor(DDR_BASE, options, 2 * i), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i)); parWriteBuffers.emplace_back(qdma, 3 * N + i, rawAddressFor(DDR_BASE, options, 2 * i + 1), - options.bufferSize, rawPageSize(options), + options.bufferSize, qdmaDriverMmChannel(options, 2 * i + 1)); } @@ -1589,7 +1535,6 @@ int Validate::run(const Options& options) { vrtd::Session session; auto device = session.getDeviceByBdf(bdf); - printPageSize(options); printMmChannel(options); // -- Step 2: HBM — integrity then bandwidth -- diff --git a/smi/src/validate.hpp b/smi/src/validate.hpp index 46c6f807..ef15a174 100644 --- a/smi/src/validate.hpp +++ b/smi/src/validate.hpp @@ -57,17 +57,6 @@ class Validate { Paired, ///< Couple mm-channel to a distinct memory region: even positions -> region 0, odd -> region 1. }; - /// @brief Host staging-buffer page granule used for DMA transfers. 
- /// - /// Selects how the host-side buffer is mapped for every backend (VRTD, - /// raw SLASH, and the off-the-shelf QDMA driver). 2 MiB requires - /// reserved hugepages plus 2 MiB-aligned sizes/addresses; the allocation - /// fails with no fallback otherwise. - enum class PageSize { - Base4K, ///< 4 KiB base pages (default). - Huge2M, ///< 2 MiB hugepages. - }; - /// @brief Per-queue AXI-MM/NoC channel selection for a buffer. /// /// Auto lets the driver stripe by qid&1; Ch0/Ch1 pin the queue to a @@ -86,7 +75,6 @@ class Validate { bool hbmOnly = false; ///< Skip DDR phase (mutually exclusive with ddrOnly). bool rawTransferTest = false; ///< Use libslash raw QDMA transfers instead of VRTD buffers. bool useQdmaDriver = false; ///< Run the raw test over the off-the-shelf Xilinx QDMA driver. - PageSize pageSize = PageSize::Base4K; ///< Host staging-buffer page granule (4 KiB or 2 MiB). /// Per-buffer AXI-MM channel selection, indexed by buffer position /// modulo size (a single entry applies to every buffer). Default auto. std::vector mmChannels{MmChannel::Auto}; diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index c8dffc67..3aaa47c0 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -54,19 +54,6 @@ extern "C" { struct vrtd_buffer; -/** - * @brief Host staging-buffer page granule for DMA buffers. - * - * Selects how the client-side host buffer backing a DMA transfer is mapped. - * libvrtd mmaps the host buffer locally, so this is a client-local concept and - * is never sent to the daemon. There is no automatic fallback: requesting - * #VRTD_HOST_PAGE_2M fails the allocation when 2 MiB hugepages are unavailable. - */ -enum vrtd_host_page_mode { - VRTD_HOST_PAGE_4K = 0, ///< Regular 4 KiB base pages (transparent hugepages disabled). - VRTD_HOST_PAGE_2M = 1, ///< 2 MiB hugetlb pages; allocation fails if they cannot be mapped. 
-}; - /** * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. * @@ -352,7 +339,6 @@ enum vrtd_ret vrtd_qdma_qpair_get_fd( * @param alloc_arg Allocation argument (HBM region index for HBM). * @param size_in Requested size in bytes. * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). - * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output pointer to receive the allocated buffer handle. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. @@ -367,7 +353,6 @@ enum vrtd_ret vrtd_buffer_open( uint64_t alloc_arg, uint64_t size_in, enum vrtd_mm_channel mm_channel, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); @@ -383,7 +368,6 @@ enum vrtd_ret vrtd_buffer_open( * @param size Size in bytes. * @param alloc_dir One of #vrtd_alloc_dir. * @param mm_channel AXI-MM/NoC channel selection (one of enum vrtd_mm_channel). - * @param page_mode Host staging-buffer page granule (one of enum vrtd_host_page_mode). * @param buffer_out Output parameter set to the new buffer handle on success. * * @return #VRTD_RET_OK on success; otherwise a #vrtd_ret error code. @@ -397,7 +381,6 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t size, uint32_t alloc_dir, enum vrtd_mm_channel mm_channel, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); @@ -553,7 +536,7 @@ struct vrtd_buffer { uint32_t buf_id; enum slash_qdma_transfer_hint transfer_hint; void *buf; - /* Internal DMA granule for the local host mapping: 4096 or 2 MiB. */ + /* Internal DMA granule for the local host mapping (4 KiB base pages). 
*/ uint64_t transfer_step_size; }; @@ -567,7 +550,6 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t phys_addr, const int *qpair_fds, uint32_t qpair_fd_count, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ); diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index c25697b6..e63ad1e3 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -24,11 +24,9 @@ * DMA buffer lifecycle management for the vrtd C client library. * * Buffers are host-side memory regions used for DMA transfers to/from - * the FPGA. Each buffer is backed by an anonymous mmap whose page granule - * (4 KiB base pages or 2 MiB hugepages) is selected explicitly by the caller - * via enum vrtd_host_page_mode -- there is no automatic fallback -- and - * associated with a QDMA queue pair fd for performing the actual H2C / C2H - * transfers. + * the FPGA. Each buffer is backed by an anonymous mmap of 4 KiB base pages + * (transparent hugepages disabled) and associated with a QDMA queue pair fd + * for performing the actual H2C / C2H transfers. * * Sync operations (sync_to_device / sync_from_device) accept arbitrary * in-buffer ranges. Internally, the QDMA fd requires page-aligned transfer @@ -63,16 +61,7 @@ #include -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_2MB -#define MAP_HUGE_2MB (21UL << MAP_HUGE_SHIFT) -#endif - #define BASE_TRANSFER_STEP_SIZE (4ULL * 1024ULL) // 4K -#define HUGE_TRANSFER_STEP_SIZE (2ULL * 1024ULL * 1024ULL) // 2M /* * Per-sync timing instrumentation. @@ -81,7 +70,7 @@ * -DSLASH_QDMA_TIMING=1), the sync_to/from_device paths log the wall-clock * cost of each transfer ioctl plus the aggregate per-sync time and * effective bandwidth. This is the userspace counterpart to the kernel's - * SLASH_QDMA_TIMING and libqdma's QDMA_TIMING breakdowns. + * SLASH_QDMA_TIMING breakdown. 
*/ #ifndef SLASH_QDMA_TIMING #define SLASH_QDMA_TIMING 0 @@ -271,7 +260,6 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t phys_addr, const int *qpair_fds, uint32_t qpair_fd_count, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { if (buffer_out == NULL) { @@ -296,73 +284,27 @@ enum vrtd_ret vrtd_buffer_create_raw( return VRTD_RET_BAD_LIB_CALL; } - if (page_mode == VRTD_HOST_PAGE_2M) { - /* - * Explicit 2 MiB hugetlb request: there is no fallback. The DMA - * granule and the device address must both be 2 MiB aligned, and the - * hugetlb mapping must succeed, otherwise the allocation fails so the - * caller can react instead of silently transferring over 4 KiB pages. - */ - if ((size % HUGE_TRANSFER_STEP_SIZE) != 0 || - (phys_addr % HUGE_TRANSFER_STEP_SIZE) != 0) { - free(buffer); - return VRTD_RET_INVALID_ARGUMENT; - } - - buffer->buf = mmap( - NULL, /* address (let the kernel choose) */ - size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_2MB | MAP_POPULATE, - -1, /* fd */ - 0 /* offset */ - ); - if (buffer->buf == MAP_FAILED) { - int huge_errno = errno; - syslog( - LOG_ERR, - "libvrtd: 2 MiB hugetlb mapping failed for buffer size=%llu phys_addr=0x%llx errno=%d; " - "reserve 2 MiB hugepages or request 4 KiB pages", - (unsigned long long)size, - (unsigned long long)phys_addr, - huge_errno - ); - free(buffer); - return VRTD_RET_INTERNAL_ERROR; - } - buffer->transfer_step_size = HUGE_TRANSFER_STEP_SIZE; -#if SLASH_QDMA_TIMING - syslog( - LOG_INFO, - "libvrtd: buffer host mapping path=hugetlb-2m size=%llu phys_addr=0x%llx step=%llu", - (unsigned long long)size, - (unsigned long long)phys_addr, - (unsigned long long)buffer->transfer_step_size - ); -#endif - } else { - /* - * Explicit 4 KiB base-page request. 
Do not use MAP_POPULATE before - * MADV_NOHUGEPAGE: THP=always can fault compound pages before the - * advice takes effect, and the kernel QDMA base-page path intentionally - * rejects those pages (vrtd_mmap_regular_base_pages handles this). - */ - int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); - if (mmap_ret != 0) { - free(buffer); - return VRTD_RET_INTERNAL_ERROR; - } - buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; + /* + * Host staging buffer is always 4 KiB base pages. Do not use MAP_POPULATE + * before MADV_NOHUGEPAGE: THP=always can fault compound pages before the + * advice takes effect, and the kernel QDMA base-page path intentionally + * rejects those pages (vrtd_mmap_regular_base_pages handles this). + */ + int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); + if (mmap_ret != 0) { + free(buffer); + return VRTD_RET_INTERNAL_ERROR; + } + buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; #if SLASH_QDMA_TIMING - syslog( - LOG_INFO, - "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu", - (unsigned long long)size, - (unsigned long long)phys_addr, - (unsigned long long)buffer->transfer_step_size - ); + syslog( + LOG_INFO, + "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu", + (unsigned long long)size, + (unsigned long long)phys_addr, + (unsigned long long)buffer->transfer_step_size + ); #endif - } buffer->sock_fd = sock_fd; buffer->dev = dev; diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index 8a1b1b09..1cb43997 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -554,7 +554,6 @@ enum vrtd_ret vrtd_buffer_open( uint64_t alloc_arg, uint64_t size_in, enum vrtd_mm_channel mm_channel, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { @@ -604,7 +603,6 @@ enum vrtd_ret vrtd_buffer_open( resp.phys_addr, qpair_fds, resp.qpair_fd_count, - page_mode, buffer_out ); if (ret != 
VRTD_RET_OK) { @@ -626,7 +624,6 @@ enum vrtd_ret vrtd_buffer_open_raw( uint64_t size, uint32_t alloc_dir, enum vrtd_mm_channel mm_channel, - enum vrtd_host_page_mode page_mode, struct vrtd_buffer **buffer_out ) { @@ -675,7 +672,6 @@ enum vrtd_ret vrtd_buffer_open_raw( phys_addr, qpair_fds, resp.qpair_fd_count, - page_mode, buffer_out ); if (ret != VRTD_RET_OK) { diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp index e441a3fd..b2569d91 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/buffer.hpp @@ -48,17 +48,6 @@ enum class BufferAllocDir : uint32_t { DeviceToHost = VRTD_ALLOC_DIR_DEVICE_TO_HOST, }; -/** - * @brief Host staging-buffer page granule for a buffer's DMA mapping. - * - * Mirrors @c vrtd_host_page_mode (values must stay in sync). @c Huge2M fails - * the allocation, with no fallback, when 2 MiB hugepages cannot be mapped. - */ -enum class HostPageSize : uint32_t { - Base4K = 0, ///< Regular 4 KiB base pages. - Huge2M = 1, ///< 2 MiB hugetlb pages; allocation fails if unavailable. -}; - /** * @brief AXI-MM / NoC channel selection for a buffer's QDMA queue pair. * diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp index 5af4fef2..5a220075 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/device.hpp @@ -159,7 +159,6 @@ class Device { * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). - * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. 
*/ @@ -167,16 +166,14 @@ class Device { uint64_t size, uint64_t allocArg = 0, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, - MmChannel mmChannel = MmChannel::Auto, - HostPageSize page = HostPageSize::Base4K) const; + MmChannel mmChannel = MmChannel::Auto) const; /** * @brief Convenience helper for DDR allocations. */ Buffer openDdrBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, - MmChannel mmChannel = MmChannel::Auto, - HostPageSize page = HostPageSize::Base4K) const { - return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, mmChannel, page); + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::Ddr, size, 0, allocDir, mmChannel); } /** @@ -185,9 +182,8 @@ class Device { Buffer openHbmBuffer(uint32_t region, uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, - MmChannel mmChannel = MmChannel::Auto, - HostPageSize page = HostPageSize::Base4K) const { - return openBuffer(BufferAllocType::Hbm, size, region, allocDir, mmChannel, page); + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::Hbm, size, region, allocDir, mmChannel); } /** @@ -195,9 +191,8 @@ class Device { */ Buffer openHbmVnocBuffer(uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, - MmChannel mmChannel = MmChannel::Auto, - HostPageSize page = HostPageSize::Base4K) const { - return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, mmChannel, page); + MmChannel mmChannel = MmChannel::Auto) const { + return openBuffer(BufferAllocType::HbmVnoc, size, 0, allocDir, mmChannel); } /** @@ -210,15 +205,13 @@ class Device { * @param size Size in bytes. * @param allocDir QDMA transfer direction. * @param mmChannel AXI-MM/NoC channel selection (defaults to auto). - * @param page Host staging-buffer page granule (defaults to 4 KiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. 
*/ Buffer openRawBuffer(uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir = BufferAllocDir::Bidirectional, - MmChannel mmChannel = MmChannel::Auto, - HostPageSize page = HostPageSize::Base4K) const; + MmChannel mmChannel = MmChannel::Auto) const; /** * @brief Perform a PCIe hotplug operation for this device. @@ -365,8 +358,8 @@ class Device { uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -384,8 +377,8 @@ class Device { std::function fGetBar; std::function fCreateQdmaQpair; - std::function fOpenBuffer; - std::function fOpenBufferRaw; + std::function fOpenBuffer; + std::function fOpenBufferRaw; std::function fHotplugOp; std::function fDesignWrite; std::function fDesignWriteFile; diff --git a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp index 32a7ae88..422160c9 100644 --- a/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp +++ b/vrt/vrtd/libvrtdpp/include/vrtd/session.hpp @@ -191,7 +191,6 @@ class Session { * @param allocArg Allocation argument (HBM region index for HBM). * @param allocDir QDMA transfer direction. * @param mmChannel AXI-MM/NoC channel selection for the queue pair. - * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. * @throws vrtd::Error on error. */ @@ -201,8 +200,7 @@ class Session { uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize pageSize + MmChannel mmChannel ) const; /** @@ -213,7 +211,6 @@ class Session { * @param size Size in bytes. * @param allocDir QDMA transfer direction. * @param mmChannel AXI-MM/NoC channel selection for the queue pair. - * @param pageSize Host staging-buffer page granule (4 KiB or 2 MiB). * @return An owning @c Buffer. 
* @throws vrtd::Error on error. */ @@ -222,8 +219,7 @@ class Session { uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize pageSize + MmChannel mmChannel ) const; /** diff --git a/vrt/vrtd/libvrtdpp/src/device.cpp b/vrt/vrtd/libvrtdpp/src/device.cpp index 37a09120..6fa00791 100644 --- a/vrt/vrtd/libvrtdpp/src/device.cpp +++ b/vrt/vrtd/libvrtdpp/src/device.cpp @@ -31,8 +31,8 @@ Device::Device(uint32_t num, uint16_t subsystemDeviceId, std::function fGetBar, std::function fCreateQdmaQpair, - std::function fOpenBuffer, - std::function fOpenBufferRaw, + std::function fOpenBuffer, + std::function fOpenBufferRaw, std::function fHotplugOp, std::function fDesignWrite, std::function fDesignWriteFile, @@ -98,17 +98,15 @@ Buffer Device::openBuffer(BufferAllocType allocType, uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize page) const { - return fOpenBuffer(*this, allocType, size, allocArg, allocDir, mmChannel, page); + MmChannel mmChannel) const { + return fOpenBuffer(*this, allocType, size, allocArg, allocDir, mmChannel); } Buffer Device::openRawBuffer(uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize page) const { - return fOpenBufferRaw(*this, phys_addr, size, allocDir, mmChannel, page); + MmChannel mmChannel) const { + return fOpenBufferRaw(*this, phys_addr, size, allocDir, mmChannel); } void Device::hotplugOp(HotplugOp op, uint8_t function) const { diff --git a/vrt/vrtd/libvrtdpp/src/session.cpp b/vrt/vrtd/libvrtdpp/src/session.cpp index 6799ee6e..7bbda0bc 100644 --- a/vrt/vrtd/libvrtdpp/src/session.cpp +++ b/vrt/vrtd/libvrtdpp/src/session.cpp @@ -132,11 +132,11 @@ Device Session::getDevice(size_t i) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& 
device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm, HostPageSize page) { - return openBuffer(device, type, size, arg, dir, mm, page); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) { + return openBuffer(device, type, size, arg, dir, mm); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm, HostPageSize page) { - return openBufferRaw(device, phys_addr, size, dir, mm, page); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) { + return openBufferRaw(device, phys_addr, size, dir, mm); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& device, int input_fd) { return designWrite(device, input_fd); }, @@ -197,11 +197,11 @@ Device Session::getDeviceByBdf(std::string_view bdf) const { info.pci.subsystem_device_id, [&](const Device& device, uint8_t num) { return getBar(device, num); }, [&](const Device& device, const slash_qdma_qpair_add& cfg) { return createQdmaQpair(device, cfg); }, - [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm, HostPageSize page) { - return openBuffer(device, type, size, arg, dir, mm, page); + [&](const Device& device, BufferAllocType type, uint64_t size, uint64_t arg, BufferAllocDir dir, MmChannel mm) { + return openBuffer(device, type, size, arg, dir, mm); }, - [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm, HostPageSize page) { - return openBufferRaw(device, phys_addr, size, dir, mm, page); + [&](const Device& device, uint64_t phys_addr, uint64_t size, BufferAllocDir dir, MmChannel mm) { + return openBufferRaw(device, phys_addr, size, dir, mm); }, [&](const Device& device, HotplugOp op, uint8_t function) { return hotplugOp(device, op, function); }, [&](const Device& 
device, int input_fd) { return designWrite(device, input_fd); }, @@ -290,8 +290,7 @@ Buffer Session::openBuffer( uint64_t size, uint64_t allocArg, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize pageSize + MmChannel mmChannel ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -307,7 +306,6 @@ Buffer Session::openBuffer( allocArg, size, static_cast(static_cast(mmChannel)), - static_cast(static_cast(pageSize)), &raw ); if (ret != VRTD_RET_OK) { @@ -326,8 +324,7 @@ Buffer Session::openBufferRaw( uint64_t phys_addr, uint64_t size, BufferAllocDir allocDir, - MmChannel mmChannel, - HostPageSize pageSize + MmChannel mmChannel ) const { if (isClosed()) { throw Error(VRTD_RET_BAD_LIB_CALL); @@ -342,7 +339,6 @@ Buffer Session::openBufferRaw( size, static_cast(allocDir), static_cast(static_cast(mmChannel)), - static_cast(static_cast(pageSize)), &raw ); if (ret != VRTD_RET_OK) { From 817670d6b71f0cb86030f84ef7648062c2756245 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 17 Jun 2026 12:07:08 +0100 Subject: [PATCH 22/23] Changed API to kernel allocated-buffers Signed-off-by: Vlad-Gabriel Serbu --- docs/reference/kernel-abi/index.rst | 232 +-- driver/Makefile | 16 + driver/kcompat/uring_cmd.c | 78 + driver/kcompat/uring_sqe_cmd.c | 59 + driver/libslash/README.md | 20 +- driver/libslash/include/slash/qdma.h | 136 +- .../include/slash/uapi/slash_interface.h | 151 +- driver/libslash/src/qdma.c | 394 +++-- driver/libslash/src/qdma_mock.c | 136 +- driver/libslash/src/qdma_mock.h | 11 +- driver/libslash/tests/qdma_test.cpp | 191 +- driver/slash_qdma.c | 1571 ++++++++++------- driver/tests/test_slash_qdma.c | 573 +++--- smi/src/validate.cpp | 67 +- vrt/src/qdma/qdma_intf.cpp | 45 +- vrt/vrtd/include/vrtd/wire.h | 27 +- vrt/vrtd/libvrtd/include/vrtd/vrtd.h | 16 +- vrt/vrtd/libvrtd/src/buffer.c | 306 ++-- vrt/vrtd/libvrtd/src/requests.c | 56 +- vrt/vrtd/libvrtd/src/v80_policy.h | 49 +- vrt/vrtd/libvrtdpp/src/buffer.cpp | 6 +- 
vrt/vrtd/src/buffer.c | 50 +- vrt/vrtd/src/buffer.h | 9 +- vrt/vrtd/src/serve.c | 40 +- vrt/vrtd/tests/buffer_test.cpp | 31 +- vrt/vrtd/tests/v80_policy_test.cpp | 22 +- 26 files changed, 2385 insertions(+), 1907 deletions(-) create mode 100644 driver/kcompat/uring_cmd.c create mode 100644 driver/kcompat/uring_sqe_cmd.c diff --git a/docs/reference/kernel-abi/index.rst b/docs/reference/kernel-abi/index.rst index b4bd151a..25045cc5 100644 --- a/docs/reference/kernel-abi/index.rst +++ b/docs/reference/kernel-abi/index.rst @@ -381,38 +381,46 @@ length, and direction. Full lifecycle: }; int io_fd = ioctl(qdma_fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &fd_req); - /* Step 4: Register a page-aligned host buffer. */ - struct slash_qdma_buf_register reg = { - .size = sizeof(reg), .user_addr = (uintptr_t)host_buf, .length = nbytes - }; - ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &reg); - - /* Step 5: H2C transfer to device address 0x4000000000 */ + /* Step 4: Create a kernel-owned DMA buffer and mmap it for CPU access. + * The buffer fd is returned by the ioctl; the kernel allocated the pages, + * built the SGL, and DMA-mapped everything once. */ + struct slash_qdma_buf_create bc = { .size = sizeof(bc), .length = nbytes }; + int buf_fd = ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &bc); + void *host_buf = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, + buf_fd, 0); + + /* Step 5: H2C transfer to device address 0x4000000000. The transfer + * carries an array of per-qpair sub-transfers; a single-channel fd uses + * one sub-transfer with qpair_index 0. 
*/ struct slash_qdma_transfer xfer = { .size = sizeof(xfer), - .buf_id = reg.buf_id, - .buf_offset = 0, - .dev_addr = 0x4000000000LL, - .length = nbytes, - .direction = SLASH_QDMA_XFER_H2C, + .count = 1, + .xfers[0] = { + .qpair_index = 0, + .direction = SLASH_QDMA_XFER_H2C, + .buf_fd = buf_fd, + .buf_offset = 0, + .dev_addr = 0x4000000000LL, + .length = nbytes, + }, }; ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); /* Step 6: C2H transfer from device address 0x4000000000 */ - xfer.direction = SLASH_QDMA_XFER_C2H; + xfer.xfers[0].direction = SLASH_QDMA_XFER_C2H; ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &xfer); - /* Step 7: Teardown */ - struct slash_qdma_buf_unregister unreg = { - .size = sizeof(unreg), .buf_id = reg.buf_id - }; - ioctl(io_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &unreg); + /* Step 7: Teardown — closing the buffer fd (after munmap) releases it. */ + munmap(host_buf, nbytes); + close(buf_fd); close(io_fd); op.op = 1; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* STOP */ op.op = 2; ioctl(qdma_fd, SLASH_QDMA_IOCTL_Q_OP, &op); /* DEL */ The qpair fd does **not** support ``read``, ``write``, ``pread``, ``pwrite``, ``mmap``, -``poll``/``select``, or ``splice`` for data movement. +``poll``/``select``, or ``splice`` for data movement. Buffer fds returned by +``SLASH_QDMA_IOCTL_BUF_CREATE`` **are** mappable with ``mmap`` (full length, +offset 0). All transfers are synchronous and block until the transfer completes or times out. The timeout is **10 seconds**; after expiry the call returns ``-ETIME``. Partial transfers are possible; the @@ -685,33 +693,43 @@ removed. ``SLASH_QDMA_IOCTL_QPAIR_GET_FD`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Creates a new file descriptor for data transfer on an existing queue pair. The returned fd is -ioctl-only for data movement: it supports buffer register/unregister and transfer ioctls, but not -``read``, ``write``, ``pread``, ``pwrite``, ``mmap``, ``poll``/``select``, or ``splice``. 
Multiple -fds can be obtained for the same qpair via multiple calls. The fd is returned as the ``ioctl()`` -return value. +Creates a new file descriptor for data transfer. The fd is a **collection of one or two queue +pairs** (typically one per AXI-MM/NoC channel): a transfer issued on it selects a bound queue pair +by index, so one transfer ioctl can fan across both channels. The returned fd is ioctl-only for +data movement: it supports the buffer-create and transfer ioctls, but not ``read``, +``write``, ``pread``, ``pwrite``, ``mmap``, ``poll``/``select``, or ``splice`` (an optional +``io_uring`` ``uring_cmd`` async transfer path is available on capable kernels). Multiple fds can +be obtained for the same qpair(s) via multiple calls. The fd is returned as the ``ioctl()`` return +value. **Interface:** .. code-block:: c + #define SLASH_QDMA_FD_MAX_QPAIRS 2u + #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request) struct slash_qdma_qpair_fd_request { - __u32 size; /* [in/out] ABI version */ - __u32 qid; /* [in] Queue pair ID (must exist and be non-empty) */ - __u32 flags; /* [in] fd flags: only O_CLOEXEC is honoured */ + __u32 size; /* [in/out] ABI version */ + __u32 qid; /* [in] Legacy single qpair ID; used when qpair_count == 0 */ + __u32 flags; /* [in] fd flags: only O_CLOEXEC is honoured */ + __u32 qpair_count; /* [in] Number of qpair_ids (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use qid */ + __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /* [in] qpair IDs; index == qpair_index */ }; -**Direction:** ``_IOWR`` — userspace writes ``qid`` and ``flags``; the kernel returns the new fd -as the ``ioctl()`` return value (not as a struct field). +**Direction:** ``_IOWR`` — userspace writes the qpair selection and ``flags``; the kernel returns +the new fd as the ``ioctl()`` return value (not as a struct field). 
**Preconditions:** -- ``size`` must cover at least ``flags`` (the trailing input field) — otherwise ``-EINVAL`` -- ``qid`` must refer to an existing, non-empty queue pair +- ``size`` must cover at least ``flags`` (the trailing input field of the legacy form) — otherwise ``-EINVAL`` +- The selected queue pairs must exist and be non-empty (``qpair_count == 0`` selects the single ``qid``) +- ``qpair_count`` must not exceed ``SLASH_QDMA_FD_MAX_QPAIRS`` - ``flags & ~O_CLOEXEC == 0`` (any other bits cause ``-EINVAL``) -- The queue pair should be in the started state for I/O to work +- The queue pairs should be in the started state for I/O to work +- Each bound qpair keeps the per-qpair configuration (``mm_channel``, ring sizes, directions) it was + given at ``QPAIR_ADD`` time, so the two channels can be configured independently **Postconditions:** @@ -729,36 +747,43 @@ as the ``ioctl()`` return value (not as a struct field). - ``-ENOMEM`` — allocation failure - Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()`` -``SLASH_QDMA_IOCTL_BUF_REGISTER`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``SLASH_QDMA_IOCTL_BUF_CREATE`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Registers a host buffer for DMA. The ioctl may be issued on either the QDMA control fd or a qpair -fd derived from that control fd; both resolve to the same client-scoped buffer table. The kernel -pins the backing pages, builds a scatter-gather list, and DMA-maps it **once**. Subsequent transfers -reference the buffer by ``buf_id`` and reuse the cached, pre-DMA-mapped SGL instead of pinning and -mapping per transfer. Registered buffers are owned by the shared client context and are -auto-released when the final fd referencing that context is closed (including on process exit). +Creates a kernel-owned DMA buffer and returns a mappable fd for it. The ioctl may be issued on +either the QDMA control fd or a qpair fd of the same device. 
The kernel allocates ``length`` bytes +as a set of 4 KiB base pages (not physically contiguous), builds the transfer scatter-gather list, +and DMA-maps every page **once** — so the steady-state transfer path only slices the prebuilt SGL, +syncs the touched pages, and submits. Userspace maps the returned fd with ``mmap`` to obtain a CPU +pointer and passes the fd in ``struct slash_qdma_subxfer`` to move data. The buffer is bound to the +fd's QDMA device; transfers must use a qpair fd of that same device. **Interface:** .. code-block:: c - #define SLASH_QDMA_IOCTL_BUF_REGISTER _IOWR('v', 0x54, struct slash_qdma_buf_register) + #define SLASH_QDMA_IOCTL_BUF_CREATE _IOWR('v', 0x54, struct slash_qdma_buf_create) - struct slash_qdma_buf_register { - __u32 size; /* [in/out] ABI version */ - __u32 flags; /* [in] Reserved; must be 0 */ - __u64 user_addr; /* [in] Page-aligned host buffer base */ - __u64 length; /* [in] Buffer length in bytes (page multiple) */ - __u32 buf_id; /* [out] Kernel-assigned buffer handle */ + struct slash_qdma_buf_create { + __u32 size; /* [in/out] ABI version */ + __u32 flags; /* [in] Only O_CLOEXEC is honoured */ + __u64 length; /* [in] Buffer length in bytes (page multiple) */ + __u32 granule; /* [out] Bytes per SGL descriptor (host page size) */ __u32 transfer_hint; /* [out] enum slash_qdma_transfer_hint */ }; -**Direction:** ``_IOWR`` — issued on the control fd or a qpair fd. Userspace writes ``flags``, -``user_addr``, ``length``; the kernel writes back ``buf_id`` and ``transfer_hint``. +**Direction:** ``_IOWR`` — issued on the control fd or a qpair fd. Userspace writes ``flags`` and +``length``; the kernel writes back ``granule`` and ``transfer_hint`` and returns the new buffer fd +as the ``ioctl()`` return value (same convention as the BAR/queue-pair fd ioctls). 
+ +The returned fd: + +- is ``mmap``-able (full length, offset 0, ``MAP_SHARED``) for CPU access to the buffer; +- releases the buffer when it (and any mapping) is closed — there is no explicit unregister ioctl; +- keeps its pages (and DMA mapping) alive as long as either the fd or any mapping exists. ``transfer_hint`` is advisory and tells userspace which queue topology the kernel expects to be -best for this registered buffer on the current hardware. Current SLASH hardware returns +best for this buffer on the current hardware. Current SLASH hardware returns ``SLASH_QDMA_TRANSFER_HINT_V80``; userspace may ignore this value. Known values are: .. code-block:: c @@ -778,60 +803,38 @@ traffic on a single queue. **Preconditions:** - ``size`` must cover at least ``length`` (the trailing input field) — otherwise ``-EINVAL`` -- ``flags`` must be 0 -- ``user_addr`` must be page-aligned; ``length`` must be a non-zero multiple of the page size -- The buffer must be backed by 4 KiB base pages +- ``flags`` must contain only ``O_CLOEXEC`` +- ``length`` must be a non-zero multiple of the page size **Postconditions:** -- ``buf_id`` is filled with the client-scoped handle, used in ``SLASH_QDMA_QPAIR_IOCTL_TRANSFER``. -- ``transfer_hint`` is filled with an advisory transfer topology hint. Current SLASH hardware - returns ``SLASH_QDMA_TRANSFER_HINT_V80``. -- The pages remain pinned and DMA-mapped until the buffer is unregistered or the owning control fd - is closed. 
+- the ``ioctl()`` return value is the new buffer fd (``>= 0``) +- ``granule`` is the per-descriptor page size (4 KiB); ``transfer_hint`` is an advisory topology hint +- the pages stay allocated and DMA-mapped until the fd and all mappings are closed and no transfer + is in flight **Return values:** -- ``0`` — success +- ``>= 0`` — the new buffer fd (success) - ``-EFAULT`` — copy failure -- ``-EINVAL`` — ``size`` too small, non-zero ``flags``, misaligned/zero ``length`` or ``user_addr``, - or a page granule that does not match the transfer data path -- ``-ENOMEM`` — allocation, pinning, or DMA-mapping failure -- ``-EBUSY`` — no buffer IDs available +- ``-EINVAL`` — ``size`` too small, unsupported ``flags`` bits, or misaligned/zero ``length`` +- ``-ENOMEM`` — page allocation or DMA-mapping failure - ``-ENODEV`` — device shutting down +- Other negative errno from ``anon_inode_getfile()`` or ``get_unused_fd_flags()`` -``SLASH_QDMA_IOCTL_BUF_UNREGISTER`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Removes a registered buffer from the owning client's table. This ioctl may be issued on the same -control fd used for registration or on any qpair fd derived from that client context. The pages are -unpinned and the DMA mapping torn down once no in-flight transfer still references the buffer. - -**Interface:** - -.. code-block:: c - - #define SLASH_QDMA_IOCTL_BUF_UNREGISTER _IOWR('v', 0x55, struct slash_qdma_buf_unregister) - - struct slash_qdma_buf_unregister { - __u32 size; /* [in/out] ABI version */ - __u32 buf_id; /* [in] Buffer handle from BUF_REGISTER */ - }; - -**Return values:** - -- ``0`` — success -- ``-EFAULT`` — copy failure -- ``-EINVAL`` — ``size`` too small -- ``-ENOENT`` — ``buf_id`` not found in this client's table +The ``'v'`` ``0x55`` ioctl number is reserved (it was the removed +``SLASH_QDMA_IOCTL_BUF_UNREGISTER``; kernel buffers are now released by closing the fd). 
``SLASH_QDMA_QPAIR_IOCTL_TRANSFER`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Performs a DMA transfer using a registered buffer. Unlike ``read``/``write``/``pread``/``pwrite``, +Performs a DMA transfer batch using kernel buffers. Unlike ``read``/``write``/``pread``/``pwrite``, this ioctl is issued on a **queue-pair I/O fd** (from ``SLASH_QDMA_IOCTL_QPAIR_GET_FD``), not the -control device. No pages are pinned or DMA-mapped on this path — that work was amortised at -registration time — so it submits the cached, pre-DMA-mapped SGL slice directly. +control device. The transfer carries an array of per-qpair sub-transfers; sub-transfers that target +distinct queue pairs are submitted **concurrently** (all but the last asynchronously, the last +blocking, then awaited), so a single ioctl can drive both NoC channels in parallel. No pages are +allocated or DMA-mapped on this path — that work was amortised at ``BUF_CREATE`` time — so each +sub-transfer syncs and submits the cached, pre-DMA-mapped SGL slice directly. 
**Interface:** @@ -839,35 +842,50 @@ registration time — so it submits the cached, pre-DMA-mapped SGL slice directl #define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) - struct slash_qdma_transfer { - __u32 size; /* [in/out] ABI version */ - __u32 buf_id; /* [in] Registered buffer handle */ - __u64 buf_offset; /* [in] Byte offset within the registered buffer */ - __u64 dev_addr; /* [in] Device-side (endpoint) address */ - __u64 length; /* [in] Number of bytes to transfer */ + struct slash_qdma_subxfer { + __u32 qpair_index; /* [in] Index into the fd's bound qpairs */ __u32 direction; /* [in] 1=H2C (write), 2=C2H (read) */ + __s32 buf_fd; /* [in] Kernel buffer fd from BUF_CREATE */ __u32 pad0; /* padding */ + __u64 buf_offset; /* [in] Byte offset within the buffer */ + __u64 dev_addr; /* [in] Device-side (endpoint) address */ + __u64 length; /* [in] Number of bytes to transfer */ }; -**Direction:** ``_IOWR`` — userspace writes all input fields; the number of bytes transferred is -returned as the ``ioctl()`` return value (not as a struct field). + struct slash_qdma_transfer { + __u32 size; /* [in/out] ABI version */ + __u32 count; /* [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS) */ + struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS]; + }; + +**Direction:** ``_IOWR`` — userspace writes all input fields; the total number of bytes transferred +across all sub-transfers is returned as the ``ioctl()`` return value (not as a struct field). 
**Preconditions:** -- ``size`` must cover at least ``direction`` (the trailing input field) — otherwise ``-EINVAL`` -- ``direction`` must be 1 (H2C) or 2 (C2H) and must be enabled on the queue pair -- ``buf_id`` must refer to a buffer registered on the same control fd that created this qpair fd -- ``buf_offset`` and ``length`` must be aligned to the buffer's page granule, ``length`` non-zero, - and ``buf_offset + length`` must not exceed the registered length +- ``size`` must cover at least ``count`` (the trailing header field) — otherwise ``-EINVAL`` +- ``count`` must be in ``[1, SLASH_QDMA_FD_MAX_QPAIRS]`` +- each sub-transfer's ``qpair_index`` must be less than the number of qpairs the fd owns +- each ``direction`` must be 1 (H2C) or 2 (C2H) and must be enabled on the selected queue pair +- each ``buf_fd`` must be a buffer fd (from ``BUF_CREATE``) bound to the same device as this qpair fd +- each ``buf_offset`` and ``length`` must be aligned to the buffer's page granule, ``length`` non-zero + and ``<= UINT_MAX``, and ``buf_offset + length`` must not exceed the buffer length **Return values:** -- ``>= 0`` — number of bytes transferred (success) +- ``>= 0`` — total number of bytes transferred (success) - ``-EFAULT`` — copy failure -- ``-EINVAL`` — ``size`` too small, bad ``direction``, or an out-of-range / misaligned slice -- ``-ENOENT`` — ``buf_id`` not found +- ``-EBADF`` — a ``buf_fd`` is not a valid open fd +- ``-EINVAL`` — ``size``/``count`` invalid, bad ``qpair_index``/``direction``, a ``buf_fd`` that is not + a SLASH buffer or belongs to another device, or an out-of-range / misaligned slice - ``-ENODEV`` — device shutting down or the requested direction is not enabled on the qpair -- Other negative errno from libqdma's ``qdma_request_submit()`` (the first sub-transfer error wins) + +An optional asynchronous form of this transfer is exposed via ``io_uring`` ``uring_cmd`` (opcode 
+``SLASH_QDMA_URING_CMD_TRANSFER``), available only on kernels built with ``CONFIG_IO_URING`` and +``uring_cmd`` support. The SQE inline command carries a single ``__u64`` userspace pointer to a +``struct slash_qdma_transfer``; the completion CQE ``res`` holds the total bytes transferred or a +negative errno. This lets many buffer transfers be kept in flight from a single thread. Device resets and hotplugging: ``/dev/slash_hotplug`` ===================================================== diff --git a/driver/Makefile b/driver/Makefile index 07d36965..ac28900e 100644 --- a/driver/Makefile +++ b/driver/Makefile @@ -62,6 +62,8 @@ SLASH_QDMA_TIMING ?= 0 # absent, the legacy form is the unconditional fallback in slash_compat.h. SLASH_HAVE_VM_FLAGS_SET ?= n SLASH_HAVE_MODULE_IMPORT_NS_TOKEN ?= n +SLASH_HAVE_URING_CMD ?= n +SLASH_HAVE_URING_SQE_CMD ?= n # Set GCOV=1 to instrument the module for kernel gcov coverage. # Not set by default — never enable this in production builds. @@ -94,6 +96,20 @@ ifeq ($(SLASH_HAVE_MODULE_IMPORT_NS_TOKEN),y) ccflags-y += -DSLASH_HAVE_MODULE_IMPORT_NS_TOKEN endif +# Optional io_uring uring_cmd async transfer path. Probed by kcompat; absent on +# kernels without CONFIG_IO_URING or uring_cmd support (e.g. RHEL 9, Ubuntu +# 22.04 GA), where the synchronous transfer ioctl remains the only path. +ifeq ($(SLASH_HAVE_URING_CMD),y) +ccflags-y += -DSLASH_HAVE_URING_CMD +endif + +# Selects the io_uring SQE payload accessor: io_uring_sqe_cmd(cmd->sqe) when +# present (newer kernels + distro backports), else cmd->cmd. Only meaningful +# when SLASH_HAVE_URING_CMD is also set. +ifeq ($(SLASH_HAVE_URING_SQE_CMD),y) +ccflags-y += -DSLASH_HAVE_URING_SQE_CMD +endif + # Force-include the compat header into every TU (including the pinned libqdma # submodule sources we don't modify) so kernel-API shims such as from_timer() # reach third-party code too. Safe on all kernels: the shims are guarded. 
diff --git a/driver/kcompat/uring_cmd.c b/driver/kcompat/uring_cmd.c new file mode 100644 index 00000000..21e9ef93 --- /dev/null +++ b/driver/kcompat/uring_cmd.c @@ -0,0 +1,78 @@ +/** + * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with this program; if + * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/* + * Probe for the io_uring uring_cmd *infrastructure* in the exact shape + * slash_qdma.c uses, excluding the SQE payload accessor (that axis changed + * independently and is probed separately by uring_sqe_cmd.c): + * - struct file_operations has a .uring_cmd member, + * - struct io_uring_cmd exposes ->pdu, ->file, and ->cmd_op, + * - io_uring_cmd_complete_in_task() takes a (cmd, issue_flags) callback, + * - io_uring_cmd_done() takes (cmd, ret, res2, issue_flags). + * + * This requires CONFIG_IO_URING and a kernel >= 5.19 with the settled + * (>= 6.1) signatures; anywhere it fails to build, SLASH_HAVE_URING_CMD=n and + * the optional async transfer path is compiled out. The payload pointer is + * read via the SLASH_HAVE_URING_SQE_CMD-selected accessor (see slash_qdma.c): + * io_uring_sqe_cmd(cmd->sqe) on newer kernels, cmd->cmd on older ones. 
+ */ + +#include +#include +#include +#include +#if __has_include() +#include +#endif + +static void conftest_tw(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + io_uring_cmd_done(cmd, 0, 0, issue_flags); +} + +static int conftest_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + void *p = cmd->pdu; + struct file *f = cmd->file; + u32 op = cmd->cmd_op; + + (void)p; + (void)f; + (void)op; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + io_uring_cmd_complete_in_task(cmd, conftest_tw); + return -EIOCBQUEUED; +} + +static const struct file_operations conftest_fops = { + .owner = THIS_MODULE, + .uring_cmd = conftest_uring_cmd, +}; + +static int __init conftest_init(void) +{ + (void)conftest_fops; + return 0; +} + +static void __exit conftest_exit(void) +{ +} + +MODULE_LICENSE("GPL"); +module_init(conftest_init); +module_exit(conftest_exit); diff --git a/driver/kcompat/uring_sqe_cmd.c b/driver/kcompat/uring_sqe_cmd.c new file mode 100644 index 00000000..62020b30 --- /dev/null +++ b/driver/kcompat/uring_sqe_cmd.c @@ -0,0 +1,59 @@ +/** + * Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. + * This program is free software; you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation; version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with this program; if + * not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +/* + * Probe for the *newer* io_uring uring_cmd SQE payload accessor. 
+ * + * Upstream removed `struct io_uring_cmd::cmd` (a const void * pointing at the + * inline SQE command payload) and replaced it with `->sqe` plus the + * io_uring_sqe_cmd() accessor. This change was backported into distro kernels + * (e.g. Ubuntu 6.8), so a LINUX_VERSION_CODE check is unreliable — probe the + * accessor directly instead. + * + * - SLASH_HAVE_URING_SQE_CMD=y -> use io_uring_sqe_cmd(cmd->sqe) + * - SLASH_HAVE_URING_SQE_CMD=n -> fall back to cmd->cmd (older kernels) + * + * This probe only governs the payload accessor; the rest of the uring_cmd + * infrastructure is probed by uring_cmd.c (SLASH_HAVE_URING_CMD). + */ + +#include +#include +#include +#include +#if __has_include() +#include +#endif + +static int conftest_uring_sqe_cmd(struct io_uring_cmd *cmd) +{ + const void *payload = io_uring_sqe_cmd(cmd->sqe); + + (void)payload; + return 0; +} + +static int __init conftest_init(void) +{ + (void)conftest_uring_sqe_cmd; + return 0; +} + +static void __exit conftest_exit(void) +{ +} + +MODULE_LICENSE("GPL"); +module_init(conftest_init); +module_exit(conftest_exit); diff --git a/driver/libslash/README.md b/driver/libslash/README.md index 2315691c..5dc4c07d 100644 --- a/driver/libslash/README.md +++ b/driver/libslash/README.md @@ -108,23 +108,23 @@ uint32_t qid = req.qid; slash_qdma_qpair_start(qdma, qid); -/* Get an ioctl-only qpair fd for registered-buffer transfers. */ +/* Get an ioctl-only qpair fd for buffer transfers. */ int fd = slash_qdma_qpair_get_fd(qdma, qid, O_CLOEXEC); -/* buf must be page-aligned and a whole number of pages */ -uint32_t buf_id; -enum slash_qdma_transfer_hint hint; -slash_qdma_qpair_buffer_register(fd, buf, len, &buf_id, &hint); -/* Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. - * Pass NULL instead of &hint if the application does not care. */ +/* Create a kernel-owned DMA buffer (length must be a whole number of pages) + * and mmap it for CPU access via buf.addr. 
Current SLASH hardware reports + * SLASH_QDMA_TRANSFER_HINT_V80 in buf.transfer_hint. */ +struct slash_qdma_buffer buf; +slash_qdma_qpair_buffer_create(fd, len, &buf); +/* ... fill buf.addr from the CPU for an H2C transfer ... */ /* H2C: host -> device at dev_addr */ -slash_qdma_qpair_transfer(fd, buf_id, /*buf_offset=*/0, dev_addr, len, +slash_qdma_qpair_transfer(fd, buf.fd, /*buf_offset=*/0, dev_addr, len, SLASH_QDMA_XFER_H2C); /* C2H: device -> host */ -slash_qdma_qpair_transfer(fd, buf_id, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); +slash_qdma_qpair_transfer(fd, buf.fd, 0, dev_addr, len, SLASH_QDMA_XFER_C2H); -slash_qdma_qpair_buffer_unregister(fd, buf_id); +slash_qdma_buffer_destroy(&buf); close(fd); slash_qdma_qpair_stop(qdma, qid); diff --git a/driver/libslash/include/slash/qdma.h b/driver/libslash/include/slash/qdma.h index b7ef1531..6f097288 100644 --- a/driver/libslash/include/slash/qdma.h +++ b/driver/libslash/include/slash/qdma.h @@ -31,18 +31,18 @@ * 6. slash_qdma_qpair_del() — destroy * 7. slash_qdma_close() — close the device * - * The fd from qpair_get_fd() is ioctl-only for data movement: register host - * buffers with slash_qdma_qpair_buffer_register() (or through the owning - * control fd), then transfer with slash_qdma_qpair_transfer() / - * slash_qdma_transfer(). read(), write(), mmap(), and poll() are not + * The fd from qpair_get_fd() is ioctl-only for data movement: create kernel + * buffers with slash_qdma_buffer_create() (or slash_qdma_qpair_buffer_create() + * through a queue-pair fd), then move them with slash_qdma_qpair_transfer() / + * slash_qdma_qpair_transfer_batch(). read(), write(), and poll() are not * available for SLASH transfers. * - * Registered buffers: - * For high-throughput transfers, a host buffer can be registered once - * with slash_qdma_buffer_register() (pinning its pages and DMA-mapping - * it), then moved with slash_qdma_transfer() which references the buffer - * by handle instead of re-pinning per call. 
Buffers are owned by the - * open QDMA handle and are auto-released when it is closed. + * Kernel buffers: + * For high-throughput transfers, the kernel allocates a DMA buffer once + * (pages + SGL + DMA mapping built at creation), returns a mappable fd, and + * userspace mmaps it for CPU access. Transfers reference the buffer by its + * fd instead of re-pinning per call. Closing the buffer fd (and unmapping) + * releases it. * * Error conventions: int-returning functions return -1 with errno set. * Pointer-returning functions return NULL with errno set. @@ -161,62 +161,88 @@ int slash_qdma_qpair_del(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); /** - * @brief Register a host buffer for DMA, pinning and DMA-mapping it once. + * @brief Obtain a transfer fd bound to one or more queue pairs. * - * @param qdma Open QDMA handle. - * @param addr Page-aligned host buffer base. - * @param length Buffer length in bytes (non-zero multiple of the page size). - * @param buf_id [out] Receives the kernel-assigned buffer handle. - * @param transfer_hint [out] Optional transfer-topology hint; pass NULL to ignore. + * Like slash_qdma_qpair_get_fd(), but the returned fd is a collection of up to + * SLASH_QDMA_FD_MAX_QPAIRS queue pairs. A transfer issued on the fd selects a + * bound queue pair by its index in @qids, so a single transfer can fan across + * both AXI-MM/NoC channels. Each bound queue pair keeps whatever per-qpair + * settings (mm_channel, ring sizes, directions) it was given at add time. * - * The buffer is owned by @qdma and is automatically released when the - * handle is closed. Pass the returned @buf_id to slash_qdma_transfer(). - * Current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. + * @param qdma Open QDMA handle. + * @param qids Array of @qpair_count queue pair IDs (must be started). + * @param qpair_count Number of entries in @qids (1..SLASH_QDMA_FD_MAX_QPAIRS). 
+ * @param flags Only O_CLOEXEC is accepted. * - * @return 0 on success, -1 on failure (errno set). + * @return Non-negative fd on success, -1 on failure (errno set). + */ +int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids, + uint32_t qpair_count, int flags); + +/** + * @brief A kernel-owned DMA buffer and its CPU mapping. + * + * Created by slash_qdma_buffer_create() / slash_qdma_qpair_buffer_create() and + * released by slash_qdma_buffer_destroy(). @addr is an mmap of the kernel + * buffer fd; write/read it from the CPU and move it with the transfer helpers, + * passing @fd as the sub-transfer's buf_fd. */ -int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint); +struct slash_qdma_buffer { + int fd; /**< Buffer fd (close via destroy). */ + void *addr; /**< CPU mapping of the buffer. */ + uint64_t length; /**< Buffer length in bytes. */ + uint32_t granule; /**< Bytes per DMA descriptor (page). */ + enum slash_qdma_transfer_hint transfer_hint; /**< Advisory channel policy. */ +}; /** - * @brief Unregister a buffer previously registered with - * slash_qdma_buffer_register(). + * @brief Create a kernel-owned DMA buffer and mmap it. * - * @param qdma Open QDMA handle. - * @param buf_id Buffer handle to release. + * Allocates @length bytes of kernel memory (DMA-mapped once), returns a buffer + * fd, and mmaps it into @buf_out->addr for CPU access. The buffer is bound to + * @qdma's device; transfers must use a queue-pair fd of the same device. + * + * @param qdma Open QDMA handle. + * @param length Buffer length in bytes (non-zero multiple of the page size). + * @param buf_out [out] Receives the created buffer (fd, mapping, metadata). * * @return 0 on success, -1 on failure (errno set). 
*/ -int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); +int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length, + struct slash_qdma_buffer *buf_out); /** - * @brief Register a host buffer through a queue-pair fd. + * @brief Create a kernel-owned DMA buffer through a queue-pair fd. * - * Same semantics as slash_qdma_buffer_register(), but issues the registration - * ioctl on @p qpair_fd. This is useful for clients that received only qpair + * Same semantics as slash_qdma_buffer_create(), but issues the create ioctl on + * @p qpair_fd. This is the preferred form for clients that received only qpair * fds via SCM_RIGHTS (for example libvrtd clients). * * @return 0 on success, -1 on failure (errno set). */ -int slash_qdma_qpair_buffer_register(int qpair_fd, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint); +int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length, + struct slash_qdma_buffer *buf_out); /** - * @brief Unregister a buffer through a queue-pair fd. + * @brief Release a buffer created with slash_qdma_buffer_create() or + * slash_qdma_qpair_buffer_create(). + * + * Unmaps @buf->addr and closes @buf->fd. Safe to call on a zeroed/partial + * buffer (fields are reset). * * @return 0 on success, -1 on failure (errno set). */ -int slash_qdma_qpair_buffer_unregister(int qpair_fd, uint32_t buf_id); +int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf); /** - * @brief Perform a DMA transfer using a registered buffer. + * @brief Perform a DMA transfer using a single buffer fd. + * + * Convenience wrapper around slash_qdma_qpair_transfer_batch() for a single + * sub-transfer on qpair_index 0. * - * @param qdma Open QDMA handle (used to dispatch to the mock backend). * @param qpair_fd Queue-pair I/O fd from slash_qdma_qpair_get_fd(). - * @param buf_id Registered buffer handle. - * @param buf_offset Byte offset within the registered buffer. 
+ * @param buf_fd Buffer fd (from slash_qdma_buffer_create()). + * @param buf_offset Byte offset within the buffer. * @param dev_addr Device-side (endpoint) address. * @param length Number of bytes to transfer. * @param direction One of enum slash_qdma_transfer_dir (H2C or C2H). @@ -224,24 +250,28 @@ int slash_qdma_qpair_buffer_unregister(int qpair_fd, uint32_t buf_id); * @return Number of bytes transferred (>= 0) on success, -1 on failure * (errno set). */ -ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, - uint32_t buf_id, uint64_t buf_offset, - uint64_t dev_addr, uint64_t length, - uint32_t direction); +ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd, + uint64_t buf_offset, uint64_t dev_addr, + uint64_t length, uint32_t direction); /** - * @brief Perform a DMA transfer using only a queue-pair fd. + * @brief Perform a batch of buffer DMA sub-transfers in one call. * - * Same transfer ioctl as slash_qdma_transfer(), but without a device handle. - * This is the preferred form for code that received a qpair fd over - * SCM_RIGHTS. + * Issues a single transfer ioctl carrying @count sub-transfers. The kernel + * runs sub-transfers that target distinct queue pairs concurrently, so one + * call can drive both NoC channels in parallel. Each sub-transfer names a + * bound queue pair by index (see slash_qdma_qpair_get_fd_multi()) and a buffer + * by its buf_fd. * - * @return Number of bytes transferred (>= 0) on success, -1 on failure - * (errno set). + * @param qpair_fd Transfer fd from slash_qdma_qpair_get_fd[_multi](). + * @param xfers Array of @count sub-transfer descriptors. + * @param count Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS). + * + * @return Total bytes transferred (>= 0) on success, -1 on failure (errno set). 
 */ -ssize_t slash_qdma_qpair_transfer(int qpair_fd, uint32_t buf_id, - uint64_t buf_offset, uint64_t dev_addr, - uint64_t length, uint32_t direction); +ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd, + const struct slash_qdma_subxfer *xfers, + uint32_t count); #ifdef __cplusplus } /* extern "C" */ diff --git a/driver/libslash/include/slash/uapi/slash_interface.h b/driver/libslash/include/slash/uapi/slash_interface.h index 7a5dfe55..1b1d85cb 100644 --- a/driver/libslash/include/slash/uapi/slash_interface.h +++ b/driver/libslash/include/slash/uapi/slash_interface.h @@ -222,24 +222,45 @@ struct slash_qdma_qpair_op { __u32 op; /**< [in] One of the SLASH_QDMA_QUEUE_OP_* constants. */ }; +/** + * @brief Maximum number of queue pairs a single transfer fd may own. + * + * A transfer fd is a collection of up to this many queue pairs (the intended + * use is one per AXI-MM/NoC channel). A single transfer ioctl issued on the + * fd may fan a buffer transfer across all of them, running up to this many + * hardware DMAs in parallel. Each bound qpair keeps whatever settings it was + * given at SLASH_QDMA_IOCTL_QPAIR_ADD time (mm_channel, ring sizes, etc.), so + * the two channels can be configured independently. + */ +#define SLASH_QDMA_FD_MAX_QPAIRS 2u + /** * @brief Obtain a file descriptor for queue I/O. * - * The returned fd can be used for registered-buffer ioctls to transfer data - * through the queue pair. + * The returned fd is a collection of one or two queue pairs. It can be used + * for kernel-buffer ioctls to transfer data through those queue pairs. + * + * The fd is returned as the ioctl return value (same convention as the BAR fd + * ioctl). Data movement is issued via SLASH_QDMA_QPAIR_IOCTL_TRANSFER, whose + * sub-transfers select a bound queue pair by index and a direction (which must + * have been enabled in \@dir_mask when that queue pair was added). * - * The fd is returned as the ioctl return value (same convention as - * the BAR fd ioctl). 
A single fd is returned per queue pair; - * Data movement is issued via SLASH_QDMA_QPAIR_IOCTL_TRANSFER, using - * whichever directions were enabled in \@dir_mask when the queue pair was - * added. + * Set \@qpair_count to the number of queue pairs to bind and list their IDs in + * \@qpair_ids; the array index becomes the qpair_index used by + * struct slash_qdma_subxfer. For backward compatibility \@qpair_count == 0 + * binds the single queue pair named by \@qid. */ struct slash_qdma_qpair_fd_request { __u32 size; /**< Struct size for ABI versioning. */ /* Userspace to kernel */ - __u32 qid; /**< [in] Queue pair ID. */ + __u32 qid; /**< [in] Legacy single queue pair ID; used only when + * @qpair_count == 0. */ __u32 flags; /**< [in] File descriptor flags. Only O_CLOEXEC is honoured. */ + __u32 qpair_count; /**< [in] Number of valid entries in @qpair_ids + * (1..SLASH_QDMA_FD_MAX_QPAIRS); 0 = use @qid. */ + __u32 qpair_ids[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Queue pair IDs bound to + * this fd; the array index is the qpair_index. */ }; /** @@ -264,70 +285,74 @@ enum slash_qdma_transfer_hint { }; /** - * @brief Register a host buffer for DMA, pinning its pages once. + * @brief Create a kernel-owned DMA buffer and return a mappable fd. * - * The kernel pins the pages backing [user_addr, user_addr + length), - * builds a scatter-gather list, and DMA-maps it once. Subsequent - * transfers reference the buffer by \@buf_id instead of re-pinning and - * re-mapping per transfer. + * The kernel allocates @length bytes of host memory as a set of 4 KiB base + * pages (not physically contiguous), builds the transfer scatter-gather list, + * and DMA-maps every page once. All of this expensive setup happens here, at + * creation time, so the steady-state transfer path only slices the prebuilt + * SGL, syncs the relevant pages, and submits. * - * \@user_addr must be page-aligned and \@length a non-zero multiple of - * the host page size. 
The buffer is backed by 4 KiB base pages, matching - * the transfer data path. + * The new buffer is returned as an fd (via the ioctl return value, same + * convention as the BAR/queue-pair fd ioctls). Userspace maps it with + * mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0) to obtain + * a CPU pointer, and passes @buf_fd in struct slash_qdma_subxfer to move data. + * The pages stay alive as long as either the fd or any mapping exists, and the + * DMA mapping is torn down once both are gone and no transfer is in flight. * - * Buffers are owned by the control-fd open instance they are registered - * through, and are automatically unregistered when that fd is closed - * (including on process exit) if userspace forgets to unregister them. + * The buffer is bound to the QDMA device of the fd it is created on (control + * fd or queue-pair fd); transfers must use a queue-pair fd of the same device. * - * The kernel also returns @transfer_hint. Current SLASH hardware returns - * SLASH_QDMA_TRANSFER_HINT_V80; userspace may ignore this field. + * \@length must be a non-zero multiple of the host page size. The kernel + * returns the page @granule (bytes per descriptor) and a @transfer_hint; + * current SLASH hardware returns SLASH_QDMA_TRANSFER_HINT_V80. */ -struct slash_qdma_buf_register { - __u32 size; /**< Struct size for ABI versioning. */ +struct slash_qdma_buf_create { + __u32 size; /**< Struct size for ABI versioning. */ /* Userspace to kernel */ - __u32 flags; /**< [in] Reserved; must be 0. */ - __u64 user_addr; /**< [in] Page-aligned host buffer base address. */ - __u64 length; /**< [in] Buffer length in bytes (page multiple). */ + __u32 flags; /**< [in] File descriptor flags. Only O_CLOEXEC is honoured. */ + __u64 length; /**< [in] Buffer length in bytes (page multiple). */ /* Kernel to userspace */ - __u32 buf_id; /**< [out] Kernel-assigned buffer handle. */ + __u32 granule; /**< [out] Bytes per SGL descriptor (host page size). 
*/ __u32 transfer_hint; /**< [out] enum slash_qdma_transfer_hint. */ }; /** - * @brief Unregister a previously registered buffer. + * @brief One per-queue-pair sub-transfer within a transfer batch. * - * Removes the buffer from the owning client's lookup table. The pages - * are unpinned and the DMA mapping torn down once no in-flight transfer - * still references the buffer. + * Moves \@length bytes between the kernel buffer named by \@buf_fd at + * \@buf_offset and the device endpoint address \@dev_addr, on the queue pair + * selected by \@qpair_index (an index into the fd's bound qpairs). + * \@buf_offset and \@length must be aligned to the buffer's 4 KiB page granule, + * and \@buf_offset + \@length must not exceed the buffer length. \@direction + * must be one of enum slash_qdma_transfer_dir and must be enabled on the + * selected queue pair. */ -struct slash_qdma_buf_unregister { - __u32 size; /**< Struct size for ABI versioning. */ - __u32 buf_id; /**< [in] Buffer handle from slash_qdma_buf_register. */ +struct slash_qdma_subxfer { + __u32 qpair_index; /**< [in] Index into the fd's bound qpairs. */ + __u32 direction; /**< [in] enum slash_qdma_transfer_dir (H2C or C2H). */ + __s32 buf_fd; /**< [in] Kernel buffer fd from SLASH_QDMA_IOCTL_BUF_CREATE. */ + __u32 pad0; /**< Padding for natural alignment. */ + __u64 buf_offset; /**< [in] Byte offset within the buffer. */ + __u64 dev_addr; /**< [in] Device-side (endpoint) address. */ + __u64 length; /**< [in] Number of bytes to transfer. */ }; /** - * @brief Perform a DMA transfer using a registered buffer. - * - * Issued on a queue-pair I/O fd (from SLASH_QDMA_IOCTL_QPAIR_GET_FD). - * Transfers \@length bytes between the registered buffer at - * \@buf_offset and the device endpoint address \@dev_addr. The number - * of bytes transferred is returned as the ioctl return value. + * @brief Perform one or more buffer DMA sub-transfers in one call. 
* - * \@buf_offset and \@length must be aligned to the registered buffer's - * 4 KiB page granule, and \@buf_offset + \@length must not exceed the - * registered length. \@direction must be one of enum slash_qdma_transfer_dir - * and must be enabled on the queue pair. + * Issued on a queue-pair I/O fd (from SLASH_QDMA_IOCTL_QPAIR_GET_FD). The + * kernel submits all \@count sub-transfers and waits for completion, running + * those that target distinct queue pairs concurrently (so a single syscall can + * drive both NoC channels in parallel). The total number of bytes transferred + * across all sub-transfers is returned as the ioctl return value. */ struct slash_qdma_transfer { __u32 size; /**< Struct size for ABI versioning. */ - __u32 buf_id; /**< [in] Registered buffer handle. */ - __u64 buf_offset; /**< [in] Byte offset within the registered buffer. */ - __u64 dev_addr; /**< [in] Device-side (endpoint) address. */ - __u64 length; /**< [in] Number of bytes to transfer. */ - __u32 direction; /**< [in] enum slash_qdma_transfer_dir (H2C or C2H). */ - __u32 pad0; /**< Padding for natural alignment. */ + __u32 count; /**< [in] Number of sub-transfers (1..SLASH_QDMA_FD_MAX_QPAIRS). */ + struct slash_qdma_subxfer xfers[SLASH_QDMA_FD_MAX_QPAIRS]; /**< [in] Sub-transfers. */ }; /** Query QDMA subsystem capabilities. */ @@ -342,16 +367,30 @@ struct slash_qdma_transfer { /** Obtain an I/O file descriptor for a queue pair. */ #define SLASH_QDMA_IOCTL_QPAIR_GET_FD _IOWR('v', 0x53, struct slash_qdma_qpair_fd_request) -/** Register a host buffer (pin + DMA-map once); returns assigned buf_id. */ -#define SLASH_QDMA_IOCTL_BUF_REGISTER _IOWR('v', 0x54, struct slash_qdma_buf_register) +/** + * Create a kernel-owned DMA buffer (allocate pages + build SGL + DMA-map once); + * returns a mappable buffer fd as the ioctl return value. May be issued on the + * control device or a queue-pair I/O fd. 
+ */ +#define SLASH_QDMA_IOCTL_BUF_CREATE _IOWR('v', 0x54, struct slash_qdma_buf_create) -/** Unregister a previously registered buffer. */ -#define SLASH_QDMA_IOCTL_BUF_UNREGISTER _IOWR('v', 0x55, struct slash_qdma_buf_unregister) +/* 'v' 0x55 is reserved (previously SLASH_QDMA_IOCTL_BUF_UNREGISTER, removed: + * kernel buffers are released by closing their fd). */ /** - * Perform a registered-buffer DMA transfer. Issued on a queue-pair I/O - * fd (not the control device); returns the number of bytes transferred. + * Perform a buffer DMA transfer. Issued on a queue-pair I/O fd (not the + * control device); returns the number of bytes transferred. */ #define SLASH_QDMA_QPAIR_IOCTL_TRANSFER _IOWR('v', 0x56, struct slash_qdma_transfer) +/** + * io_uring command opcode (SQE cmd_op) for an asynchronous buffer transfer + * batch, issued on a queue-pair I/O fd via IORING_OP_URING_CMD. The SQE inline + * command carries a single __u64: the userspace pointer to a struct + * slash_qdma_transfer. The completion CQE res holds the total bytes + * transferred (>= 0) or a negative errno. This path is optional and only + * available on kernels with io_uring uring_cmd support. + */ +#define SLASH_QDMA_URING_CMD_TRANSFER 0x56u + #endif diff --git a/driver/libslash/src/qdma.c b/driver/libslash/src/qdma.c index 6a0606ea..efe8c3d3 100644 --- a/driver/libslash/src/qdma.c +++ b/driver/libslash/src/qdma.c @@ -40,107 +40,177 @@ #include #include +#include +#include -#define QPAIR_FALLBACK_MAX_BUFS 128 - -struct qpair_fallback_buf { - int in_use; - void *addr; - uint64_t length; -}; +/* Bounce-copy chunk used by the @mock transfer fallback. */ +#define QDMA_XFER_BOUNCE_CHUNK (1u << 20) /* - * Small process-local fallback table used only when qpair-fd registration - * ioctls return ENOTTY (the memfd-backed @mock path). Real hardware qpair fds - * implement the ioctl in the kernel and never use this table. + * mmap a buffer fd (kernel buffer or @mock memfd) for CPU access. 
Always + * MAP_SHARED so writes are visible to the kernel/device and to pread/pwrite on + * the same fd. */ -static struct qpair_fallback_buf qpair_fallback_bufs[QPAIR_FALLBACK_MAX_BUFS]; +static int qdma_buffer_mmap(struct slash_qdma_buffer *buf) +{ + void *addr = mmap(NULL, (size_t)buf->length, PROT_READ | PROT_WRITE, + MAP_SHARED, buf->fd, 0); + + if (addr == MAP_FAILED) { + return -1; + } + buf->addr = addr; + return 0; +} -static int qpair_fallback_register(void *addr, uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint) +/* + * @mock / fallback buffer: a memfd sized to @length and mmapped shared. Used + * when the BUF_CREATE ioctl is unavailable (the memfd-backed @mock path). + */ +static int qdma_buffer_create_memfd(uint64_t length, + struct slash_qdma_buffer *buf_out) { - uint32_t i; + int fd; + int saved_errno; - if (addr == NULL || length == 0 || buf_id == NULL) { - errno = EINVAL; + fd = memfd_create("slash_qdma_buf", MFD_CLOEXEC); + if (fd < 0) { + return -1; + } + if (ftruncate(fd, (off_t)length) != 0) { + saved_errno = errno; + (void)close(fd); + errno = saved_errno; return -1; } - for (i = 0; i < QPAIR_FALLBACK_MAX_BUFS; ++i) { - if (!qpair_fallback_bufs[i].in_use) { - qpair_fallback_bufs[i].in_use = 1; - qpair_fallback_bufs[i].addr = addr; - qpair_fallback_bufs[i].length = length; - *buf_id = i; - if (transfer_hint != NULL) { - *transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; - } - return 0; - } + buf_out->fd = fd; + buf_out->length = length; + buf_out->granule = 4096; + buf_out->transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; + buf_out->addr = NULL; + + if (qdma_buffer_mmap(buf_out) != 0) { + saved_errno = errno; + (void)close(fd); + buf_out->fd = -1; + errno = saved_errno; + return -1; } - errno = ENOSPC; - return -1; + return 0; } -static int qpair_fallback_unregister(uint32_t buf_id) +/* + * Create a kernel buffer via the BUF_CREATE ioctl on @ioctl_fd (control fd or + * queue-pair fd), then mmap it. 
Falls back to a memfd buffer when the ioctl is + * not implemented (ENOTTY: the @mock path). + */ +static int qdma_buffer_create_on_fd(int ioctl_fd, uint64_t length, + struct slash_qdma_buffer *buf_out) { - if (buf_id >= QPAIR_FALLBACK_MAX_BUFS || !qpair_fallback_bufs[buf_id].in_use) { - errno = ENOENT; + struct slash_qdma_buf_create req; + int fd; + int saved_errno; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.flags = O_CLOEXEC; + req.length = length; + + fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &req); + if (fd < 0) { + if (errno == ENOTTY) { + return qdma_buffer_create_memfd(length, buf_out); + } + return -1; + } + + buf_out->fd = fd; + buf_out->length = length; + buf_out->granule = req.granule ? req.granule : 4096; + buf_out->transfer_hint = (enum slash_qdma_transfer_hint)req.transfer_hint; + buf_out->addr = NULL; + + if (qdma_buffer_mmap(buf_out) != 0) { + saved_errno = errno; + (void)close(fd); + buf_out->fd = -1; + errno = saved_errno; return -1; } - memset(&qpair_fallback_bufs[buf_id], 0, sizeof(qpair_fallback_bufs[buf_id])); return 0; } -static ssize_t qpair_fallback_transfer(int qpair_fd, uint32_t buf_id, - uint64_t buf_offset, uint64_t dev_addr, - uint64_t length, uint32_t direction) +/* + * @mock transfer fallback: bounce a single sub-transfer between the host buffer + * fd and the queue-pair memfd that stands in for device memory. Only used when + * the transfer ioctl returns ENOTTY. 
+ */ +static ssize_t qdma_fallback_subxfer(int qpair_fd, + const struct slash_qdma_subxfer *x) { - struct qpair_fallback_buf *buf; - char *host; + uint8_t *tmp; uint64_t done = 0; - if (qpair_fd < 0 || buf_id >= QPAIR_FALLBACK_MAX_BUFS || - !qpair_fallback_bufs[buf_id].in_use) { + if (x->buf_fd < 0 || + (x->direction != SLASH_QDMA_XFER_H2C && + x->direction != SLASH_QDMA_XFER_C2H)) { errno = EINVAL; return -1; } - buf = &qpair_fallback_bufs[buf_id]; - if (length == 0 || buf_offset > buf->length || length > buf->length - buf_offset) { - errno = EINVAL; + /* + * For C2H, make sure the device memfd is large enough that reads of + * never-written regions return zeros instead of a short read. Only ever + * grow the file: shrinking would discard data a prior H2C wrote. + */ + if (x->direction == SLASH_QDMA_XFER_C2H) { + struct stat st; + off_t want = (off_t)(x->dev_addr + x->length); + + if (fstat(qpair_fd, &st) == 0 && st.st_size < want) { + (void)ftruncate(qpair_fd, want); + } + } + + tmp = (uint8_t *)malloc(QDMA_XFER_BOUNCE_CHUNK); + if (tmp == NULL) { return -1; } - host = (char *)buf->addr + buf_offset; - while (done < length) { - ssize_t n; + while (done < x->length) { + uint64_t remaining = x->length - done; + size_t chunk = remaining < QDMA_XFER_BOUNCE_CHUNK + ? 
(size_t)remaining : QDMA_XFER_BOUNCE_CHUNK; + ssize_t r; + ssize_t w; - if (direction == SLASH_QDMA_XFER_H2C) { - n = pwrite(qpair_fd, host + done, (size_t)(length - done), - (off_t)(dev_addr + done)); - } else if (direction == SLASH_QDMA_XFER_C2H) { - n = pread(qpair_fd, host + done, (size_t)(length - done), - (off_t)(dev_addr + done)); + if (x->direction == SLASH_QDMA_XFER_H2C) { + r = pread(x->buf_fd, tmp, chunk, (off_t)(x->buf_offset + done)); + if (r <= 0) { + free(tmp); + return -1; + } + w = pwrite(qpair_fd, tmp, (size_t)r, (off_t)(x->dev_addr + done)); } else { - errno = EINVAL; - return -1; + r = pread(qpair_fd, tmp, chunk, (off_t)(x->dev_addr + done)); + if (r <= 0) { + free(tmp); + return -1; + } + w = pwrite(x->buf_fd, tmp, (size_t)r, (off_t)(x->buf_offset + done)); } - if (n < 0) { - if (errno == EINTR) { - continue; - } + if (w != r) { + free(tmp); return -1; } - if (n == 0) { - break; - } - done += (uint64_t)n; + done += (uint64_t)r; } + free(tmp); return (ssize_t)done; } @@ -352,186 +422,156 @@ int slash_qdma_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags) return fd; } -int slash_qdma_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint) +int slash_qdma_qpair_get_fd_multi(struct slash_qdma *qdma, const uint32_t *qids, + uint32_t qpair_count, int flags) { - struct slash_qdma_buf_register req; - int ret; + struct slash_qdma_qpair_fd_request req; + uint32_t i; + int fd; - if (qdma == NULL || addr == NULL || buf_id == NULL) { + if (qdma == NULL || qids == NULL || + qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) { errno = EINVAL; return -1; } if (qdma->priv) { - return slash_qdma_mock_buffer_register(qdma, addr, length, buf_id, - transfer_hint); + return slash_qdma_mock_qpair_get_fd_multi(qdma, qids, qpair_count, + flags); } memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.user_addr = (uint64_t)(uintptr_t)addr; - req.length = 
length; - - ret = ioctl(qdma->fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); - if (ret < 0) { - return -1; + req.size = sizeof(req); + req.flags = flags; + req.qid = qids[0]; + req.qpair_count = qpair_count; + for (i = 0; i < qpair_count; ++i) { + req.qpair_ids[i] = qids[i]; } - *buf_id = req.buf_id; - if (transfer_hint != NULL) { - *transfer_hint = req.transfer_hint; + fd = ioctl(qdma->fd, SLASH_QDMA_IOCTL_QPAIR_GET_FD, &req); + if (fd < 0) { + return -1; } - return 0; + return fd; } -int slash_qdma_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id) +int slash_qdma_buffer_create(struct slash_qdma *qdma, uint64_t length, + struct slash_qdma_buffer *buf_out) { - struct slash_qdma_buf_unregister req; - int ret; - - if (qdma == NULL) { + if (qdma == NULL || buf_out == NULL || length == 0) { errno = EINVAL; return -1; } + /* @mock has no character device: back the buffer with a memfd directly. */ if (qdma->priv) { - return slash_qdma_mock_buffer_unregister(qdma, buf_id); + return qdma_buffer_create_memfd(length, buf_out); } - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.buf_id = buf_id; + return qdma_buffer_create_on_fd(qdma->fd, length, buf_out); +} - ret = ioctl(qdma->fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); - if (ret < 0) { +int slash_qdma_qpair_buffer_create(int qpair_fd, uint64_t length, + struct slash_qdma_buffer *buf_out) +{ + if (qpair_fd < 0 || buf_out == NULL || length == 0) { + errno = EINVAL; return -1; } - return 0; + return qdma_buffer_create_on_fd(qpair_fd, length, buf_out); } -int slash_qdma_qpair_buffer_register(int qpair_fd, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint) +int slash_qdma_buffer_destroy(struct slash_qdma_buffer *buf) { - struct slash_qdma_buf_register req; - int ret; + int ret = 0; - if (qpair_fd < 0 || addr == NULL || buf_id == NULL) { + if (buf == NULL) { errno = EINVAL; return -1; } - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.user_addr = 
(uint64_t)(uintptr_t)addr; - req.length = length; - - ret = ioctl(qpair_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); - if (ret < 0) { - if (errno == ENOTTY) { - return qpair_fallback_register(addr, length, buf_id, transfer_hint); + if (buf->addr != NULL && buf->addr != MAP_FAILED && buf->length != 0) { + if (munmap(buf->addr, (size_t)buf->length) != 0) { + ret = -1; } - return -1; } + buf->addr = NULL; - *buf_id = req.buf_id; - if (transfer_hint != NULL) { - *transfer_hint = req.transfer_hint; + if (buf->fd >= 0) { + if (close(buf->fd) != 0) { + ret = -1; + } + buf->fd = -1; } - return 0; + return ret; } -int slash_qdma_qpair_buffer_unregister(int qpair_fd, uint32_t buf_id) +ssize_t slash_qdma_qpair_transfer_batch(int qpair_fd, + const struct slash_qdma_subxfer *xfers, + uint32_t count) { - struct slash_qdma_buf_unregister req; + struct slash_qdma_transfer req; + uint32_t i; int ret; - if (qpair_fd < 0) { + if (qpair_fd < 0 || xfers == NULL || + count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) { errno = EINVAL; return -1; } memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.buf_id = buf_id; - - ret = ioctl(qpair_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); - if (ret < 0) { - if (errno == ENOTTY) { - return qpair_fallback_unregister(buf_id); + req.size = sizeof(req); + req.count = count; + for (i = 0; i < count; ++i) { + if (xfers[i].direction != SLASH_QDMA_XFER_H2C && + xfers[i].direction != SLASH_QDMA_XFER_C2H) { + errno = EINVAL; + return -1; } - return -1; + req.xfers[i] = xfers[i]; } - return 0; -} - -ssize_t slash_qdma_transfer(struct slash_qdma *qdma, int qpair_fd, - uint32_t buf_id, uint64_t buf_offset, - uint64_t dev_addr, uint64_t length, - uint32_t direction) -{ - struct slash_qdma_transfer req; - int ret; + ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); + if (ret < 0) { + if (errno == ENOTTY) { + /* @mock path: bounce each sub-transfer through the memfds. 
*/ + uint64_t total = 0; - if (qdma == NULL || qpair_fd < 0) { - errno = EINVAL; - return -1; - } + for (i = 0; i < count; ++i) { + ssize_t n = qdma_fallback_subxfer(qpair_fd, &xfers[i]); - if (direction != SLASH_QDMA_XFER_H2C && direction != SLASH_QDMA_XFER_C2H) { - errno = EINVAL; + if (n < 0) { + return -1; + } + total += (uint64_t)n; + } + return (ssize_t)total; + } return -1; } - if (qdma->priv) { - return slash_qdma_mock_transfer(qdma, qpair_fd, buf_id, buf_offset, - dev_addr, length, direction); - } - - return slash_qdma_qpair_transfer(qpair_fd, buf_id, buf_offset, dev_addr, - length, direction); + return (ssize_t)ret; } -ssize_t slash_qdma_qpair_transfer(int qpair_fd, uint32_t buf_id, +ssize_t slash_qdma_qpair_transfer(int qpair_fd, int buf_fd, uint64_t buf_offset, uint64_t dev_addr, uint64_t length, uint32_t direction) { - struct slash_qdma_transfer req; - int ret; + struct slash_qdma_subxfer xfer; - if (qpair_fd < 0) { - errno = EINVAL; - return -1; - } - - if (direction != SLASH_QDMA_XFER_H2C && direction != SLASH_QDMA_XFER_C2H) { - errno = EINVAL; - return -1; - } - - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.buf_id = buf_id; - req.buf_offset = buf_offset; - req.dev_addr = dev_addr; - req.length = length; - req.direction = direction; + memset(&xfer, 0, sizeof(xfer)); + xfer.qpair_index = 0; + xfer.direction = direction; + xfer.buf_fd = buf_fd; + xfer.buf_offset = buf_offset; + xfer.dev_addr = dev_addr; + xfer.length = length; - ret = ioctl(qpair_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); - if (ret < 0) { - if (errno == ENOTTY) { - return qpair_fallback_transfer(qpair_fd, buf_id, buf_offset, - dev_addr, length, direction); - } - return -1; - } - - return (ssize_t)ret; + return slash_qdma_qpair_transfer_batch(qpair_fd, &xfer, 1); } diff --git a/driver/libslash/src/qdma_mock.c b/driver/libslash/src/qdma_mock.c index 394b7487..d72762bb 100644 --- a/driver/libslash/src/qdma_mock.c +++ b/driver/libslash/src/qdma_mock.c @@ -39,7 +39,6 @@ 
#include #define QDMA_MOCK_MAX_QUEUES 64 -#define QDMA_MOCK_MAX_BUFS 64 struct slash_qdma_mock_qpair { bool in_use; @@ -47,15 +46,8 @@ struct slash_qdma_mock_qpair { int fd; /* backing memfd; -1 when slot is free */ }; -struct slash_qdma_mock_buf { - bool in_use; - void *addr; /* host base address */ - uint64_t length; -}; - struct slash_qdma_mock { struct slash_qdma_mock_qpair queues[QDMA_MOCK_MAX_QUEUES]; - struct slash_qdma_mock_buf bufs[QDMA_MOCK_MAX_BUFS]; }; static struct slash_qdma_mock *mock_ctx(struct slash_qdma *qdma) @@ -266,125 +258,43 @@ int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flag return new_fd; } -int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint) +int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma, + const uint32_t *qids, + uint32_t qpair_count, int flags) { struct slash_qdma_mock *ctx; - size_t i; + uint32_t i; + int new_fd; + (void) flags; /* O_CLOEXEC already set on the memfd */ - if (qdma == NULL || addr == NULL || buf_id == NULL || length == 0) { + if (qdma == NULL || qids == NULL || + qpair_count == 0 || qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) { errno = EINVAL; return -1; } ctx = mock_ctx(qdma); - for (i = 0; i < QDMA_MOCK_MAX_BUFS; ++i) { - if (!ctx->bufs[i].in_use) { - break; + for (i = 0; i < qpair_count; ++i) { + if (qids[i] >= QDMA_MOCK_MAX_QUEUES || + !ctx->queues[qids[i]].in_use || !ctx->queues[qids[i]].started) { + errno = EINVAL; + return -1; } } - if (i == QDMA_MOCK_MAX_BUFS) { - errno = ENOSPC; - return -1; - } - - ctx->bufs[i].in_use = true; - ctx->bufs[i].addr = addr; - ctx->bufs[i].length = length; - - *buf_id = (uint32_t) i; - if (transfer_hint != NULL) { - *transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; - } - - return 0; -} - -int slash_qdma_mock_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id) -{ - struct slash_qdma_mock *ctx; - - if (qdma == NULL || buf_id >= 
QDMA_MOCK_MAX_BUFS) { - errno = EINVAL; - return -1; - } - - ctx = mock_ctx(qdma); - - if (!ctx->bufs[buf_id].in_use) { - errno = ENOENT; - return -1; - } - - memset(&ctx->bufs[buf_id], 0, sizeof(ctx->bufs[buf_id])); - - return 0; -} - -ssize_t slash_qdma_mock_transfer(struct slash_qdma *qdma, int qpair_fd, - uint32_t buf_id, uint64_t buf_offset, - uint64_t dev_addr, uint64_t length, - uint32_t direction) -{ - struct slash_qdma_mock *ctx; - struct slash_qdma_mock_buf *buf; - char *host; - uint64_t done = 0; - - if (qdma == NULL || qpair_fd < 0 || buf_id >= QDMA_MOCK_MAX_BUFS) { - errno = EINVAL; - return -1; - } - - ctx = mock_ctx(qdma); - buf = &ctx->bufs[buf_id]; - - if (!buf->in_use) { - errno = ENOENT; - return -1; - } - - if (length == 0 || buf_offset > buf->length || - length > buf->length - buf_offset) { - errno = EINVAL; - return -1; - } - - host = (char *) buf->addr + buf_offset; - /* - * Emulate the device endpoint with the queue's backing memfd: H2C writes - * host data to the memfd at dev_addr, C2H reads it back. Loop to absorb - * short transfers from the underlying file ops. + * The mock backs the device address space with one memfd per queue pair. + * Both NoC channels address the same device memory, so a multi-qpair fd is + * emulated by a single backing store: dup the first queue pair's memfd and + * route every sub-transfer through it. This keeps round-trips consistent + * regardless of which channel a sub-transfer used. 
*/ - while (done < length) { - ssize_t n; - - if (direction == SLASH_QDMA_XFER_H2C) { - n = pwrite(qpair_fd, host + done, (size_t)(length - done), - (off_t)(dev_addr + done)); - } else if (direction == SLASH_QDMA_XFER_C2H) { - n = pread(qpair_fd, host + done, (size_t)(length - done), - (off_t)(dev_addr + done)); - } else { - errno = EINVAL; - return -1; - } - - if (n < 0) { - if (errno == EINTR) { - continue; - } - return -1; - } - if (n == 0) { - break; - } - done += (uint64_t) n; + new_fd = dup(ctx->queues[qids[0]].fd); + if (new_fd < 0) { + return -1; } - return (ssize_t) done; + return new_fd; } + diff --git a/driver/libslash/src/qdma_mock.h b/driver/libslash/src/qdma_mock.h index d30a7294..cd7e54e6 100644 --- a/driver/libslash/src/qdma_mock.h +++ b/driver/libslash/src/qdma_mock.h @@ -35,13 +35,8 @@ int slash_qdma_mock_qpair_start(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_stop(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_del(struct slash_qdma *qdma, uint32_t qid); int slash_qdma_mock_qpair_get_fd(struct slash_qdma *qdma, uint32_t qid, int flags); -int slash_qdma_mock_buffer_register(struct slash_qdma *qdma, void *addr, - uint64_t length, uint32_t *buf_id, - enum slash_qdma_transfer_hint *transfer_hint); -int slash_qdma_mock_buffer_unregister(struct slash_qdma *qdma, uint32_t buf_id); -ssize_t slash_qdma_mock_transfer(struct slash_qdma *qdma, int qpair_fd, - uint32_t buf_id, uint64_t buf_offset, - uint64_t dev_addr, uint64_t length, - uint32_t direction); +int slash_qdma_mock_qpair_get_fd_multi(struct slash_qdma *qdma, + const uint32_t *qids, + uint32_t qpair_count, int flags); #endif /* LIBSLASH_QDMA_MOCK_H */ diff --git a/driver/libslash/tests/qdma_test.cpp b/driver/libslash/tests/qdma_test.cpp index 20ea4221..9519302e 100644 --- a/driver/libslash/tests/qdma_test.cpp +++ b/driver/libslash/tests/qdma_test.cpp @@ -101,36 +101,33 @@ TEST(QdmaNullTest, QpaiGetFd) { EXPECT_EQ(errno, EINVAL); } -TEST(QdmaNullTest, 
BufferRegister) { - uint32_t buf_id = 0; - uint8_t local = 0; +TEST(QdmaNullTest, BufferCreate) { + struct slash_qdma_buffer buf{}; errno = 0; - EXPECT_EQ(slash_qdma_buffer_register(nullptr, &local, 4096, &buf_id, nullptr), -1); + EXPECT_EQ(slash_qdma_buffer_create(nullptr, 4096, &buf), -1); EXPECT_EQ(errno, EINVAL); struct slash_qdma fake{}; fake.fd = -1; errno = 0; - EXPECT_EQ(slash_qdma_buffer_register(&fake, nullptr, 4096, &buf_id, nullptr), -1); + EXPECT_EQ(slash_qdma_buffer_create(&fake, 4096, nullptr), -1); EXPECT_EQ(errno, EINVAL); -} -TEST(QdmaNullTest, BufferUnregister) { errno = 0; - EXPECT_EQ(slash_qdma_buffer_unregister(nullptr, 0), -1); + EXPECT_EQ(slash_qdma_qpair_buffer_create(-1, 4096, &buf), -1); EXPECT_EQ(errno, EINVAL); } -TEST(QdmaNullTest, Transfer) { +TEST(QdmaNullTest, BufferDestroy) { errno = 0; - EXPECT_EQ(slash_qdma_transfer(nullptr, 3, 0, 0, 0, 4096, SLASH_QDMA_XFER_H2C), -1); + EXPECT_EQ(slash_qdma_buffer_destroy(nullptr), -1); EXPECT_EQ(errno, EINVAL); +} - struct slash_qdma fake{}; - fake.fd = -1; +TEST(QdmaNullTest, Transfer) { errno = 0; - /* Invalid direction is rejected before any backend dispatch. */ - EXPECT_EQ(slash_qdma_transfer(&fake, 3, 0, 0, 0, 4096, 0), -1); + /* Invalid qpair fd is rejected. */ + EXPECT_EQ(slash_qdma_qpair_transfer(-1, 4, 0, 0, 4096, SLASH_QDMA_XFER_H2C), -1); EXPECT_EQ(errno, EINVAL); } @@ -192,37 +189,30 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) { int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); ASSERT_GE(queue_fd, 0); - // Write a known pattern to DDR (H2C) through the transfer-only ioctl path. - void *src_mem = nullptr; - void *dst_mem = nullptr; - ASSERT_EQ(posix_memalign(&src_mem, 4096, XFER_SIZE), 0); - ASSERT_EQ(posix_memalign(&dst_mem, 4096, XFER_SIZE), 0); - auto *src = static_cast(src_mem); - auto *dst = static_cast(dst_mem); + // Kernel-owned buffers created through the queue-pair fd. 
+ struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_create(queue_fd, XFER_SIZE, &dst_buf), 0); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); for (size_t i = 0; i < XFER_SIZE; ++i) { src[i] = static_cast(i & 0xFF); } std::memset(dst, 0, XFER_SIZE); - uint32_t src_buf = 0; - uint32_t dst_buf = 0; - ASSERT_EQ(slash_qdma_qpair_buffer_register(queue_fd, src, XFER_SIZE, &src_buf, nullptr), 0); - ASSERT_EQ(slash_qdma_qpair_buffer_register(queue_fd, dst, XFER_SIZE, &dst_buf, nullptr), 0); - ssize_t written = slash_qdma_qpair_transfer( - queue_fd, src_buf, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_H2C); + queue_fd, src_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(written, static_cast(XFER_SIZE)); // Read back from DDR (C2H) and verify. ssize_t read_bytes = slash_qdma_qpair_transfer( - queue_fd, dst_buf, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_C2H); + queue_fd, dst_buf.fd, 0, DDR_BASE_ADDRESS, XFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); - EXPECT_EQ(slash_qdma_qpair_buffer_unregister(queue_fd, src_buf), 0); - EXPECT_EQ(slash_qdma_qpair_buffer_unregister(queue_fd, dst_buf), 0); - free(src_mem); - free(dst_mem); + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); EXPECT_EQ(close(queue_fd), 0); @@ -230,7 +220,7 @@ TEST_P(ParametrizedQdmaTest, QueueDmaTransfer) { EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); } -TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { +TEST_P(ParametrizedQdmaTest, BufferCreateTransfer) { static constexpr size_t XFER_SIZE = 4096; struct slash_qdma_qpair_add req{}; @@ -244,51 +234,132 @@ TEST_P(ParametrizedQdmaTest, RegisteredBufferTransfer) { int queue_fd = slash_qdma_qpair_get_fd(qdma_, qid, 0); 
ASSERT_GE(queue_fd, 0); - // Page-aligned host staging buffers, as registration requires. - void *src_mem = nullptr; - void *dst_mem = nullptr; - ASSERT_EQ(posix_memalign(&src_mem, 4096, XFER_SIZE), 0); - ASSERT_EQ(posix_memalign(&dst_mem, 4096, XFER_SIZE), 0); - auto *src = static_cast(src_mem); - auto *dst = static_cast(dst_mem); + // Kernel-owned buffers created through the control handle. + struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_buffer_create(qdma_, XFER_SIZE, &dst_buf), 0); + EXPECT_EQ(src_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80); + EXPECT_EQ(dst_buf.transfer_hint, SLASH_QDMA_TRANSFER_HINT_V80); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); for (size_t i = 0; i < XFER_SIZE; ++i) { src[i] = static_cast(i & 0xFF); } std::memset(dst, 0, XFER_SIZE); - uint32_t src_buf = 0; - uint32_t dst_buf = 0; - enum slash_qdma_transfer_hint src_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; - enum slash_qdma_transfer_hint dst_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; - ASSERT_EQ(slash_qdma_buffer_register(qdma_, src, XFER_SIZE, &src_buf, &src_hint), 0); - ASSERT_EQ(slash_qdma_buffer_register(qdma_, dst, XFER_SIZE, &dst_buf, &dst_hint), 0); - EXPECT_EQ(src_hint, SLASH_QDMA_TRANSFER_HINT_V80); - EXPECT_EQ(dst_hint, SLASH_QDMA_TRANSFER_HINT_V80); - // H2C: push the source buffer to the device. - ssize_t written = slash_qdma_transfer(qdma_, queue_fd, src_buf, 0, - DDR_BASE_ADDRESS, XFER_SIZE, - SLASH_QDMA_XFER_H2C); + ssize_t written = slash_qdma_qpair_transfer(queue_fd, src_buf.fd, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_H2C); EXPECT_EQ(written, static_cast(XFER_SIZE)); // C2H: pull it back into the destination buffer and verify. 
- ssize_t read_bytes = slash_qdma_transfer(qdma_, queue_fd, dst_buf, 0, - DDR_BASE_ADDRESS, XFER_SIZE, - SLASH_QDMA_XFER_C2H); + ssize_t read_bytes = slash_qdma_qpair_transfer(queue_fd, dst_buf.fd, 0, + DDR_BASE_ADDRESS, XFER_SIZE, + SLASH_QDMA_XFER_C2H); EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); - EXPECT_EQ(slash_qdma_buffer_unregister(qdma_, src_buf), 0); - EXPECT_EQ(slash_qdma_buffer_unregister(qdma_, dst_buf), 0); - - free(src_mem); - free(dst_mem); + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); EXPECT_EQ(close(queue_fd), 0); EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qid), 0); EXPECT_EQ(slash_qdma_qpair_del(qdma_, qid), 0); } +TEST_P(ParametrizedQdmaTest, MultiQpairBatchTransfer) { + // Two 4 KiB halves transferred concurrently across two queue pairs bound to + // a single fd, exercising the get-fd-multi + batch transfer API. + static constexpr size_t HALF = 4096; + static constexpr size_t XFER_SIZE = 2 * HALF; + + uint32_t qids[2] = {0, 0}; + for (int ch = 0; ch < 2; ++ch) { + struct slash_qdma_qpair_add req{}; + req.mode = 0; /* QDMA_Q_MODE_MM */ + req.dir_mask = 0x3; /* H2C | C2H */ + req.mm_channel = static_cast( + ch == 0 ? 
SLASH_QDMA_MM_CHANNEL_0 : SLASH_QDMA_MM_CHANNEL_1); + ASSERT_EQ(slash_qdma_qpair_add(qdma_, &req), 0); + qids[ch] = req.qid; + ASSERT_EQ(slash_qdma_qpair_start(qdma_, qids[ch]), 0); + } + + int fd = slash_qdma_qpair_get_fd_multi(qdma_, qids, 2, 0); + ASSERT_GE(fd, 0); + + struct slash_qdma_buffer src_buf{}; + struct slash_qdma_buffer dst_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &src_buf), 0); + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &dst_buf), 0); + auto *src = static_cast(src_buf.addr); + auto *dst = static_cast(dst_buf.addr); + for (size_t i = 0; i < XFER_SIZE; ++i) { + src[i] = static_cast((i * 7 + 1) & 0xFF); + } + std::memset(dst, 0, XFER_SIZE); + + // H2C: lower half on qpair 0, upper half on qpair 1, in one ioctl. + struct slash_qdma_subxfer h2c[2]{}; + h2c[0] = {0, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF}; + h2c[1] = {1, SLASH_QDMA_XFER_H2C, src_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF}; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, h2c, 2), + static_cast(XFER_SIZE)); + + // C2H: read both halves back across both channels in one ioctl. 
+ struct slash_qdma_subxfer c2h[2]{}; + c2h[0] = {0, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, 0, DDR_BASE_ADDRESS, HALF}; + c2h[1] = {1, SLASH_QDMA_XFER_C2H, dst_buf.fd, 0, HALF, DDR_BASE_ADDRESS + HALF, HALF}; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(fd, c2h, 2), + static_cast(XFER_SIZE)); + + EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); + + EXPECT_EQ(slash_qdma_buffer_destroy(&src_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&dst_buf), 0); + + EXPECT_EQ(close(fd), 0); + for (int ch = 0; ch < 2; ++ch) { + EXPECT_EQ(slash_qdma_qpair_stop(qdma_, qids[ch]), 0); + EXPECT_EQ(slash_qdma_qpair_del(qdma_, qids[ch]), 0); + } +} + +TEST(QdmaNullTest, QpairGetFdMultiInvalid) { + uint32_t qids[2] = {0, 1}; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(nullptr, qids, 2, 0), -1); + EXPECT_EQ(errno, EINVAL); + + struct slash_qdma fake{}; + fake.fd = -1; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 0, 0), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_get_fd_multi(&fake, qids, 3, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST(QdmaNullTest, TransferBatchInvalid) { + struct slash_qdma_subxfer x{}; + x.direction = SLASH_QDMA_XFER_H2C; + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(-1, &x, 1), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, nullptr, 1), -1); + EXPECT_EQ(errno, EINVAL); + + errno = 0; + EXPECT_EQ(slash_qdma_qpair_transfer_batch(3, &x, 0), -1); + EXPECT_EQ(errno, EINVAL); +} + TEST_P(ParametrizedQdmaTest, QueueFdReadWriteRejectedOnHardware) { if (mock) { GTEST_SKIP() << "mock qpair fds are memfds and still support read/write"; diff --git a/driver/slash_qdma.c b/driver/slash_qdma.c index a61a978d..3b63148e 100644 --- a/driver/slash_qdma.c +++ b/driver/slash_qdma.c @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -71,6 +72,31 @@ #include #include +#if defined(SLASH_HAVE_URING_CMD) +#include +#if __has_include() +#include 
+#endif + +/** + * slash_qdma_uring_cmd_payload() - Pointer to a uring_cmd's inline SQE payload. + * @cmd: The io_uring command. + * + * Abstracts the kernel API change that removed struct io_uring_cmd::cmd in + * favour of ->sqe + io_uring_sqe_cmd(). The accessor is selected at build + * time by the kcompat probe (SLASH_HAVE_URING_SQE_CMD); both forms return the + * same inline command payload pointer. + */ +static inline const void *slash_qdma_uring_cmd_payload(struct io_uring_cmd *cmd) +{ +#if defined(SLASH_HAVE_URING_SQE_CMD) + return io_uring_sqe_cmd(cmd->sqe); +#else + return cmd->cmd; +#endif +} +#endif + /* * Direction bitmask constants. * @@ -106,12 +132,10 @@ offsetofend(struct slash_qdma_qpair_op, op) #define SLASH_QDMA_QPAIR_GET_FD_MIN_SIZE \ offsetofend(struct slash_qdma_qpair_fd_request, flags) -#define SLASH_QDMA_BUF_REGISTER_MIN_SIZE \ - offsetofend(struct slash_qdma_buf_register, length) -#define SLASH_QDMA_BUF_UNREGISTER_MIN_SIZE \ - offsetofend(struct slash_qdma_buf_unregister, buf_id) +#define SLASH_QDMA_BUF_CREATE_MIN_SIZE \ + offsetofend(struct slash_qdma_buf_create, length) #define SLASH_QDMA_TRANSFER_MIN_SIZE \ - offsetofend(struct slash_qdma_transfer, direction) + offsetofend(struct slash_qdma_transfer, count) /* * CPM5 Host Profile indirect-context programming. @@ -165,14 +189,6 @@ */ #define SLASH_QDMA_MAX_QPAIRS 256 -/* - * Upper bound on the number of pages pinned per get_user_pages_fast() call when - * mapping a multi-page base-page transfer. Bounds the work done in a single - * GUP call (and keeps the per-call page count within int range) while still - * pinning large buffers in only a handful of iterations. - */ -#define SLASH_QDMA_GUP_BATCH 8192u - /** * SLASH_QDMA_QPAIR_ID_RANGE - XArray allocation range for qpair IDs. * @@ -182,17 +198,6 @@ */ #define SLASH_QDMA_QPAIR_ID_RANGE XA_LIMIT(0, SLASH_QDMA_MAX_QPAIRS - 1) -/** - * SLASH_QDMA_MAX_BUFS - Maximum number of registered DMA buffers per client. 
- * - * Each control-fd open instance gets its own buffer-id space; this bounds - * the xarray allocation range used by SLASH_QDMA_IOCTL_BUF_REGISTER. - */ -#define SLASH_QDMA_MAX_BUFS 4096 - -/** XArray allocation range for registered buffer IDs ([0, 4095]). */ -#define SLASH_QDMA_BUF_ID_RANGE XA_LIMIT(0, SLASH_QDMA_MAX_BUFS - 1) - /* * Debug logging infrastructure. * @@ -219,23 +224,12 @@ #endif /* - * Per-transfer timing instrumentation. + * Per-transfer timing instrumentation (compile-time flag). * - * When SLASH_QDMA_TIMING is non-zero (compile-time flag, e.g. built with - * -DSLASH_QDMA_TIMING=1), slash_qdma_qpair_read_write() emits one dev_info - * line per transfer breaking down the wall-clock cost of the kernel-side - * phases: - * - * - map: pin user pages, validate page shape, build the SGL - * (slash_qdma_map_user_buf_to_sgl()). - * - submit: the whole libqdma qdma_request_submit() call, which covers - * SGL DMA-mapping (IOMMU), descriptor-ring fill, the PIDX - * doorbell, and the synchronous completion wait (HW transfer + - * poll-mode spin). - * - unmap: unpin pages (mark dirty for C2H) and free the SGL. - * - * Timestamps use ktime_get() (CLOCK_MONOTONIC); the reads are cheap, but - * the whole block compiles out entirely when the flag is 0. + * Retained for parity with the userspace SLASH_QDMA_TIMING knob. With the + * kernel-owned buffer model all the expensive setup (page allocation, SGL + * build, DMA mapping) happens once at SLASH_QDMA_IOCTL_BUF_CREATE time, so the + * steady-state transfer cost is dominated by the libqdma submit/completion. */ #ifndef SLASH_QDMA_TIMING #define SLASH_QDMA_TIMING 0 @@ -592,95 +586,57 @@ slash_qdma_qpair_remove(struct slash_qdma_dev *qdma_dev, u32 qid) /** * struct slash_qdma_qpair_file_ctx - Private data for an anon_inode qpair fd. - * @qdma_dev: Back-pointer to the owning QDMA device (ref held). - * @entry: The queue pair entry this fd operates on (ref held). - * @qid: Queue pair ID, cached for debug logging. 
+ * @qdma_dev: Back-pointer to the owning QDMA device (ref held). + * @entries: The queue pair entries this fd operates on (one ref each). + * A transfer sub-transfer's qpair_index selects an entry here. + * @qids: Queue pair IDs, cached for debug logging. + * @n_qpairs: Number of valid entries in @entries / @qids + * (1..SLASH_QDMA_FD_MAX_QPAIRS). * * Allocated in slash_qdma_ioctl_qpair_get_fd_w() and freed in - * slash_qdma_qpair_release(). Both @qdma_dev and @entry have their - * reference counts incremented when the ctx is created, and decremented - * when the fd is closed. + * slash_qdma_qpair_release(). @qdma_dev and each entry have their reference + * counts incremented when the ctx is created, and decremented when the fd is + * closed. */ struct slash_qdma_qpair_file_ctx { struct slash_qdma_dev *qdma_dev; - struct slash_qdma_qpair_entry *entry; - struct slash_qdma_client *client; - u32 qid; + struct slash_qdma_qpair_entry *entries[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 qids[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 n_qpairs; }; /** - * struct slash_qdma_io_cb - I/O control block for a single DMA transfer. - * @buf: User-space buffer address (source for H2C, destination for C2H). - * @len: Transfer length in bytes. - * @pages_nr: Number of user pages pinned by get_user_pages_fast(). - * @sgl: Scatter-gather list of qdma_sw_sg entries, one per pinned page. - * Allocated as a single contiguous block together with @pages. - * @pages: Array of struct page pointers for the pinned user pages. - * Points into the same allocation as @sgl (immediately after it). - * @req: The libqdma request structure submitted to qdma_request_submit(). - * - * This is a stack-local structure (allocated in slash_qdma_qpair_read_write) - * that bundles all per-transfer state. The SGL and page array are heap- - * allocated in slash_qdma_map_user_buf_to_sgl() and freed in - * slash_qdma_iocb_release(). 
- */ -struct slash_qdma_io_cb { - void __user *buf; - size_t len; - unsigned int pages_nr; - struct qdma_sw_sg *sgl; - struct page **pages; - struct qdma_request req; -}; - -/** - * struct slash_qdma_buf - A registered (persistently pinned) host buffer. - * @ref: Reference count. The owning client's xarray holds one ref; - * each in-flight transfer takes a temporary ref so an - * unregister cannot tear the buffer down under active DMA. - * @qdma_dev: Device whose DMA mappings back this buffer (non-owning; the - * owning client holds the device reference). - * @buf_id: Client-scoped handle returned to userspace. - * @length: Registered length in bytes. - * @granule: Bytes per SGL entry (PAGE_SIZE; one 4 KiB base page each). - * Uniform across all entries, so transfer slices can be computed - * by simple division. - * @iocb: Pinned pages and prebuilt scatter-gather list. Each entry's - * dma_addr is filled in once at registration so transfers can - * submit with req->dma_mapped = 1. - * - * Registered buffers amortise the per-transfer cost of pinning pages, - * building the SGL, and programming the IOMMU: that work happens once at - * registration, and every transfer reuses the cached, pre-DMA-mapped SGL. + * struct slash_qdma_buf - A kernel-owned, mmap-able DMA buffer. + * @ref: Reference count. The buffer fd holds one ref, each live VMA + * (mmap) holds one ref, and each in-flight transfer holds a + * temporary ref so a close cannot tear the buffer down under + * active DMA or while userspace still has it mapped. + * @qdma_dev: Device whose DMA mappings back this buffer (holds a device + * reference for the lifetime of the buffer object). + * @length: Buffer length in bytes (a multiple of @granule). + * @granule: Bytes per SGL entry / page (PAGE_SIZE). Uniform across all + * entries, so transfer slices are computed by simple division. + * @pages_nr: Number of base pages backing the buffer (length / granule). 
+ * @pages: Array of @pages_nr kernel pages (alloc_page()), not physically + * contiguous. Used both for the CPU mmap and the DMA SGL. + * @sgl: Prebuilt scatter-gather list, one entry per page, each with its + * dma_addr filled in once at creation so transfers submit with + * req->dma_mapped = 1. + * @dma_mapped: True once @sgl entries have been DMA-mapped. + * + * All expensive setup (page allocation, SGL construction, DMA mapping) happens + * once at creation; the transfer fast path only slices @sgl, syncs the touched + * pages, and submits. */ struct slash_qdma_buf { struct kref ref; struct slash_qdma_dev *qdma_dev; - u32 buf_id; u64 length; - u64 granule; - struct slash_qdma_io_cb iocb; -}; - -/** - * struct slash_qdma_client - Per-open state for the QDMA control device. - * @ref: Reference count. The control fd holds the initial ref; each - * qpair I/O fd handed out via QPAIR_GET_FD takes another so that - * handle-based transfers can resolve buffer IDs even if the - * control fd is closed first. - * @qdma_dev: Owning QDMA device (holds a device reference). - * @buffers: XArray mapping buf_id -> &struct slash_qdma_buf. Buffers are - * owned by this client and auto-freed when the control fd closes. - * - * Replaces the bare device pointer previously stored in the control fd's - * file->private_data. Tying registered buffers to this per-open context - * makes cleanup automatic: if userspace exits or is killed without - * unregistering, the control fd release path drops every buffer. 
- */ -struct slash_qdma_client { - struct kref ref; - struct slash_qdma_dev *qdma_dev; - struct xarray buffers; + u32 granule; + unsigned int pages_nr; + struct page **pages; + struct qdma_sw_sg *sgl; + bool dma_mapped; }; /* ───────────────────────────────────────────────────────────────────── @@ -724,33 +680,38 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev, const char *op_name, bool stop_on_err); static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, - struct slash_qdma_client *client, + struct slash_qdma_dev *qdma_dev, void __user *uarg); -static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, - struct slash_qdma_client *client, - void __user *uarg); -static int slash_qdma_ioctl_buf_unregister_w(struct miscdevice *misc, - struct slash_qdma_client *client, - void __user *uarg); +static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc, + struct slash_qdma_dev *qdma_dev, + void __user *uarg); static void slash_qdma_buf_release(struct kref *ref); static void slash_qdma_buf_put(struct slash_qdma_buf *buf); -static void slash_qdma_client_release(struct kref *ref); static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg); static int slash_qdma_qpair_release(struct inode *inode, struct file *file); static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#if defined(SLASH_HAVE_URING_CMD) +static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags); +#endif /** * slash_qdma_qpair_fops - File operations for per-qpair anon_inode fds. * - * ioctl performs registered-buffer transfers and buffer registration - * operations that share the owning control fd's client context. - * release drops the refs on the qpair entry and device. + * ioctl performs buffer DMA transfers and buffer creation for clients that + * only hold a queue-pair fd. 
+ * uring_cmd (optional) is the asynchronous equivalent of the transfer ioctl, + * available only on kernels with io_uring uring_cmd support. + * release drops the refs on the bound qpair entries and device. */ static const struct file_operations slash_qdma_qpair_fops = { .owner = THIS_MODULE, .unlocked_ioctl = slash_qdma_qpair_ioctl, +#if defined(SLASH_HAVE_URING_CMD) + .uring_cmd = slash_qdma_qpair_uring_cmd, +#endif .release = slash_qdma_qpair_release, }; @@ -1672,16 +1633,14 @@ static void slash_qdma_conf_options(struct qdma_dev_conf *conf, struct pci_dev * */ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned long arg) { - struct slash_qdma_client *client = file->private_data; - struct slash_qdma_dev *qdma_dev; + struct slash_qdma_dev *qdma_dev = file->private_data; struct miscdevice *misc; void __user *uarg = (void __user *)arg; long ret = 0; - if (!client || !client->qdma_dev) + if (!qdma_dev) return -ENODEV; - qdma_dev = client->qdma_dev; misc = &qdma_dev->misc; SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, "ioctl op=0x%x\n", op); @@ -1708,15 +1667,11 @@ static long slash_qdma_fop_ioctl(struct file *file, unsigned int op, unsigned lo break; case SLASH_QDMA_IOCTL_QPAIR_GET_FD: - ret = slash_qdma_ioctl_qpair_get_fd_w(misc, client, uarg); + ret = slash_qdma_ioctl_qpair_get_fd_w(misc, qdma_dev, uarg); break; - case SLASH_QDMA_IOCTL_BUF_REGISTER: - ret = slash_qdma_ioctl_buf_register_w(misc, client, uarg); - break; - - case SLASH_QDMA_IOCTL_BUF_UNREGISTER: - ret = slash_qdma_ioctl_buf_unregister_w(misc, client, uarg); + case SLASH_QDMA_IOCTL_BUF_CREATE: + ret = slash_qdma_ioctl_buf_create_w(misc, qdma_dev, uarg); break; default: @@ -1744,7 +1699,6 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) struct miscdevice *misc = file->private_data; struct slash_qdma_dev *qdma_dev = container_of(misc, struct slash_qdma_dev, misc); - struct slash_qdma_client *client; mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown 
|| !qdma_dev->have_qdma_handle) { @@ -1754,22 +1708,7 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) kref_get(&qdma_dev->ref); mutex_unlock(&qdma_dev->lock); - /* - * Allocate a per-open client context to own any buffers registered - * through this fd. The control fd holds the initial client ref; it - * is dropped (and the buffers torn down) in slash_qdma_fop_release(). - */ - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (!client) { - kref_put(&qdma_dev->ref, slash_qdma_dev_release); - return -ENOMEM; - } - - kref_init(&client->ref); - client->qdma_dev = qdma_dev; - xa_init_flags(&client->buffers, XA_FLAGS_ALLOC); - - file->private_data = client; + file->private_data = qdma_dev; return 0; } @@ -1786,12 +1725,12 @@ static int slash_qdma_fop_open(struct inode *inode, struct file *file) */ static int slash_qdma_fop_release(struct inode *inode, struct file *file) { - struct slash_qdma_client *client = file->private_data; + struct slash_qdma_dev *qdma_dev = file->private_data; - if (!client) + if (!qdma_dev) return 0; - kref_put(&client->ref, slash_qdma_client_release); + kref_put(&qdma_dev->ref, slash_qdma_dev_release); file->private_data = NULL; @@ -2503,279 +2442,233 @@ static int slash_qdma_ioctl_qpair_op_apply(struct slash_qdma_dev *qdma_dev, } /* ───────────────────────────────────────────────────────────────────── - * DMA I/O: user buffer mapping, SGL construction, and transfer + * Kernel DMA buffers: page allocation, SGL, DMA mapping, mmap + * + * A buffer owns a set of individually-allocated 4 KiB base pages (not + * physically contiguous). At creation time the pages are allocated, a + * one-descriptor-per-page SGL is built, and every page is DMA-mapped once; + * the steady-state transfer path then only slices the SGL, syncs the touched + * pages for the relevant DMA direction, and submits. 
The same pages are also + * exposed to userspace through the buffer fd's mmap, so the CPU and the DMA + * engine share one allocation, coherent only at the transfer boundaries. * ───────────────────────────────────────────────────────────────────── */ /** - * slash_qdma_iocb_release() - Free resources in an I/O control block. - * @iocb: The IOCB to clean up. + * slash_qdma_buf_dma_unmap() - Tear down the cached DMA mapping of a buffer. + * @buf: Buffer whose SGL entries were DMA-mapped. * - * Frees the combined SGL + page-pointer allocation and clears the - * pointers. Does not unpin pages — that must be done separately via - * slash_qdma_unmap_user_buf() before calling this. + * Unmaps every SGL entry that carries a non-zero dma_addr and clears it. + * Safe to call on a partially-mapped buffer (used on the create error path). */ -static inline void slash_qdma_iocb_release(struct slash_qdma_io_cb *iocb) +static void slash_qdma_buf_dma_unmap(struct slash_qdma_buf *buf) { - if (iocb->pages) - iocb->pages = NULL; + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; - kvfree(iocb->sgl); - iocb->sgl = NULL; - iocb->buf = NULL; + if (!buf->sgl || !buf->dma_mapped) + return; + + for (i = 0; i < buf->pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->sgl[i]; + + if (sg->dma_addr) { + dma_unmap_page(dev, sg->dma_addr, sg->len, DMA_BIDIRECTIONAL); + sg->dma_addr = 0UL; + } + } + + buf->dma_mapped = false; } /** - * slash_qdma_unmap_user_buf() - Unpin user pages after a DMA transfer. - * @iocb: I/O control block with pinned pages. - * @write: Transfer direction from the device's perspective. If false - * (i.e., a C2H/read transfer), the pages were written to by the - * device and must be marked dirty so the VM knows the page - * contents have changed. + * slash_qdma_buf_dma_map() - DMA-map every SGL entry of a buffer. + * @buf: Buffer with a freshly built SGL. 
+ * + * Maps each page with DMA_BIDIRECTIONAL so the same cached mapping serves both + * H2C and C2H transfers. On any failure all previously mapped entries are + * unmapped before returning. * - * Iterates over pinned pages, marks them dirty if this was a read (C2H) - * transfer (because the device wrote data into those user pages), and - * releases each page reference acquired by get_user_pages_fast(). + * Return: 0 on success, negative errno on failure. */ -static void slash_qdma_unmap_user_buf(struct slash_qdma_io_cb *iocb, bool write) +static int slash_qdma_buf_dma_map(struct slash_qdma_buf *buf) { - int i; + struct device *dev = &buf->qdma_dev->pdev->dev; + unsigned int i; - if (!iocb->pages || !iocb->pages_nr) - return; + for (i = 0; i < buf->pages_nr; i++) { + struct qdma_sw_sg *sg = &buf->sgl[i]; - for (i = 0; i < iocb->pages_nr; i++) { - if (iocb->pages[i]) { - /* - * For C2H (read) transfers (!write), the device wrote into - * these user pages, so mark them dirty to inform the VM. - */ - if (!write) - set_page_dirty(iocb->pages[i]); - put_page(iocb->pages[i]); - } else { - break; + sg->dma_addr = dma_map_page(dev, sg->pg, sg->offset, sg->len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, sg->dma_addr)) { + sg->dma_addr = 0UL; + pr_err("slash: qdma: buffer DMA map failed at entry %u/%u\n", + i, buf->pages_nr); + buf->dma_mapped = true; /* allow unmap of the entries done so far */ + slash_qdma_buf_dma_unmap(buf); + return -ENOMEM; } } - if (i != iocb->pages_nr) - pr_err("slash: qdma: sgl pages %d/%u.\n", i, iocb->pages_nr); - - iocb->pages_nr = 0; + buf->dma_mapped = true; + return 0; } -static int slash_qdma_iocb_alloc_sgl(struct slash_qdma_io_cb *iocb, - unsigned int entries) +/** + * slash_qdma_buf_free_pages() - Free a buffer's pages and SGL. + * @buf: Buffer to tear down. + * + * Releases each allocated page (put_page() so pages still mapped into a VMA + * stay alive until the last mapping is torn down) and frees the SGL/page + * arrays. 
The DMA mapping must already have been removed. + */ +static void slash_qdma_buf_free_pages(struct slash_qdma_buf *buf) { - size_t entry_size = sizeof(struct qdma_sw_sg) + sizeof(struct page *); - struct qdma_sw_sg *sg; - - if (!entries || entries > SIZE_MAX / entry_size) - return -EINVAL; + unsigned int i; - /* - * A large base-page transfer needs one entry per 4 KiB page (e.g. ~5 MiB - * of SGL for a 512 MiB transfer), which exceeds kmalloc's limit, so use - * kvcalloc(). The SGL is only ever touched by the CPU (libqdma DMA-maps - * the pages it references), so a vmalloc-backed allocation is fine. - */ - sg = kvcalloc(entries, entry_size, GFP_KERNEL); - if (!sg) { - pr_err("slash: qdma: sgl allocation failed for %u entries\n", - entries); - return -ENOMEM; + if (buf->pages) { + for (i = 0; i < buf->pages_nr; i++) { + if (buf->pages[i]) + put_page(buf->pages[i]); + } } - iocb->sgl = sg; - iocb->pages = (struct page **)(sg + entries); - return 0; -} - -static bool slash_qdma_page_is_base_page(struct page *page) -{ - return !PageCompound(page); + kvfree(buf->pages); + buf->pages = NULL; + kvfree(buf->sgl); + buf->sgl = NULL; + buf->pages_nr = 0; } -static int slash_qdma_map_user_base_pages_to_sgl(struct slash_qdma_io_cb *iocb, - bool write) +/** + * slash_qdma_buf_alloc() - Allocate pages, build the SGL, and DMA-map. + * @buf: Buffer with @length and @qdma_dev set; @granule defaults to PAGE_SIZE. + * + * Allocates @length / PAGE_SIZE individual base pages (not contiguous), builds + * a one-page-per-entry SGL, and DMA-maps every page. All of this is the + * amortised, do-it-once setup cost paid by SLASH_QDMA_IOCTL_BUF_CREATE. + * + * Return: 0 on success, negative errno on failure (partial state cleaned up). 
+ */ +static int slash_qdma_buf_alloc(struct slash_qdma_buf *buf) { - unsigned long addr = (unsigned long)iocb->buf; - size_t entries = iocb->len / PAGE_SIZE; - unsigned int pinned = 0; + size_t entries = buf->length / PAGE_SIZE; unsigned int i; int rv; - if ((iocb->len % PAGE_SIZE) != 0 || entries == 0 || entries > UINT_MAX) + if (buf->length == 0 || (buf->length % PAGE_SIZE) != 0 || + entries == 0 || entries > UINT_MAX) return -EINVAL; - rv = slash_qdma_iocb_alloc_sgl(iocb, (unsigned int)entries); - if (rv) - return rv; + buf->granule = PAGE_SIZE; + buf->pages_nr = (unsigned int)entries; - /* - * Pin every base page in the span. get_user_pages_fast() may return - * fewer pages than requested, so loop (in bounded batches) until the - * whole buffer is pinned. - */ - while (pinned < entries) { - unsigned int want = min_t(unsigned int, - (unsigned int)entries - pinned, - SLASH_QDMA_GUP_BATCH); - int got = get_user_pages_fast(addr + (size_t)pinned * PAGE_SIZE, - (int)want, 1 /* write */, - iocb->pages + pinned); - - if (got <= 0) { - pr_err("slash: qdma: unable to pin 4 KiB user pages %u/%zu, %d\n", - pinned, entries, got); - rv = (got < 0) ? got : -EFAULT; - goto err_out; - } + buf->pages = kvcalloc(entries, sizeof(*buf->pages), GFP_KERNEL); + if (!buf->pages) + return -ENOMEM; - pinned += (unsigned int)got; - iocb->pages_nr = pinned; + buf->sgl = kvcalloc(entries, sizeof(*buf->sgl), GFP_KERNEL); + if (!buf->sgl) { + kvfree(buf->pages); + buf->pages = NULL; + return -ENOMEM; } for (i = 0; i < entries; i++) { - struct qdma_sw_sg *sg = &iocb->sgl[i]; + struct page *pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + struct qdma_sw_sg *sg = &buf->sgl[i]; - if (!slash_qdma_page_is_base_page(iocb->pages[i])) { - pr_err("slash: qdma: 4 KiB transfer page %u/%zu is not backed by a base page\n", - i, entries); - rv = -EINVAL; - goto err_out; + if (!pg) { + rv = -ENOMEM; + goto err_free; } - flush_dcache_page(iocb->pages[i]); - - sg->next = (i + 1 < entries) ? 
&iocb->sgl[i + 1] : NULL; - sg->pg = iocb->pages[i]; + buf->pages[i] = pg; + sg->next = (i + 1 < entries) ? &buf->sgl[i + 1] : NULL; + sg->pg = pg; sg->offset = 0; sg->len = PAGE_SIZE; sg->dma_addr = 0UL; } - SLASH_QDMA_OP_LOG("user transfer path=base-4k addr=0x%lx len=%zu pages=%zu write=%d\n", - addr, iocb->len, entries, write); + rv = slash_qdma_buf_dma_map(buf); + if (rv < 0) + goto err_free; return 0; -err_out: - slash_qdma_unmap_user_buf(iocb, write); - slash_qdma_iocb_release(iocb); +err_free: + slash_qdma_buf_free_pages(buf); return rv; } /** - * slash_qdma_map_user_buf_to_sgl() - Pin a user buffer and build its SGL. - * @iocb: I/O control block. @iocb->buf and @iocb->len must be set. - * @write: Transfer direction (true = H2C write, false = C2H read). - * - * The buffer must be page-aligned and a whole number of 4 KiB pages, and is - * mapped as a span of 4 KiB base pages: each page becomes one SGL entry / one - * DMA descriptor, and the whole span is submitted to libqdma as a single - * request. - * - * Return: 0 on success, negative errno on failure. - */ -static int slash_qdma_map_user_buf_to_sgl(struct slash_qdma_io_cb *iocb, - bool write) -{ - unsigned long addr = (unsigned long)iocb->buf; - size_t len = iocb->len; - - iocb->pages_nr = 0; - - if (!addr || !len || addr > ULONG_MAX - len) - return -EINVAL; - - if (!IS_ALIGNED(addr, PAGE_SIZE) || (len % PAGE_SIZE) != 0) { - pr_err("slash: qdma: unsupported user transfer addr=0x%lx len=%zu (must be page-aligned and a multiple of 4 KiB)\n", - addr, len); - return -EINVAL; - } - - return slash_qdma_map_user_base_pages_to_sgl(iocb, write); -} - -/* ───────────────────────────────────────────────────────────────────── - * Registered buffers: persistent pin + DMA mapping - * ───────────────────────────────────────────────────────────────────── */ - -/** - * slash_qdma_buf_dma_unmap() - Tear down the cached DMA mapping of a buffer. - * @buf: Registered buffer whose SGL entries were DMA-mapped. 
+ * slash_qdma_buf_sync_for_device() - Hand a transfer slice to the device. + * @buf: Buffer being transferred. + * @start_entry: First page index of the slice. + * @n_entries: Number of pages in the slice. + * @dir: DMA direction (DMA_TO_DEVICE for H2C, DMA_FROM_DEVICE for C2H). * - * Unmaps every SGL entry that carries a non-zero dma_addr and clears it. - * Safe to call on a partially-mapped buffer (used on the registration - * error path). + * Synchronises CPU-written data out to the device (and/or invalidates CPU + * caches) for exactly the pages a sub-transfer touches. On cache-coherent + * hosts these are no-ops; on others they bound coherency to the transfer. */ -static void slash_qdma_buf_dma_unmap(struct slash_qdma_buf *buf) +static void slash_qdma_buf_sync_for_device(struct slash_qdma_buf *buf, + u64 start_entry, u64 n_entries, + enum dma_data_direction dir) { struct device *dev = &buf->qdma_dev->pdev->dev; - unsigned int i; - - if (!buf->iocb.sgl) - return; + u64 i; - for (i = 0; i < buf->iocb.pages_nr; i++) { - struct qdma_sw_sg *sg = &buf->iocb.sgl[i]; + for (i = 0; i < n_entries; i++) { + struct qdma_sw_sg *sg = &buf->sgl[start_entry + i]; - if (sg->dma_addr) { - dma_unmap_page(dev, sg->dma_addr, sg->len, DMA_BIDIRECTIONAL); - sg->dma_addr = 0UL; - } + dma_sync_single_for_device(dev, sg->dma_addr, sg->len, dir); } } /** - * slash_qdma_buf_dma_map() - DMA-map every SGL entry of a registered buffer. - * @buf: Registered buffer with a freshly built (pinned) SGL. - * - * Maps each entry with DMA_BIDIRECTIONAL so the same cached mapping serves - * both H2C and C2H transfers. On any failure all previously mapped entries - * are unmapped before returning. + * slash_qdma_buf_sync_for_cpu() - Reclaim a transfer slice for the CPU. + * @buf: Buffer being transferred. + * @start_entry: First page index of the slice. + * @n_entries: Number of pages in the slice. + * @dir: DMA direction (DMA_FROM_DEVICE for a completed C2H read). 
* - * Return: 0 on success, negative errno on failure. + * Makes device-written data visible to the CPU for exactly the pages a C2H + * sub-transfer touched. Called after the transfer completes. */ -static int slash_qdma_buf_dma_map(struct slash_qdma_buf *buf) +static void slash_qdma_buf_sync_for_cpu(struct slash_qdma_buf *buf, + u64 start_entry, u64 n_entries, + enum dma_data_direction dir) { struct device *dev = &buf->qdma_dev->pdev->dev; - unsigned int i; + u64 i; - for (i = 0; i < buf->iocb.pages_nr; i++) { - struct qdma_sw_sg *sg = &buf->iocb.sgl[i]; + for (i = 0; i < n_entries; i++) { + struct qdma_sw_sg *sg = &buf->sgl[start_entry + i]; - sg->dma_addr = dma_map_page(dev, sg->pg, sg->offset, sg->len, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, sg->dma_addr)) { - sg->dma_addr = 0UL; - pr_err("slash: qdma: buffer DMA map failed at entry %u/%u\n", - i, buf->iocb.pages_nr); - slash_qdma_buf_dma_unmap(buf); - return -ENOMEM; - } + dma_sync_single_for_cpu(dev, sg->dma_addr, sg->len, dir); } - - return 0; } /** - * slash_qdma_buf_release() - kref release callback for a registered buffer. + * slash_qdma_buf_release() - kref release callback for a buffer. * @ref: kref embedded in the slash_qdma_buf being freed. * - * Runs when the last reference drops (table ref plus any in-flight transfer - * refs). Tears down the DMA mapping, unpins the pages (marking them dirty in - * case a C2H transfer wrote into them), frees the SGL, and frees the struct. + * Runs when the last reference drops (fd ref, every live VMA ref, and any + * in-flight transfer ref). Tears down the DMA mapping, frees the pages and + * SGL, drops the device reference, and frees the struct. */ static void slash_qdma_buf_release(struct kref *ref) { struct slash_qdma_buf *buf = container_of(ref, struct slash_qdma_buf, ref); + struct slash_qdma_dev *qdma_dev = buf->qdma_dev; slash_qdma_buf_dma_unmap(buf); - /* write=false marks the pages dirty: a C2H transfer may have written. 
*/ - slash_qdma_unmap_user_buf(&buf->iocb, false); - slash_qdma_iocb_release(&buf->iocb); + slash_qdma_buf_free_pages(buf); + if (qdma_dev) + kref_put(&qdma_dev->ref, slash_qdma_dev_release); kfree(buf); } @@ -2789,97 +2682,171 @@ static void slash_qdma_buf_put(struct slash_qdma_buf *buf) kref_put(&buf->ref, slash_qdma_buf_release); } +/* ───────────────────────────────────────────────────────────────────── + * Buffer fd: mmap support and lifetime + * ───────────────────────────────────────────────────────────────────── */ + /** - * slash_qdma_client_release() - kref release callback for a control-fd client. - * @ref: kref embedded in the slash_qdma_client being freed. + * slash_qdma_buf_vm_open() - VMA open callback (fork / VMA split). + * @vma: The VMA gaining an independent reference. * - * Runs when the control fd and all qpair fds derived from it have closed. - * By this point the buffer table has already been drained in - * slash_qdma_fop_release(); here we just release the device reference and - * free the client. + * Each live VMA holds one buffer reference so the pages (and DMA mapping) + * outlive the buffer fd if userspace keeps the mapping after close(). */ -static void slash_qdma_client_release(struct kref *ref) +static void slash_qdma_buf_vm_open(struct vm_area_struct *vma) { - struct slash_qdma_client *client = - container_of(ref, struct slash_qdma_client, ref); - struct slash_qdma_buf *buf; - unsigned long index; + struct slash_qdma_buf *buf = vma->vm_private_data; + + if (buf) + slash_qdma_buf_get(buf); +} + +/** + * slash_qdma_buf_vm_close() - VMA close callback (munmap / exit). + * @vma: The VMA being torn down. 
+ */ +static void slash_qdma_buf_vm_close(struct vm_area_struct *vma) +{ + struct slash_qdma_buf *buf = vma->vm_private_data; + + if (buf) + slash_qdma_buf_put(buf); +} + +static const struct vm_operations_struct slash_qdma_buf_vm_ops = { + .open = slash_qdma_buf_vm_open, + .close = slash_qdma_buf_vm_close, +}; + +/** + * slash_qdma_buf_mmap() - mmap a kernel buffer's pages into userspace. + * @file: The buffer fd. + * @vma: The mapping request. + * + * Maps the whole buffer (offset 0, full length) into the calling process. + * The pages are ordinary kernel pages, so vm_map_pages_zero() inserts them + * directly; each VMA takes a buffer reference (initial one here, duplicated by + * the .open callback) so the pages stay valid for the life of the mapping. + * + * Return: 0 on success, negative errno on failure. + */ +static int slash_qdma_buf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct slash_qdma_buf *buf = file->private_data; + unsigned long span = vma->vm_end - vma->vm_start; + int rv; + + if (!buf) + return -ENODEV; + + /* Only a full, offset-0 mapping of the buffer is supported. */ + if (vma->vm_pgoff != 0) + return -EINVAL; + if (span != (unsigned long)buf->length) + return -EINVAL; /* - * Auto-unregister any buffers the client forgot (or had no chance) to - * release. This runs when the final fd referencing the shared client - * context closes (control fd or any derived qpair fd), so registrations - * remain usable after the control fd closes as long as a qpair fd is - * still alive. + * Normal page mapping (no VM_PFNMAP): keep it from being expanded beyond + * the buffer and excluded from core dumps. 
*/ - xa_for_each(&client->buffers, index, buf) { - xa_erase(&client->buffers, index); - slash_qdma_buf_put(buf); - } - xa_destroy(&client->buffers); - if (client->qdma_dev) - kref_put(&client->qdma_dev->ref, slash_qdma_dev_release); - kfree(client); + slash_vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); + + rv = vm_map_pages_zero(vma, buf->pages, buf->pages_nr); + if (rv) + return rv; + + vma->vm_ops = &slash_qdma_buf_vm_ops; + vma->vm_private_data = buf; + slash_qdma_buf_get(buf); /* dropped by vm_close when this VMA goes away */ + + return 0; } /** - * slash_qdma_buf_lookup_get() - Look up a buffer by id and take a ref. - * @client: Owning client context. - * @buf_id: Buffer handle. + * slash_qdma_buf_fop_release() - Release callback for a buffer fd. + * @inode: Unused (anon inode). + * @file: The buffer fd being closed. * - * Returns the buffer with an extra reference held, or NULL if no such - * buffer exists. The xa_lock serialises against unregister/teardown so the - * buffer cannot be freed between lookup and the kref_get. + * Drops the fd's buffer reference. Pages survive until any remaining VMA + * references are dropped too. + * + * Return: Always 0. */ -static struct slash_qdma_buf * -slash_qdma_buf_lookup_get(struct slash_qdma_client *client, u32 buf_id) +static int slash_qdma_buf_fop_release(struct inode *inode, struct file *file) { - struct slash_qdma_buf *buf; + struct slash_qdma_buf *buf = file->private_data; - xa_lock(&client->buffers); - buf = xa_load(&client->buffers, buf_id); - if (buf) - slash_qdma_buf_get(buf); - xa_unlock(&client->buffers); + (void)inode; - return buf; + if (buf) { + slash_qdma_buf_put(buf); + file->private_data = NULL; + } + + return 0; +} + +/** + * slash_qdma_buf_fops - File operations for buffer fds. + * + * mmap maps the buffer's pages for CPU access. + * release drops the fd's reference on the buffer. 
+ */ +static const struct file_operations slash_qdma_buf_fops = { + .owner = THIS_MODULE, + .mmap = slash_qdma_buf_mmap, + .release = slash_qdma_buf_fop_release, +}; + +/** + * slash_qdma_buf_from_file() - Resolve a buffer fd to its buffer object. + * @file: A file obtained from fget() on a candidate buffer fd. + * + * Return: The buffer if @file is a SLASH buffer fd, else NULL. + */ +static struct slash_qdma_buf *slash_qdma_buf_from_file(struct file *file) +{ + if (!file || file->f_op != &slash_qdma_buf_fops) + return NULL; + return file->private_data; } /* ───────────────────────────────────────────────────────────────────── - * IOCTL: buffer register / unregister + * IOCTL: buffer create * ───────────────────────────────────────────────────────────────────── */ /** - * slash_qdma_ioctl_buf_register_w() - Pin and DMA-map a host buffer. - * @misc: Miscdevice handle (for logging). - * @client: Owning control-fd client. - * @uarg: User pointer to a struct slash_qdma_buf_register. + * slash_qdma_ioctl_buf_create_w() - Allocate a kernel buffer and return its fd. + * @misc: Miscdevice handle (for logging). + * @qdma_dev: QDMA device the buffer is bound to (for DMA mapping). + * @uarg: User pointer to a struct slash_qdma_buf_create. * - * Pins the pages backing the user buffer, builds a scatter-gather list - * (one 4 KiB base page per entry), DMA-maps every entry once, and inserts - * the resulting buffer into the client's table under a freshly allocated - * buf_id. + * Allocates the buffer's pages, builds the SGL, and DMA-maps everything once, + * then wraps it in an anon_inode fd whose mmap exposes the pages for CPU + * access. The fd is returned as the ioctl return value (same convention as + * the BAR/queue-pair fd ioctls). Closing the fd (and unmapping any VMA) + * releases the buffer. * - * Return: 0 on success, negative errno on failure. + * Return: The new buffer fd (>= 0) on success, negative errno on failure. 
*/ -static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, - struct slash_qdma_client *client, - void __user *uarg) +static int slash_qdma_ioctl_buf_create_w(struct miscdevice *misc, + struct slash_qdma_dev *qdma_dev, + void __user *uarg) { - struct slash_qdma_buf_register req; - struct slash_qdma_dev *qdma_dev = client->qdma_dev; + struct slash_qdma_buf_create req; struct slash_qdma_buf *buf; + struct file *file; __u32 user_size = 0; size_t copy_size; - u32 buf_id; + int fd; int rv; if (copy_from_user(&user_size, uarg, sizeof(user_size))) return -EFAULT; - if (user_size < SLASH_QDMA_BUF_REGISTER_MIN_SIZE) { + if (user_size < SLASH_QDMA_BUF_CREATE_MIN_SIZE) { dev_warn(misc->this_device, - "qdma: BUF_REGISTER size too small (%u)\n", user_size); + "qdma: BUF_CREATE size too small (%u)\n", user_size); return -EINVAL; } @@ -2887,224 +2854,217 @@ static int slash_qdma_ioctl_buf_register_w(struct miscdevice *misc, if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) return -EFAULT; - if (req.flags != 0) + if (req.flags & ~O_CLOEXEC) return -EINVAL; if (req.length == 0 || (req.length % PAGE_SIZE) != 0) return -EINVAL; - if ((req.user_addr % PAGE_SIZE) != 0) - return -EINVAL; - buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; kref_init(&buf->ref); + /* The buffer holds a device reference for its whole lifetime. */ + kref_get(&qdma_dev->ref); buf->qdma_dev = qdma_dev; buf->length = req.length; - buf->iocb.buf = (void __user *)(unsigned long)req.user_addr; - buf->iocb.len = (size_t)req.length; - /* - * Pin the pages and build the SGL once. Pin writable (write=true) so - * the same registration serves C2H transfers, where the device writes - * into the pages. 
- */ - rv = slash_qdma_map_user_buf_to_sgl(&buf->iocb, true); + rv = slash_qdma_buf_alloc(buf); if (rv < 0) { + kref_put(&qdma_dev->ref, slash_qdma_dev_release); kfree(buf); return rv; } - if (buf->iocb.pages_nr == 0 || !buf->iocb.sgl) { - slash_qdma_unmap_user_buf(&buf->iocb, false); - slash_qdma_iocb_release(&buf->iocb); - kfree(buf); - return -EINVAL; - } - - buf->granule = buf->iocb.sgl[0].len; - - rv = slash_qdma_buf_dma_map(buf); - if (rv < 0) { - slash_qdma_unmap_user_buf(&buf->iocb, false); - slash_qdma_iocb_release(&buf->iocb); - kfree(buf); + file = anon_inode_getfile("slash_qdma_buf", &slash_qdma_buf_fops, buf, + O_RDWR | (req.flags & O_CLOEXEC)); + if (IS_ERR(file)) { + rv = PTR_ERR(file); + slash_qdma_buf_put(buf); /* drops the only ref: frees buf + dev ref */ return rv; } - rv = xa_alloc(&client->buffers, &buf_id, buf, - SLASH_QDMA_BUF_ID_RANGE, GFP_KERNEL); - if (rv < 0) { - slash_qdma_buf_dma_unmap(buf); - slash_qdma_unmap_user_buf(&buf->iocb, false); - slash_qdma_iocb_release(&buf->iocb); - kfree(buf); - return rv; + fd = get_unused_fd_flags(req.flags & O_CLOEXEC); + if (fd < 0) { + fput(file); /* triggers buf release */ + return fd; } - buf->buf_id = buf_id; - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "buf register: id=%u addr=0x%llx len=%llu granule=%llu entries=%u\n", - buf_id, (unsigned long long)req.user_addr, - (unsigned long long)req.length, - (unsigned long long)buf->granule, - buf->iocb.pages_nr); + "buf create: fd=%d len=%llu granule=%u pages=%u\n", + fd, (unsigned long long)req.length, + buf->granule, buf->pages_nr); - /* Copy the assigned buf_id back to userspace. */ + /* Fill the output fields before installing the fd. 
*/ req.size = sizeof(req); - req.buf_id = buf_id; + req.granule = buf->granule; req.transfer_hint = SLASH_QDMA_TRANSFER_HINT_V80; copy_size = min_t(size_t, user_size, sizeof(req)); if (copy_to_user(uarg, &req, copy_size)) { - xa_erase(&client->buffers, buf_id); - slash_qdma_buf_put(buf); + put_unused_fd(fd); + fput(file); return -EFAULT; } if (user_size > sizeof(req)) { if (clear_user((void __user *)((unsigned long)uarg + sizeof(req)), user_size - sizeof(req))) { - xa_erase(&client->buffers, buf_id); - slash_qdma_buf_put(buf); + put_unused_fd(fd); + fput(file); return -EFAULT; } } - return 0; + fd_install(fd, file); + + return fd; } /** - * slash_qdma_ioctl_buf_unregister_w() - Drop a registered buffer. - * @misc: Miscdevice handle (unused). - * @client: Owning control-fd client. - * @uarg: User pointer to a struct slash_qdma_buf_unregister. + * struct slash_qdma_xfer_req - Runtime state for one sub-transfer submission. + * @qreq: libqdma request (built by slash_qdma_xfer_prep()). + * @done: Completion signalled by @qreq.fp_done for async submissions. + * @buf: Kernel buffer the transfer references (one ref held). + * @qhndl: Resolved libqdma queue handle for the direction/qpair. + * @start_entry: First page index of the buffer slice being transferred. + * @n_entries: Number of pages in the slice (for the DMA sync). + * @dma_dir: DMA direction for the streaming sync calls. + * @is_c2h: True for a C2H (device-to-host) sub-transfer, so the slice + * is synced back for the CPU after completion. + * @bytes_done: Bytes transferred, filled on completion. + * @err: Negative errno if the sub-transfer failed, else 0. + * @async_inflight: True once queued asynchronously and awaiting fp_done. + * + * Allocated as an array (one per sub-transfer) for the duration of a transfer + * batch. @qreq must outlive the in-flight async request, so the array stays + * alive until every async completion has fired. 
+ */ +struct slash_qdma_xfer_req { + struct qdma_request qreq; + struct completion done; + struct slash_qdma_buf *buf; + unsigned long qhndl; + u64 start_entry; + u64 n_entries; + enum dma_data_direction dma_dir; + bool is_c2h; + unsigned int bytes_done; + int err; + bool async_inflight; +}; + +/** + * slash_qdma_xfer_done() - libqdma fp_done callback for async sub-transfers. + * @qreq: The completed request (embedded in a slash_qdma_xfer_req). + * @bytes_done: Bytes transferred. + * @err: Negative errno on failure, else 0. * - * Removes the buffer from the client table so no new transfer can find it, - * then drops the table's reference. Actual unpin/unmap is deferred to the - * buffer's release callback once any in-flight transfer has finished. + * Records the result and wakes the submitter waiting on @done. Runs in + * libqdma worker-thread context. * - * Return: 0 on success, negative errno on failure. + * Return: Always 0 (libqdma may free/re-task the request). */ -static int slash_qdma_ioctl_buf_unregister_w(struct miscdevice *misc, - struct slash_qdma_client *client, - void __user *uarg) +static int slash_qdma_xfer_done(struct qdma_request *qreq, + unsigned int bytes_done, int err) { - struct slash_qdma_buf_unregister req; - struct slash_qdma_buf *buf; - __u32 user_size = 0; - - (void)misc; - - if (copy_from_user(&user_size, uarg, sizeof(user_size))) - return -EFAULT; - - if (user_size < SLASH_QDMA_BUF_UNREGISTER_MIN_SIZE) - return -EINVAL; - - memset(&req, 0, sizeof(req)); - if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) - return -EFAULT; - - buf = xa_erase(&client->buffers, req.buf_id); - if (!buf) - return -ENOENT; - - slash_qdma_buf_put(buf); + struct slash_qdma_xfer_req *xr = + container_of(qreq, struct slash_qdma_xfer_req, qreq); + xr->bytes_done = bytes_done; + xr->err = err; + complete(&xr->done); return 0; } /** - * slash_qdma_qpair_transfer() - Registered-buffer DMA transfer on a qpair fd. 
- * @file: Anon_inode file for the queue pair. - * @uarg: User pointer to a struct slash_qdma_transfer. - * - * Looks up the registered buffer by id in the owning client, validates the - * requested slice against the buffer's page granule and length, resolves the - * queue handle for the requested direction, and submits the cached, - * pre-DMA-mapped SGL slice (req->dma_mapped = 1) to libqdma. - * - * Unlike the legacy read/write path, no pages are pinned or DMA-mapped here: - * that work was amortised at registration time. - * - * Return: number of bytes transferred (>= 0) on success, negative errno on - * failure. - */ -static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) + * slash_qdma_xfer_prep() - Validate one sub-transfer and build its request. + * @qdma_dev: QDMA device. + * @entry: Queue pair entry selected by the sub-transfer's qpair_index. + * @desc: User-supplied sub-transfer descriptor. + * @xr: [out] Receives a built (but not yet submitted) request, and a + * reference on the kernel buffer it targets. + * + * Shared submit core used by both the synchronous transfer ioctl and the + * optional io_uring uring_cmd path. Resolves the buffer fd named by the + * descriptor and refs the buffer, validates the slice against the buffer's + * page granule and length, resolves the queue handle for the requested + * direction, syncs the slice for the device, and fills the cached, + * pre-DMA-mapped SGL slice into @xr->qreq (dma_mapped = 1, fp_done = NULL). + * No pages are allocated or DMA-mapped here; that was amortised at creation. + * + * On success the caller owns the buffer ref in @xr->buf and must release it + * with slash_qdma_buf_put() once the request is no longer in flight. + * + * Return: 0 on success, negative errno on failure (no ref held on failure). 
+ */ +static int slash_qdma_xfer_prep(struct slash_qdma_dev *qdma_dev, + struct slash_qdma_qpair_entry *entry, + const struct slash_qdma_subxfer *desc, + struct slash_qdma_xfer_req *xr) { - struct slash_qdma_qpair_file_ctx *ctx = file->private_data; - struct slash_qdma_transfer req; - struct slash_qdma_dev *qdma_dev; - struct slash_qdma_qpair_entry *entry; - struct slash_qdma_client *client; struct slash_qdma_buf *buf; - struct qdma_request qreq; + struct file *file; unsigned long qhndl; bool write; u32 dir_bit; enum queue_type_t qtype; + enum dma_data_direction dma_dir; u64 start_entry, n_entries; - __u32 user_size = 0; - ssize_t res; - - if (!ctx) - return -EINVAL; - - qdma_dev = ctx->qdma_dev; - entry = ctx->entry; - client = ctx->client; - if (!qdma_dev || !entry || !client) - return -ENODEV; - - if (copy_from_user(&user_size, uarg, sizeof(user_size))) - return -EFAULT; - - if (user_size < SLASH_QDMA_TRANSFER_MIN_SIZE) - return -EINVAL; - - memset(&req, 0, sizeof(req)); - if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) - return -EFAULT; - - switch (req.direction) { + switch (desc->direction) { case SLASH_QDMA_XFER_H2C: write = true; dir_bit = SLASH_QDMA_DIR_H2C; qtype = Q_H2C; + dma_dir = DMA_TO_DEVICE; break; case SLASH_QDMA_XFER_C2H: write = false; dir_bit = SLASH_QDMA_DIR_C2H; qtype = Q_C2H; + dma_dir = DMA_FROM_DEVICE; break; default: return -EINVAL; } - /* Resolve and ref the registered buffer. */ - buf = slash_qdma_buf_lookup_get(client, req.buf_id); - if (!buf) - return -ENOENT; + /* libqdma's request count is a 32-bit byte count. */ + if (desc->length == 0 || desc->length > UINT_MAX) + return -EINVAL; + + /* Resolve the buffer fd and take a ref that outlives the fd. */ + file = fget(desc->buf_fd); + if (!file) + return -EBADF; + buf = slash_qdma_buf_from_file(file); + if (!buf) { + fput(file); + return -EINVAL; + } + /* DMA mappings are device-specific: the buffer must belong to this device. 
*/ + if (buf->qdma_dev != qdma_dev) { + fput(file); + return -EINVAL; + } + slash_qdma_buf_get(buf); + fput(file); /* Validate the requested slice against the buffer's page granule. */ - if (buf->granule == 0 || req.length == 0 || - (req.buf_offset % buf->granule) != 0 || - (req.length % buf->granule) != 0) { + if (buf->granule == 0 || + (desc->buf_offset % buf->granule) != 0 || + (desc->length % buf->granule) != 0) { slash_qdma_buf_put(buf); return -EINVAL; } - if (req.buf_offset > buf->length || - req.length > buf->length - req.buf_offset) { + if (desc->buf_offset > buf->length || + desc->length > buf->length - desc->buf_offset) { slash_qdma_buf_put(buf); return -EINVAL; } - start_entry = req.buf_offset / buf->granule; - n_entries = req.length / buf->granule; - if (start_entry + n_entries > buf->iocb.pages_nr) { + start_entry = desc->buf_offset / buf->granule; + n_entries = desc->length / buf->granule; + if (start_entry + n_entries > buf->pages_nr) { slash_qdma_buf_put(buf); return -EINVAL; } @@ -3126,68 +3086,403 @@ static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) mutex_unlock(&qdma_dev->lock); /* - * Submit the cached SGL slice. dma_mapped = 1 tells libqdma the SGL is - * already DMA-mapped (dma_addr filled at registration), so it skips the - * per-request map/unmap entirely. + * Hand the touched pages to the device. The mapping is persistent + * (dma_mapped = 1); only this slice is synced, so coherency cost scales + * with the transfer, not the whole buffer. */ - memset(&qreq, 0, sizeof(qreq)); - qreq.sgcnt = (unsigned int)n_entries; - qreq.sgl = &buf->iocb.sgl[start_entry]; - qreq.write = write ? 
1 : 0; - qreq.dma_mapped = 1; - qreq.udd_len = 0; - qreq.ep_addr = (u64)req.dev_addr; - qreq.count = (unsigned int)req.length; - qreq.timeout_ms = 10 * 1000; - qreq.fp_done = NULL; - qreq.h2c_eot = 1; + slash_qdma_buf_sync_for_device(buf, start_entry, n_entries, dma_dir); - SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, - "transfer: qid=%u buf=%u off=%llu dev=0x%llx len=%llu dir=%s\n", - ctx->qid, req.buf_id, - (unsigned long long)req.buf_offset, - (unsigned long long)req.dev_addr, - (unsigned long long)req.length, - write ? "H2C" : "C2H"); + /* + * Build the request from the cached SGL slice. dma_mapped = 1 tells + * libqdma the SGL is already DMA-mapped (dma_addr filled at creation), + * so it skips the per-request map/unmap entirely. + */ + memset(&xr->qreq, 0, sizeof(xr->qreq)); + xr->qreq.sgcnt = (unsigned int)n_entries; + xr->qreq.sgl = &buf->sgl[start_entry]; + xr->qreq.write = write ? 1 : 0; + xr->qreq.dma_mapped = 1; + xr->qreq.udd_len = 0; + xr->qreq.ep_addr = (u64)desc->dev_addr; + xr->qreq.count = (unsigned int)desc->length; + xr->qreq.timeout_ms = 10 * 1000; + xr->qreq.fp_done = NULL; + xr->qreq.h2c_eot = 1; + + xr->buf = buf; + xr->qhndl = qhndl; + xr->start_entry = start_entry; + xr->n_entries = n_entries; + xr->dma_dir = dma_dir; + xr->is_c2h = !write; + xr->bytes_done = 0; + xr->err = 0; + xr->async_inflight = false; + return 0; +} + +/** + * slash_qdma_xfer_finish() - Post-completion DMA sync + buffer ref drop. + * @xr: A prepared (and now completed) sub-transfer request. + * + * For a C2H sub-transfer that moved data, makes the device-written pages + * visible to the CPU before releasing the buffer reference taken in prep. + */ +static void slash_qdma_xfer_finish(struct slash_qdma_xfer_req *xr) +{ + if (xr->is_c2h && xr->bytes_done) + slash_qdma_buf_sync_for_cpu(xr->buf, xr->start_entry, xr->n_entries, + xr->dma_dir); + slash_qdma_buf_put(xr->buf); +} + +/** + * slash_qdma_qpair_transfer() - Buffer DMA transfer batch on a queue-pair fd. 
+ * @file: Anon_inode file for the queue-pair collection. + * @uarg: User pointer to a struct slash_qdma_transfer (1..N sub-transfers). + * + * Validates and prepares every sub-transfer, then submits them so those that + * target distinct queue pairs run concurrently: all but the last are submitted + * asynchronously (fp_done set), the last is submitted synchronously (blocking), + * and the async ones are then waited on. A single sub-transfer therefore takes + * the plain blocking path with no async overhead. + * + * Return: total number of bytes transferred (>= 0) on success; the first + * sub-transfer error (negative errno) on failure. + */ +static long slash_qdma_qpair_transfer(struct file *file, void __user *uarg) +{ + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + struct slash_qdma_dev *qdma_dev; + struct slash_qdma_transfer req; + struct slash_qdma_xfer_req *xrs; + __u32 user_size = 0; + u32 count, i, last; + u64 total = 0; + int first_err = 0; + ssize_t res; + + if (!ctx) + return -EINVAL; + + qdma_dev = ctx->qdma_dev; + + if (!qdma_dev || ctx->n_qpairs == 0) + return -ENODEV; + + if (copy_from_user(&user_size, uarg, sizeof(user_size))) + return -EFAULT; + + if (user_size < SLASH_QDMA_TRANSFER_MIN_SIZE) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, uarg, min_t(size_t, user_size, sizeof(req)))) + return -EFAULT; + + count = req.count; + if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; - res = qdma_request_submit(qdma_dev->qdma_handle, qhndl, &qreq); + xrs = kcalloc(count, sizeof(*xrs), GFP_KERNEL); + if (!xrs) + return -ENOMEM; + + /* Validate and prepare every sub-transfer (each takes a buffer ref). 
*/ + for (i = 0; i < count; i++) { + const struct slash_qdma_subxfer *d = &req.xfers[i]; + int rv; - slash_qdma_buf_put(buf); + if (d->qpair_index >= ctx->n_qpairs) + rv = -EINVAL; + else + rv = slash_qdma_xfer_prep(qdma_dev, + ctx->entries[d->qpair_index], d, + &xrs[i]); + if (rv) { + while (i-- > 0) + slash_qdma_buf_put(xrs[i].buf); + kfree(xrs); + return rv; + } + SLASH_QDMA_OP_DEV_LOG(&qdma_dev->pdev->dev, + "transfer[%u]: qid=%u buf_fd=%d off=%llu dev=0x%llx len=%llu dir=%s\n", + i, ctx->qids[d->qpair_index], d->buf_fd, + (unsigned long long)d->buf_offset, + (unsigned long long)d->dev_addr, + (unsigned long long)d->length, + d->direction == SLASH_QDMA_XFER_H2C ? "H2C" : "C2H"); + } + + last = count - 1; + + /* + * Submit all but the last asynchronously so the sub-transfers run on their + * (distinct) queue pairs in parallel; libqdma calls fp_done on completion. + */ + for (i = 0; i < last; i++) { + init_completion(&xrs[i].done); + xrs[i].qreq.fp_done = slash_qdma_xfer_done; + res = qdma_request_submit(qdma_dev->qdma_handle, xrs[i].qhndl, + &xrs[i].qreq); + if (res < 0) + xrs[i].err = (int)res; /* not queued: fp_done will not fire */ + else + xrs[i].async_inflight = true; + } + + /* Submit the last sub-transfer synchronously (blocks until complete). */ + res = qdma_request_submit(qdma_dev->qdma_handle, xrs[last].qhndl, + &xrs[last].qreq); if (res < 0) - return (long)res; + xrs[last].err = (int)res; + else + xrs[last].bytes_done = (unsigned int)res; + + /* Wait for the async sub-transfers, then aggregate (first error wins). 
*/ + for (i = 0; i < last; i++) { + if (xrs[i].async_inflight) + wait_for_completion(&xrs[i].done); + } + + for (i = 0; i < count; i++) { + if (xrs[i].err && !first_err) + first_err = xrs[i].err; + total += xrs[i].bytes_done; + slash_qdma_xfer_finish(&xrs[i]); + } + + kfree(xrs); + + if (first_err) + return (long)first_err; + + return (long)total; +} + +#if defined(SLASH_HAVE_URING_CMD) +/** + * struct slash_qdma_uring_cmd_ctx - Async state for one uring_cmd transfer. + * @cmd: The io_uring command being served. + * @xrs: Per-sub-transfer requests (buffer refs held until completion). + * @count: Number of sub-transfers. + * @outstanding: Sub-transfers not yet completed; the one that drops it to 0 + * schedules the completion task-work. + * @total_bytes: Aggregate bytes transferred. + * @first_err: First negative errno seen, or 0. + * + * Heap-allocated for the lifetime of an asynchronous transfer; a pointer to it + * is stashed in cmd->pdu so the completion task-work can recover it. + */ +struct slash_qdma_uring_cmd_ctx { + struct io_uring_cmd *cmd; + struct slash_qdma_xfer_req xrs[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 count; + atomic_t outstanding; + atomic_long_t total_bytes; + atomic_t first_err; +}; + +/** + * slash_qdma_uring_cmd_complete() - Task-work that finishes a uring_cmd. + * @cmd: The io_uring command. + * @issue_flags: io_uring issue flags for io_uring_cmd_done(). + * + * Runs in task context once all sub-transfers have completed: drops the + * buffer refs, completes the CQE with the total bytes (or first error), and + * frees the command context. + */ +static void slash_qdma_uring_cmd_complete(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct slash_qdma_uring_cmd_ctx *uc; + int err; + long ret; + u32 i; - return (long)res; + memcpy(&uc, cmd->pdu, sizeof(uc)); + err = atomic_read(&uc->first_err); + ret = err ? 
err : atomic_long_read(&uc->total_bytes); + + for (i = 0; i < uc->count; i++) + slash_qdma_xfer_finish(&uc->xrs[i]); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); + kfree(uc); } +/** + * slash_qdma_uring_xfer_done() - fp_done for async uring_cmd sub-transfers. + * @qreq: Completed request (embedded in a slash_qdma_xfer_req). + * @bytes_done: Bytes transferred. + * @err: Negative errno on failure, else 0. + * + * Accumulates the result and, when the last sub-transfer of the command + * completes, schedules the completion task-work. Runs in libqdma worker + * context. + * + * Return: Always 0. + */ +static int slash_qdma_uring_xfer_done(struct qdma_request *qreq, + unsigned int bytes_done, int err) +{ + struct slash_qdma_xfer_req *xr = + container_of(qreq, struct slash_qdma_xfer_req, qreq); + struct slash_qdma_uring_cmd_ctx *uc = + (struct slash_qdma_uring_cmd_ctx *)qreq->uld_data; + + xr->bytes_done = bytes_done; + xr->err = err; + if (bytes_done) + atomic_long_add(bytes_done, &uc->total_bytes); + if (err) + atomic_cmpxchg(&uc->first_err, 0, err); + + if (atomic_dec_and_test(&uc->outstanding)) + io_uring_cmd_complete_in_task(uc->cmd, + slash_qdma_uring_cmd_complete); + return 0; +} + +/** + * slash_qdma_qpair_uring_cmd() - Asynchronous transfer batch via io_uring. + * @cmd: The io_uring command; its inline SQE data is a single __u64 + * userspace pointer to a struct slash_qdma_transfer. + * @issue_flags: io_uring issue flags. + * + * The optional async sibling of SLASH_QDMA_QPAIR_IOCTL_TRANSFER: it prepares + * every sub-transfer, submits them all asynchronously (so they run on their + * distinct queue pairs concurrently), and completes the CQE from task-work + * once they all finish. Many such commands can be in flight at once, which is + * the intended multi-buffer optimization. 
+ * + * Return: -EIOCBQUEUED once submission is under way (completion arrives via + * the CQE); a negative errno if the command is rejected before any + * sub-transfer is queued; -EAGAIN to defer a non-blocking issue. + */ +static int slash_qdma_qpair_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct file *file = cmd->file; + struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + struct slash_qdma_dev *qdma_dev; + struct slash_qdma_uring_cmd_ctx *uc; + struct slash_qdma_transfer req; + u64 uptr = 0; + u32 count, i; + ssize_t res; + + if (cmd->cmd_op != SLASH_QDMA_URING_CMD_TRANSFER) + return -EOPNOTSUPP; + + if (!ctx) + return -EINVAL; + + qdma_dev = ctx->qdma_dev; + if (!qdma_dev || ctx->n_qpairs == 0) + return -ENODEV; + + /* + * Copying the descriptor from userspace may fault and sleep, so defer a + * non-blocking issue to a blocking io_uring context. + */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + /* The SQE inline command carries the user pointer to the descriptor. */ + memcpy(&uptr, slash_qdma_uring_cmd_payload(cmd), sizeof(uptr)); + + memset(&req, 0, sizeof(req)); + if (copy_from_user(&req, u64_to_user_ptr(uptr), sizeof(req))) + return -EFAULT; + + count = req.count; + if (count == 0 || count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) + return -ENOMEM; + + uc->cmd = cmd; + uc->count = count; + atomic_set(&uc->outstanding, count); + atomic_long_set(&uc->total_bytes, 0); + atomic_set(&uc->first_err, 0); + + /* Validate and prepare every sub-transfer before queueing any of them. 
*/ + for (i = 0; i < count; i++) { + const struct slash_qdma_subxfer *d = &req.xfers[i]; + int rv; + + if (d->qpair_index >= ctx->n_qpairs) + rv = -EINVAL; + else + rv = slash_qdma_xfer_prep(qdma_dev, + ctx->entries[d->qpair_index], d, + &uc->xrs[i]); + if (rv) { + while (i-- > 0) + slash_qdma_buf_put(uc->xrs[i].buf); + kfree(uc); + return rv; + } + uc->xrs[i].qreq.uld_data = (unsigned long)uc; + uc->xrs[i].qreq.fp_done = slash_qdma_uring_xfer_done; + } + + /* Stash the context for the completion task-work. */ + memcpy(cmd->pdu, &uc, sizeof(uc)); + + /* + * Submit all sub-transfers asynchronously. Completion (success or the + * inline submit-failure path below) is funnelled through the outstanding + * counter so the CQE is posted exactly once from task-work. + */ + for (i = 0; i < count; i++) { + res = qdma_request_submit(qdma_dev->qdma_handle, uc->xrs[i].qhndl, + &uc->xrs[i].qreq); + if (res < 0) { + /* Not queued: fp_done will not fire, account for it here. */ + uc->xrs[i].err = (int)res; + atomic_cmpxchg(&uc->first_err, 0, (int)res); + if (atomic_dec_and_test(&uc->outstanding)) + io_uring_cmd_complete_in_task(uc->cmd, + slash_qdma_uring_cmd_complete); + } + } + + return -EIOCBQUEUED; +} +#endif /* SLASH_HAVE_URING_CMD */ + /** * slash_qdma_qpair_ioctl() - Ioctl handler for per-qpair anon_inode fds. * @file: Anon_inode file. * @cmd: Ioctl command number. * @arg: User-space argument. * - * Supports SLASH_QDMA_QPAIR_IOCTL_TRANSFER (registered-buffer DMA transfer). + * Supports SLASH_QDMA_IOCTL_BUF_CREATE (allocate a kernel buffer for clients + * that hold only a queue-pair fd) and SLASH_QDMA_QPAIR_IOCTL_TRANSFER (buffer + * DMA transfer). * - * Return: bytes transferred (>= 0) for TRANSFER, or -ENOTTY for any other - * command. + * Return: bytes transferred (>= 0) for TRANSFER, a new fd for BUF_CREATE, or + * -ENOTTY for any other command. 
*/ static long slash_qdma_qpair_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct slash_qdma_qpair_file_ctx *ctx = file->private_data; - if (!ctx || !ctx->client || !ctx->qdma_dev) + if (!ctx || !ctx->qdma_dev) return -ENODEV; switch (cmd) { - case SLASH_QDMA_IOCTL_BUF_REGISTER: - return slash_qdma_ioctl_buf_register_w(&ctx->qdma_dev->misc, - ctx->client, - (void __user *)arg); - case SLASH_QDMA_IOCTL_BUF_UNREGISTER: - return slash_qdma_ioctl_buf_unregister_w(&ctx->qdma_dev->misc, - ctx->client, - (void __user *)arg); + case SLASH_QDMA_IOCTL_BUF_CREATE: + return slash_qdma_ioctl_buf_create_w(&ctx->qdma_dev->misc, + ctx->qdma_dev, + (void __user *)arg); case SLASH_QDMA_QPAIR_IOCTL_TRANSFER: return slash_qdma_qpair_transfer(file, (void __user *)arg); default: @@ -3213,14 +3508,15 @@ static long slash_qdma_qpair_ioctl(struct file *file, static int slash_qdma_qpair_release(struct inode *inode, struct file *file) { struct slash_qdma_qpair_file_ctx *ctx = file->private_data; + u32 i; (void)inode; if (ctx) { - if (ctx->entry) - slash_qdma_qpair_put(ctx->entry); - if (ctx->client) - kref_put(&ctx->client->ref, slash_qdma_client_release); + for (i = 0; i < ctx->n_qpairs; i++) { + if (ctx->entries[i]) + slash_qdma_qpair_put(ctx->entries[i]); + } if (ctx->qdma_dev) kref_put(&ctx->qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); @@ -3235,35 +3531,39 @@ static int slash_qdma_qpair_release(struct inode *inode, struct file *file) * ───────────────────────────────────────────────────────────────────── */ /** - * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for a queue pair. + * slash_qdma_ioctl_qpair_get_fd_w() - Create an anon_inode fd for queue I/O. * @misc: Miscdevice handle (unused). * @qdma_dev: QDMA device. * @uarg: User-space pointer to a slash_qdma_qpair_fd_request struct. * * Creates an anonymous inode file descriptor that userspace can use for - * registered-buffer transfer ioctls on the specified queue pair. 
The fd - * holds references to the qpair entry, client context, and device, - * preventing any of them from being freed while the fd is open. + * buffer transfer ioctls. The fd is a collection of one or two queue pairs + * (see slash_qdma_qpair_fd_request): @qpair_count == 0 binds the single qpair + * named by @qid (back-compat), otherwise @qpair_count IDs from @qpair_ids are + * bound, their array index becoming the transfer qpair_index. * - * The only supported flag is O_CLOEXEC (close-on-exec). + * The fd holds references to each bound qpair entry and the device, preventing + * either from being freed while the fd is open. Each bound qpair keeps the + * per-qpair configuration (mm_channel, ring sizes, directions) it was given at + * add time, so the channels can differ. * - * The fd is ioctl-only for data movement; transfers pass the device-side - * address in struct slash_qdma_transfer. + * The only supported flag is O_CLOEXEC (close-on-exec). * - * Error handling: on any failure after resources are acquired, all - * refs and allocations are cleaned up before returning. + * Error handling: on any failure after resources are acquired, all refs and + * allocations are cleaned up before returning. * * Return: The new fd (>= 0) on success, negative errno on failure. */ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, - struct slash_qdma_client *client, + struct slash_qdma_dev *qdma_dev, void __user *uarg) { - struct slash_qdma_dev *qdma_dev = client->qdma_dev; struct slash_qdma_qpair_fd_request req; __u32 user_size = 0; + __u32 ids[SLASH_QDMA_FD_MAX_QPAIRS]; + u32 n_qpairs; + u32 i; size_t copy_size; - struct slash_qdma_qpair_entry *entry; struct slash_qdma_qpair_file_ctx *ctx; struct file *file; int fd; @@ -3287,55 +3587,70 @@ static int slash_qdma_ioctl_qpair_get_fd_w(struct miscdevice *misc, if (req.flags & ~O_CLOEXEC) return -EINVAL; - /* Look up the qpair entry and take refs while holding the lock. 
*/ + /* + * Resolve the requested qpair-id set. qpair_count == 0 is the legacy + * single-qpair form using @qid; otherwise bind @qpair_count ids. + */ + if (req.qpair_count == 0) { + n_qpairs = 1; + ids[0] = req.qid; + } else { + if (req.qpair_count > SLASH_QDMA_FD_MAX_QPAIRS) + return -EINVAL; + n_qpairs = req.qpair_count; + for (i = 0; i < n_qpairs; i++) + ids[i] = req.qpair_ids[i]; + } + + /* Allocate the per-fd context. */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* Look up each qpair entry and take refs while holding the lock. */ mutex_lock(&qdma_dev->lock); if (qdma_dev->hw_shutdown || !qdma_dev->have_qdma_handle) { mutex_unlock(&qdma_dev->lock); + kfree(ctx); return -ENODEV; } - entry = slash_qdma_qpair_lookup(qdma_dev, req.qid); - if (!entry || !entry->dir_mask) { - mutex_unlock(&qdma_dev->lock); - return -ENOENT; + for (i = 0; i < n_qpairs; i++) { + struct slash_qdma_qpair_entry *entry = + slash_qdma_qpair_lookup(qdma_dev, ids[i]); + + if (!entry || !entry->dir_mask) { + /* Drop refs taken so far for the earlier entries. */ + while (i-- > 0) + slash_qdma_qpair_put(ctx->entries[i]); + mutex_unlock(&qdma_dev->lock); + kfree(ctx); + return -ENOENT; + } + + /* + * Take a ref on the entry. These refs are held by the file context + * and released when the fd is closed, ensuring the entries cannot be + * freed prematurely. + */ + slash_qdma_qpair_get(entry); + ctx->entries[i] = entry; + ctx->qids[i] = ids[i]; } + ctx->n_qpairs = n_qpairs; - /* - * Take a ref on the entry and the device. These refs are held by - * the file context and released when the fd is closed, ensuring - * neither the entry nor the device can be freed prematurely. - */ - slash_qdma_qpair_get(entry); kref_get(&qdma_dev->ref); - /* - * Take a ref on the owning client so handle-based transfers issued on - * this qpair fd can resolve registered buffers even if the control fd - * that created the qpair is closed first. 
- */ - kref_get(&client->ref); mutex_unlock(&qdma_dev->lock); - /* Allocate the per-fd context. */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - slash_qdma_qpair_put(entry); - kref_put(&client->ref, slash_qdma_client_release); - kref_put(&qdma_dev->ref, slash_qdma_dev_release); - return -ENOMEM; - } - ctx->qdma_dev = qdma_dev; - ctx->entry = entry; - ctx->client = client; - ctx->qid = req.qid; /* Create the anonymous inode file with read/write access. */ file = anon_inode_getfile("slash_qdma_qpair", &slash_qdma_qpair_fops, ctx, O_RDWR | (req.flags & O_CLOEXEC)); if (IS_ERR(file)) { err = PTR_ERR(file); - slash_qdma_qpair_put(entry); - kref_put(&client->ref, slash_qdma_client_release); + for (i = 0; i < ctx->n_qpairs; i++) + slash_qdma_qpair_put(ctx->entries[i]); kref_put(&qdma_dev->ref, slash_qdma_dev_release); kfree(ctx); return err; diff --git a/driver/tests/test_slash_qdma.c b/driver/tests/test_slash_qdma.c index f9b0eb7e..904b3e81 100644 --- a/driver/tests/test_slash_qdma.c +++ b/driver/tests/test_slash_qdma.c @@ -2,9 +2,10 @@ /* * QDMA control device (/dev/slash_qdma_ctl) ABI tests. * - * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO and the per-qpair - * anon-inode fd (read/write/lseek/pread/pwrite, multi-fd, wrong-direction, - * mmap-unsupported, HBM/DDR region round trips). See + * Covers QPAIR_ADD / Q_OP / QPAIR_GET_FD / INFO, the kernel-owned buffer fd + * (BUF_CREATE + mmap), and the per-qpair anon-inode transfer fd + * (TRANSFER ioctl, multi-fd, wrong-direction, read/write/lseek/mmap + * unsupported, HBM/DDR region round trips). See * docs/reference/kernel-abi/index.rst for the spec. 
*/ @@ -35,12 +36,60 @@ static void fill_pattern(uint8_t *buf, size_t len) buf[i] = (uint8_t)(i & 0xff); } -static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, - uint32_t *buf_id, uint32_t *transfer_hint); -static int qdma_buf_unregister(int ctl_fd, uint32_t buf_id); -static long qdma_buf_transfer(int io_fd, uint32_t buf_id, uint64_t buf_offset, +/* + * Create a kernel-owned DMA buffer via BUF_CREATE on @ioctl_fd (control fd or + * queue-pair fd). Returns the new buffer fd (>= 0), or -errno on failure. + */ +static int qdma_buf_create(int ioctl_fd, uint64_t length, uint32_t *granule, + uint32_t *transfer_hint) +{ + struct slash_qdma_buf_create req; + int fd; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.flags = O_CLOEXEC; + req.length = length; + + fd = ioctl(ioctl_fd, SLASH_QDMA_IOCTL_BUF_CREATE, &req); + if (fd < 0) + return -errno; + + if (granule) + *granule = req.granule; + if (transfer_hint) + *transfer_hint = req.transfer_hint; + return fd; +} + +/* mmap a buffer fd for CPU access; returns the mapping or MAP_FAILED. */ +static void *qdma_buf_map(int buf_fd, uint64_t length) +{ + return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, buf_fd, 0); +} + +/* + * Issue a single-sub-transfer buffer transfer on a qpair fd (qpair_index 0); + * returns the ioctl result (bytes transferred or -1 with errno set). 
+ */ +static long qdma_buf_transfer(int io_fd, int buf_fd, uint64_t buf_offset, uint64_t dev_addr, uint64_t length, - uint32_t direction); + uint32_t direction) +{ + struct slash_qdma_transfer req; + + memset(&req, 0, sizeof(req)); + req.size = sizeof(req); + req.count = 1; + req.xfers[0].qpair_index = 0; + req.xfers[0].direction = direction; + req.xfers[0].buf_fd = buf_fd; + req.xfers[0].buf_offset = buf_offset; + req.xfers[0].dev_addr = dev_addr; + req.xfers[0].length = length; + + return ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); +} /* ---------- fixture ---------- */ @@ -133,40 +182,118 @@ TEST_F(qdma, qpair_lifecycle) TEST_F(qdma, write_read_verify) { - uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - uint32_t write_id = 0, read_id = 0; + int write_fd, read_fd; + uint8_t *write_buf, *read_buf; long ret; bring_up_qpair(_metadata, self, 0x3); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, - &write_id, NULL)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, - &read_id, NULL)); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr, TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr, 
TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); - free(write_buf); - free(read_buf); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); +} + +/* ---------- buffer fd behaviour ---------- */ + +TEST_F(qdma, buf_create_zero_length_returns_einval) +{ + EXPECT_EQ(-EINVAL, qdma_buf_create(self->ctl_fd, 0, NULL, NULL)); +} + +TEST_F(qdma, buf_create_unaligned_length_returns_einval) +{ + /* Length must be a multiple of the page size. */ + EXPECT_EQ(-EINVAL, + qdma_buf_create(self->ctl_fd, TRANSFER_SIZE + 1, NULL, NULL)); +} + +TEST_F(qdma, buf_create_reports_granule_and_hint) +{ + uint32_t granule = 0; + uint32_t hint = 0; + int buf_fd; + + buf_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, &granule, &hint); + ASSERT_GE(buf_fd, 0); + EXPECT_EQ(4096u, granule); + EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, hint); + close(buf_fd); +} + +TEST_F(qdma, buf_create_via_qpair_fd) +{ + int buf_fd; + uint8_t *map; + long ret; + uint64_t dma_addr = get_dma_addr(); + + bring_up_qpair(_metadata, self, 0x3); + + /* Buffers can be created through the queue-pair fd too (SCM_RIGHTS use). 
*/ + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + + map = qdma_buf_map(buf_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, map); + fill_pattern(map, TRANSFER_SIZE); + + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr, TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + ASSERT_EQ(TRANSFER_SIZE, ret); + + munmap(map, TRANSFER_SIZE); + close(buf_fd); +} + +TEST_F(qdma, buf_fd_mapping_outlives_fd_close) +{ + int buf_fd; + uint8_t *map; + uint64_t dma_addr = get_dma_addr(); + long ret; + + bring_up_qpair(_metadata, self, 0x3); + + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + map = qdma_buf_map(buf_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, map); + + /* Closing the fd must not invalidate an existing mapping. */ + close(buf_fd); + + fill_pattern(map, TRANSFER_SIZE); + /* The mapping is still valid; the bytes are readable. */ + EXPECT_EQ(0u, map[0]); + (void)dma_addr; + (void)ret; + + munmap(map, TRANSFER_SIZE); } /* ---------- error paths ---------- */ @@ -288,65 +415,56 @@ TEST_F(qdma, qpair_get_fd_unknown_qid) TEST_F(qdma, io_read_on_h2c_only_returns_enodev) { - uint8_t *buf; - uint32_t buf_id = 0; + int buf_fd; long ret; bring_up_qpair(_metadata, self, 0x1); /* H2C only */ - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - free(buf); + close(buf_fd); } TEST_F(qdma, io_write_on_c2h_only_returns_enodev) { - uint8_t *buf; - uint32_t buf_id = 0; + int buf_fd; long ret; bring_up_qpair(_metadata, self, 0x2); /* C2H 
only */ - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - free(buf); + close(buf_fd); } TEST_F(qdma, io_zero_length_returns_einval) { - uint8_t *buf; - uint32_t buf_id = 0; + int buf_fd; long ret; bring_up_qpair(_metadata, self, 0x3); - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, 0, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - free(buf); + close(buf_fd); } TEST_F(qdma, io_mmap_unsupported) @@ -355,17 +473,17 @@ TEST_F(qdma, io_mmap_unsupported) bring_up_qpair(_metadata, self, 0x3); + /* The transfer (queue-pair) fd is not mappable — only buffer fds are. */ p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, self->io_fd, 0); EXPECT_EQ(MAP_FAILED, p); if (p != MAP_FAILED) munmap(p, 4096); } -TEST_F(qdma, io_ioctl_returns_enotty) +TEST_F(qdma, io_junk_ioctl_returns_enotty) { - /* The per-qpair anon_inode fd defines no ioctls; the handler - * returns -ENOTTY for any cmd. Exercising this path keeps the stub - * formally covered. */ + /* The per-qpair fd defines only BUF_CREATE / TRANSFER; any other cmd + * returns -ENOTTY. 
*/ unsigned int junk = _IO('v', 0xFE); bring_up_qpair(_metadata, self, 0x3); @@ -387,17 +505,11 @@ TEST_F(qdma, io_lseek_unsupported) TEST_F(qdma, io_read_write_unsupported) { - uint8_t *buf; - uint32_t buf_id = 0; + uint8_t buf[TRANSFER_SIZE]; long ret; bring_up_qpair(_metadata, self, 0x3); - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); - fill_pattern(buf, TRANSFER_SIZE); - ret = write(self->io_fd, buf, TRANSFER_SIZE); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); @@ -405,15 +517,12 @@ TEST_F(qdma, io_read_write_unsupported) ret = read(self->io_fd, buf, TRANSFER_SIZE); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); - - free(buf); } TEST_F(qdma, io_multiple_fds_same_qpair) { + int write_fd, read_fd, io_fd_b; uint8_t *write_buf, *read_buf; - uint32_t write_id = 0, read_id = 0; - int io_fd_b; long ret; bring_up_qpair(_metadata, self, 0x3); @@ -421,65 +530,61 @@ TEST_F(qdma, io_multiple_fds_same_qpair) io_fd_b = slash_qpair_get_fd(self->ctl_fd, self->qid, O_CLOEXEC); ASSERT_GE(io_fd_b, 0); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, - &write_id, NULL)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, - &read_id, NULL)); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, SLASH_TEST_HBM_BASE, + ret = 
qdma_buf_transfer(self->io_fd, write_fd, 0, SLASH_TEST_HBM_BASE, TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret); - ret = qdma_buf_transfer(io_fd_b, read_id, 0, SLASH_TEST_HBM_BASE, + ret = qdma_buf_transfer(io_fd_b, read_fd, 0, SLASH_TEST_HBM_BASE, TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); close(io_fd_b); - free(write_buf); - free(read_buf); } TEST_F(qdma, io_fd_outlives_qpair_del) { - uint8_t *buf; - uint32_t buf_id = 0; + int buf_fd; long ret; bring_up_qpair(_metadata, self, 0x3); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); + /* DEL the qpair while io_fd is still open. */ ASSERT_EQ(0, slash_qpair_op(self->ctl_fd, self->qid, SLASH_QDMA_QUEUE_OP_DEL)); self->qpair_added = 0; self->qpair_started = 0; - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, NULL)); - /* - * fd is still valid but the qpair's HW queues are gone. The spec - * (index.rst:613-616) does not name a specific errno, so we only - * assert the call fails — not which errno it returns. + * fd is still valid but the qpair's HW queues are gone. The spec does + * not name a specific errno, so we only assert the call fails. */ - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, SLASH_TEST_HBM_BASE, + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, SLASH_TEST_HBM_BASE, TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - free(buf); + close(buf_fd); /* close(io_fd) happens in fixture teardown — must not crash. 
*/ } @@ -488,41 +593,41 @@ TEST_F(qdma, io_fd_outlives_qpair_del) static void region_round_trip(struct __test_metadata *_metadata, FIXTURE_DATA(qdma) * self, uint64_t base) { + int write_fd, read_fd; uint8_t *write_buf, *read_buf; - uint32_t write_id = 0, read_id = 0; long ret; bring_up_qpair(_metadata, self, 0x3); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - read_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, read_buf); + write_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, write_buf); + read_buf = qdma_buf_map(read_fd, TRANSFER_SIZE); + ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, TRANSFER_SIZE); memset(read_buf, 0, TRANSFER_SIZE); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, - &write_id, NULL)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, TRANSFER_SIZE, - &read_id, NULL)); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, base, + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, base, TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); ASSERT_EQ(TRANSFER_SIZE, ret) TH_LOG("H2C transfer to 0x%llx failed: %s", (unsigned long long)base, strerror(errno)); - ret = qdma_buf_transfer(self->io_fd, read_id, 0, base, + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, base, TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); ASSERT_EQ(TRANSFER_SIZE, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, TRANSFER_SIZE)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); - free(write_buf); - free(read_buf); + munmap(write_buf, TRANSFER_SIZE); + munmap(read_buf, TRANSFER_SIZE); + close(write_fd); + close(read_fd); } TEST_F(qdma, transfer_hbm) @@ -711,314 +816,146 @@ TEST_F(qdma, qpair_get_fd_oversized_struct_zeros_tail) 
free(buf); } -TEST_F(qdma, reject_unaligned_4k_transfer) -{ - uint8_t *write_buf; - uint32_t buf_id = 0; - - bring_up_qpair(_metadata, self, 0x3); - - write_buf = aligned_alloc(4096, TRANSFER_SIZE * 2); - ASSERT_NE(NULL, write_buf); - fill_pattern(write_buf, TRANSFER_SIZE * 2); - - EXPECT_EQ(-EINVAL, - qdma_buf_register(self->ctl_fd, write_buf + 1, TRANSFER_SIZE, - &buf_id, NULL)); - - free(write_buf); -} - TEST_F(qdma, reject_partial_4k_transfer) { - uint8_t *write_buf; + int buf_fd; uint64_t dma_addr = get_dma_addr(); - uint32_t buf_id = 0; long ret; bring_up_qpair(_metadata, self, 0x3); - write_buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, write_buf); - fill_pattern(write_buf, TRANSFER_SIZE); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, TRANSFER_SIZE, - &buf_id, NULL)); - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, dma_addr, + /* A sub-page length is not a multiple of the buffer granule. */ + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, dma_addr, TRANSFER_SIZE / 2, SLASH_QDMA_XFER_H2C); ASSERT_EQ(-1, ret); ASSERT_EQ(EINVAL, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - free(write_buf); + close(buf_fd); } TEST_F(qdma, multipage_4k_write_read_verify) { const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages, one request */ + int write_fd, read_fd; uint8_t *write_buf, *read_buf; uint64_t dma_addr = get_dma_addr(); - uint32_t write_id = 0, read_id = 0; long ret; bring_up_qpair(_metadata, self, 0x3); - /* - * A multi-page base-page buffer is mapped as one SGL entry (one DMA - * descriptor) per 4 KiB page and submitted as a single libqdma request. - * The size is deliberately not a 2 MiB multiple, so this always takes the - * base-page path regardless of transparent-hugepage state; a sub-2-MiB - * anonymous mmap is always backed by 4 KiB base pages. 
- */ - write_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + write_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL); + ASSERT_GE(write_fd, 0); + read_fd = qdma_buf_create(self->ctl_fd, xfer_size, NULL, NULL); + ASSERT_GE(read_fd, 0); + + write_buf = qdma_buf_map(write_fd, xfer_size); ASSERT_NE(MAP_FAILED, write_buf); - read_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + read_buf = qdma_buf_map(read_fd, xfer_size); ASSERT_NE(MAP_FAILED, read_buf); fill_pattern(write_buf, xfer_size); memset(read_buf, 0, xfer_size); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, xfer_size, - &write_id, NULL)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, - &read_id, NULL)); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, xfer_size, + ret = qdma_buf_transfer(self->io_fd, write_fd, 0, dma_addr, xfer_size, SLASH_QDMA_XFER_H2C); ASSERT_EQ((ssize_t)xfer_size, ret); - ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, xfer_size, + ret = qdma_buf_transfer(self->io_fd, read_fd, 0, dma_addr, xfer_size, SLASH_QDMA_XFER_C2H); ASSERT_EQ((ssize_t)xfer_size, ret); EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); munmap(write_buf, xfer_size); munmap(read_buf, xfer_size); + close(write_fd); + close(read_fd); } -/* ---------- registered buffers ---------- */ - -/* Register a host buffer via the control fd; returns 0 or -errno. 
*/ -static int qdma_buf_register(int ctl_fd, void *addr, uint64_t length, - uint32_t *buf_id, uint32_t *transfer_hint) -{ - struct slash_qdma_buf_register req; - int ret; - - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.user_addr = (uint64_t)(uintptr_t)addr; - req.length = length; - - ret = ioctl(ctl_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req); - if (ret < 0) - return -errno; - - *buf_id = req.buf_id; - if (transfer_hint) - *transfer_hint = req.transfer_hint; - return 0; -} - -/* Unregister a buffer via the control fd; returns 0 or -errno. */ -static int qdma_buf_unregister(int ctl_fd, uint32_t buf_id) -{ - struct slash_qdma_buf_unregister req; - int ret; - - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.buf_id = buf_id; +/* ---------- transfer error paths ---------- */ - ret = ioctl(ctl_fd, SLASH_QDMA_IOCTL_BUF_UNREGISTER, &req); - return ret < 0 ? -errno : 0; -} - -/* Issue a registered-buffer transfer on a qpair fd; returns ioctl result. */ -static long qdma_buf_transfer(int io_fd, uint32_t buf_id, uint64_t buf_offset, - uint64_t dev_addr, uint64_t length, - uint32_t direction) +TEST_F(qdma, transfer_size_below_input_min_returns_einval) { struct slash_qdma_transfer req; - memset(&req, 0, sizeof(req)); - req.size = sizeof(req); - req.buf_id = buf_id; - req.buf_offset = buf_offset; - req.dev_addr = dev_addr; - req.length = length; - req.direction = direction; - - return ioctl(io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req); -} - -TEST_F(qdma, buf_register_size_below_input_min_returns_einval) -{ - struct slash_qdma_buf_register req; + bring_up_qpair(_metadata, self, 0x3); memset(&req, 0, sizeof(req)); req.size = sizeof(__u32); /* below the trailing input field */ - EXPECT_EQ(-1, ioctl(self->ctl_fd, SLASH_QDMA_IOCTL_BUF_REGISTER, &req)); + EXPECT_EQ(-1, ioctl(self->io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req)); EXPECT_EQ(EINVAL, errno); } -TEST_F(qdma, buf_register_zero_length_returns_einval) +TEST_F(qdma, 
transfer_invalid_buf_fd_returns_einval) { - uint8_t *buf; - uint32_t buf_id = 0; - - buf = aligned_alloc(4096, TRANSFER_SIZE); - ASSERT_NE(NULL, buf); - - EXPECT_EQ(-EINVAL, qdma_buf_register(self->ctl_fd, buf, 0, &buf_id, NULL)); - - free(buf); -} - -TEST_F(qdma, buf_register_unaligned_returns_einval) -{ - uint8_t *buf; - uint32_t buf_id = 0; - - buf = aligned_alloc(4096, TRANSFER_SIZE * 2); - ASSERT_NE(NULL, buf); - - /* Misaligned base address is rejected. */ - EXPECT_EQ(-EINVAL, - qdma_buf_register(self->ctl_fd, buf + 1, TRANSFER_SIZE, &buf_id, NULL)); - - free(buf); -} - -TEST_F(qdma, transfer_size_below_input_min_returns_einval) -{ - struct slash_qdma_transfer req; + long ret; bring_up_qpair(_metadata, self, 0x3); - memset(&req, 0, sizeof(req)); - req.size = sizeof(__u32); /* below the trailing input field */ - EXPECT_EQ(-1, ioctl(self->io_fd, SLASH_QDMA_QPAIR_IOCTL_TRANSFER, &req)); + /* The control fd is a valid fd but not a buffer fd. */ + ret = qdma_buf_transfer(self->io_fd, self->ctl_fd, 0, + get_dma_addr(), TRANSFER_SIZE, + SLASH_QDMA_XFER_H2C); + EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); } -TEST_F(qdma, transfer_unknown_buf_returns_enoent) +TEST_F(qdma, transfer_bad_fd_returns_ebadf) { long ret; bring_up_qpair(_metadata, self, 0x3); - ret = qdma_buf_transfer(self->io_fd, 0xDEAD, 0, + ret = qdma_buf_transfer(self->io_fd, -1, 0, get_dma_addr(), TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); - EXPECT_EQ(ENOENT, errno); + EXPECT_EQ(EBADF, errno); } TEST_F(qdma, transfer_wrong_direction_returns_enodev) { - uint8_t *buf; - uint32_t buf_id = 0; + int buf_fd; uint32_t transfer_hint = 0; long ret; bring_up_qpair(_metadata, self, 0x1); /* H2C only */ - buf = mmap(NULL, TRANSFER_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(MAP_FAILED, buf); - - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, - &transfer_hint)); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, &transfer_hint); 
+ ASSERT_GE(buf_fd, 0); EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint); /* C2H is not enabled on this qpair. */ - ret = qdma_buf_transfer(self->io_fd, buf_id, 0, + ret = qdma_buf_transfer(self->io_fd, buf_fd, 0, get_dma_addr(), TRANSFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(-1, ret); EXPECT_EQ(ENODEV, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - munmap(buf, TRANSFER_SIZE); + close(buf_fd); } TEST_F(qdma, transfer_out_of_range_returns_einval) { - uint8_t *buf; - uint32_t buf_id = 0; - uint32_t transfer_hint = 0; + int buf_fd; long ret; bring_up_qpair(_metadata, self, 0x3); - buf = mmap(NULL, TRANSFER_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(MAP_FAILED, buf); + buf_fd = qdma_buf_create(self->io_fd, TRANSFER_SIZE, NULL, NULL); + ASSERT_GE(buf_fd, 0); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, buf, TRANSFER_SIZE, &buf_id, - &transfer_hint)); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, transfer_hint); - - /* Slice extends past the registered length. */ - ret = qdma_buf_transfer(self->io_fd, buf_id, TRANSFER_SIZE, + /* Slice extends past the buffer length. 
*/ + ret = qdma_buf_transfer(self->io_fd, buf_fd, TRANSFER_SIZE, get_dma_addr(), TRANSFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(-1, ret); EXPECT_EQ(EINVAL, errno); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, buf_id)); - munmap(buf, TRANSFER_SIZE); -} - -TEST_F(qdma, registered_buffer_round_trip) -{ - const size_t xfer_size = TRANSFER_SIZE * 8; /* 8 base pages */ - uint8_t *write_buf, *read_buf; - uint32_t write_id = 0, read_id = 0; - uint32_t write_hint = 0, read_hint = 0; - uint64_t dma_addr = get_dma_addr(); - long ret; - - bring_up_qpair(_metadata, self, 0x3); - - write_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(MAP_FAILED, write_buf); - read_buf = mmap(NULL, xfer_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(MAP_FAILED, read_buf); - - fill_pattern(write_buf, xfer_size); - memset(read_buf, 0, xfer_size); - - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, write_buf, xfer_size, - &write_id, &write_hint)); - ASSERT_EQ(0, qdma_buf_register(self->ctl_fd, read_buf, xfer_size, - &read_id, &read_hint)); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, write_hint); - EXPECT_EQ(SLASH_QDMA_TRANSFER_HINT_V80, read_hint); - - ret = qdma_buf_transfer(self->io_fd, write_id, 0, dma_addr, xfer_size, - SLASH_QDMA_XFER_H2C); - ASSERT_EQ((long)xfer_size, ret); - - ret = qdma_buf_transfer(self->io_fd, read_id, 0, dma_addr, xfer_size, - SLASH_QDMA_XFER_C2H); - ASSERT_EQ((long)xfer_size, ret); - - EXPECT_EQ(0, memcmp(write_buf, read_buf, xfer_size)); - - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, write_id)); - EXPECT_EQ(0, qdma_buf_unregister(self->ctl_fd, read_id)); - - munmap(write_buf, xfer_size); - munmap(read_buf, xfer_size); + close(buf_fd); } TEST_HARNESS_MAIN diff --git a/smi/src/validate.cpp b/smi/src/validate.cpp index 67fb5c2b..60f28159 100644 --- a/smi/src/validate.cpp +++ b/smi/src/validate.cpp @@ -579,8 +579,7 @@ class RawTransferBuffer { : qdma_{qdma}, physAddr_{physAddr}, 
size_{size}, mmChannel_{mmChannel}, ringSizeIndex_{ringSizeIndex} { try { - createHostMapping(); - registerBuffer(); + createBuffer(); createQpair(); } catch (...) { cleanup(); @@ -632,40 +631,42 @@ class RawTransferBuffer { qid_ = other.qid_; qpairCreated_ = other.qpairCreated_; qpairStarted_ = other.qpairStarted_; + buf_ = other.buf_; data_ = other.data_; physAddr_ = other.physAddr_; size_ = other.size_; transferStepSize_ = other.transferStepSize_; mmChannel_ = other.mmChannel_; ringSizeIndex_ = other.ringSizeIndex_; - bufId_ = other.bufId_; - bufRegistered_ = other.bufRegistered_; other.qdma_ = nullptr; other.fd_ = -1; other.qid_ = 0; other.qpairCreated_ = false; other.qpairStarted_ = false; + other.buf_ = slash_qdma_buffer{}; other.data_ = nullptr; other.physAddr_ = 0; other.size_ = 0; other.transferStepSize_ = 0; other.ringSizeIndex_ = QDMA_RING_SZ_IDX; - other.bufId_ = 0; - other.bufRegistered_ = false; } - void createHostMapping() { - smi::raw::HostMapping mapping = smi::raw::createHostMapping(size_, physAddr_); - data_ = mapping.data; - transferStepSize_ = mapping.step; - } - - void registerBuffer() { - if (slash_qdma_buffer_register(qdma_, data_, size_, &bufId_, nullptr) != 0) { - throwSystemError("Failed to register raw transfer DMA buffer"); + void createBuffer() { + // The kernel owns the DMA buffer (pages + SGL + DMA map built once at + // create time); we mmap it for CPU access via buf_.addr. + if (slash_qdma_buffer_create(qdma_, size_, &buf_) != 0) { + throwSystemError("Failed to create raw transfer DMA buffer"); + } + data_ = buf_.addr; + transferStepSize_ = smi::raw::BASE_TRANSFER_STEP_SIZE; + + // Pre-fault the mapping so the page-fault cost stays out of the timed + // transfer loop. 
+ auto* touch = static_cast(data_); + for (uint64_t off = 0; off < size_; off += transferStepSize_) { + touch[off] = 0; } - bufRegistered_ = true; } void createQpair() { @@ -704,9 +705,19 @@ class RawTransferBuffer { } void transfer(uint64_t offset, uint64_t size, bool toDevice) { - const uint32_t dir = toDevice ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; - ssize_t n = slash_qdma_transfer(qdma_, fd_, bufId_, offset, - physAddr_ + offset, size, dir); + // Issue via the array transfer ioctl with a single sub-transfer on this + // buffer's queue pair (qpair_index 0). Channel parallelism for the + // bandwidth test comes from running many buffers concurrently, each + // pinned to a channel by mm_channel (see the channel-allocation knobs). + struct slash_qdma_subxfer xfer{}; + xfer.qpair_index = 0; + xfer.direction = toDevice ? SLASH_QDMA_XFER_H2C : SLASH_QDMA_XFER_C2H; + xfer.buf_fd = buf_.fd; + xfer.buf_offset = offset; + xfer.dev_addr = physAddr_ + offset; + xfer.length = size; + + ssize_t n = slash_qdma_qpair_transfer_batch(fd_, &xfer, 1); if (n < 0) { throwSystemError(toDevice ? 
"Raw QDMA write failed" : "Raw QDMA read failed"); @@ -721,22 +732,19 @@ class RawTransferBuffer { (void)close(fd_); fd_ = -1; } - if (qdma_ != nullptr && bufRegistered_) { - (void)slash_qdma_buffer_unregister(qdma_, bufId_); - bufRegistered_ = false; - } - if (qdma_ != nullptr && qpairStarted_) { + if (qpairStarted_) { (void)slash_qdma_qpair_stop(qdma_, qid_); qpairStarted_ = false; } - if (qdma_ != nullptr && qpairCreated_) { + if (qpairCreated_) { (void)slash_qdma_qpair_del(qdma_, qid_); qpairCreated_ = false; } - if (data_ != nullptr && data_ != MAP_FAILED) { - (void)munmap(data_, size_); - data_ = nullptr; + if (buf_.addr != nullptr) { + (void)slash_qdma_buffer_destroy(&buf_); + buf_ = slash_qdma_buffer{}; } + data_ = nullptr; } slash_qdma* qdma_ = nullptr; @@ -744,14 +752,13 @@ class RawTransferBuffer { uint32_t qid_ = 0; bool qpairCreated_ = false; bool qpairStarted_ = false; + slash_qdma_buffer buf_{}; void* data_ = nullptr; uint64_t physAddr_ = 0; uint64_t size_ = 0; uint64_t transferStepSize_ = 0; slash_qdma_mm_channel mmChannel_ = SLASH_QDMA_MM_CHANNEL_AUTO; uint32_t ringSizeIndex_ = QDMA_RING_SZ_IDX; - uint32_t bufId_ = 0; - bool bufRegistered_ = false; }; /// Fill @p buf with a deterministic pattern seeded by @p seed. 
diff --git a/vrt/src/qdma/qdma_intf.cpp b/vrt/src/qdma/qdma_intf.cpp index e2e2e17c..66454c60 100644 --- a/vrt/src/qdma/qdma_intf.cpp +++ b/vrt/src/qdma/qdma_intf.cpp @@ -20,6 +20,8 @@ #include +#include + #include #include @@ -56,6 +58,11 @@ QdmaIntf::~QdmaIntf() { } } +namespace { +constexpr uint64_t kQdmaPage = 4096ULL; +inline uint64_t roundUpToPage(uint64_t v) { return (v + kQdmaPage - 1) & ~(kQdmaPage - 1); } +} // namespace + ssize_t QdmaIntf::write_from_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) { if (qpairFd < 0) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, @@ -66,21 +73,26 @@ ssize_t QdmaIntf::write_from_buffer(const char* fname, char* buffer, uint64_t si return 0; } - uint32_t bufId = 0; - if (slash_qdma_qpair_buffer_register(qpairFd, buffer, size, &bufId, nullptr) != 0) { + // The kernel buffer owns its DMA-mapped pages; stage the caller's data into + // the mapping, then transfer whole pages. + const uint64_t aligned = roundUpToPage(size); + struct slash_qdma_buffer buf{}; + if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not register QDMA write buffer for {}", fname); + "Could not create QDMA write buffer for {}", fname); return -EIO; } + std::memcpy(buf.addr, buffer, size); - ssize_t rc = slash_qdma_qpair_transfer(qpairFd, bufId, 0, base, size, SLASH_QDMA_XFER_H2C); - (void)slash_qdma_qpair_buffer_unregister(qpairFd, bufId); - if (rc != (ssize_t)size) { + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned, + SLASH_QDMA_XFER_H2C); + (void)slash_qdma_buffer_destroy(&buf); + if (rc != (ssize_t)aligned) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not write to {}", fname); return -EIO; } - return rc; + return (ssize_t)size; } ssize_t QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, uint64_t base) { @@ -93,21 +105,26 @@ ssize_t 
QdmaIntf::read_to_buffer(const char* fname, char* buffer, uint64_t size, return 0; } - uint32_t bufId = 0; - if (slash_qdma_qpair_buffer_register(qpairFd, buffer, size, &bufId, nullptr) != 0) { + const uint64_t aligned = roundUpToPage(size); + struct slash_qdma_buffer buf{}; + if (slash_qdma_qpair_buffer_create(qpairFd, aligned, &buf) != 0) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, - "Could not register QDMA read buffer for {}", fname); + "Could not create QDMA read buffer for {}", fname); return -EIO; } - ssize_t rc = slash_qdma_qpair_transfer(qpairFd, bufId, 0, base, size, SLASH_QDMA_XFER_C2H); - (void)slash_qdma_qpair_buffer_unregister(qpairFd, bufId); - if (rc != (ssize_t)size) { + ssize_t rc = slash_qdma_qpair_transfer(qpairFd, buf.fd, 0, base, aligned, + SLASH_QDMA_XFER_C2H); + if (rc == (ssize_t)aligned) { + std::memcpy(buffer, buf.addr, size); + } + (void)slash_qdma_buffer_destroy(&buf); + if (rc != (ssize_t)aligned) { utils::Logger::log(utils::LogLevel::ERROR, __PRETTY_FUNCTION__, "Could not read from {}", fname); return -EIO; } - return rc; + return (ssize_t)size; } void QdmaIntf::write_buff(char* buffer, uint64_t start_addr, uint64_t size) { diff --git a/vrt/vrtd/include/vrtd/wire.h b/vrt/vrtd/include/vrtd/wire.h index f5295975..a543b815 100644 --- a/vrt/vrtd/include/vrtd/wire.h +++ b/vrt/vrtd/include/vrtd/wire.h @@ -309,13 +309,14 @@ struct vrtd_resp_buffer_open { uint64_t size; ///< Allocated size in bytes (rounded up to subregion). uint64_t phys_addr; ///< Device physical address of the allocation. /** - * Number of qpair FDs sent via SCM_RIGHTS (1 or 2). When two FDs are - * sent (an mm_channel == AUTO request), the ordering is fixed: FD[0] is - * pinned to AXI-MM channel 0 and FD[1] to channel 1, so the client can - * apply the V80 placement policy deterministically. A single FD is - * pinned to the explicitly requested channel. 
+ * Number of QDMA queue pairs (AXI-MM/NoC channels) owned by the single + * transfer FD sent via SCM_RIGHTS (1 or 2). When two qpairs are bound + * (an mm_channel == AUTO request), their qpair_index ordering is fixed: + * index 0 is pinned to channel 0 and index 1 to channel 1, so the client + * can apply the V80 placement policy deterministically. Exactly one FD is + * always sent regardless of this count. */ - uint32_t qpair_fd_count; + uint32_t qpair_count; } __attribute__((packed)); /** @@ -337,9 +338,9 @@ struct vrtd_resp_buffer_close { * Bypasses the allocator entirely — the caller is responsible for ensuring the * address is valid and not in use. Requires the @c raw-mem-access permission. * - * One or more qpair FDs are sent out-of-band via SCM_RIGHTS when - * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports the - * number of descriptors attached. + * A single transfer FD is sent out-of-band via SCM_RIGHTS when + * @ref vrtd_resp_header::ret == VRTD_RET_OK. The response body reports how + * many queue pairs (channels) that FD owns. */ struct vrtd_req_buffer_open_raw { uint32_t dev_number; ///< Device index (0-based). @@ -351,11 +352,11 @@ struct vrtd_req_buffer_open_raw { struct vrtd_resp_buffer_open_raw { /** - * Number of qpair FDs sent via SCM_RIGHTS (1 or 2). Same fd-to-channel - * ordering as @ref vrtd_resp_buffer_open: FD[0] -> channel 0, FD[1] -> - * channel 1 for an AUTO request; a single FD pins the requested channel. + * Number of QDMA queue pairs (channels) owned by the single transfer FD + * sent via SCM_RIGHTS (1 or 2). Same qpair_index-to-channel ordering as + * @ref vrtd_resp_buffer_open. 
*/ - uint32_t qpair_fd_count; + uint32_t qpair_count; } __attribute__((packed)); /** diff --git a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h index 3aaa47c0..0e6ce0ec 100644 --- a/vrt/vrtd/libvrtd/include/vrtd/vrtd.h +++ b/vrt/vrtd/libvrtd/include/vrtd/vrtd.h @@ -531,12 +531,16 @@ struct vrtd_buffer { uint64_t size; uint64_t phys_addr; - int qpair_fds[2]; - uint32_t qpair_fd_count; - uint32_t buf_id; + /* Single transfer fd that owns @qpair_count queue pairs (channels). */ + int qpair_fd; + /* Number of queue pairs (channels) the fd owns; selects 1- or 2-way split. */ + uint32_t qpair_count; + /* Kernel-owned DMA buffer fd backing @buf (from slash_qdma_qpair_buffer_create). */ + int buffer_fd; enum slash_qdma_transfer_hint transfer_hint; + /* CPU mapping of the kernel buffer (mmap of @buffer_fd). */ void *buf; - /* Internal DMA granule for the local host mapping (4 KiB base pages). */ + /* Internal DMA granule for the host mapping (4 KiB base pages). */ uint64_t transfer_step_size; }; @@ -548,8 +552,8 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t alloc_arg, uint64_t size, uint64_t phys_addr, - const int *qpair_fds, - uint32_t qpair_fd_count, + int qpair_fd, + uint32_t qpair_count, struct vrtd_buffer **buffer_out ); diff --git a/vrt/vrtd/libvrtd/src/buffer.c b/vrt/vrtd/libvrtd/src/buffer.c index e63ad1e3..87573074 100644 --- a/vrt/vrtd/libvrtd/src/buffer.c +++ b/vrt/vrtd/libvrtd/src/buffer.c @@ -44,6 +44,8 @@ #include +#include + #include "v80_policy.h" #include @@ -84,87 +86,20 @@ static inline uint64_t vrtd_now_ns(void) { } #endif -static void vrtd_prefault_mapping(void *addr, uint64_t size) { - volatile uint8_t *touch = (volatile uint8_t *) addr; - - for (uint64_t off = 0; off < size; off += BASE_TRANSFER_STEP_SIZE) { - touch[off] = 0; - } -} - -static int vrtd_mmap_regular_base_pages(uint64_t size, void **addr_out) { - void *addr; - - if (addr_out == NULL || size == 0) { - return -EINVAL; - } - - addr = mmap( - NULL, - size, - 
PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - -1, - 0 - ); - if (addr == MAP_FAILED) { - return -errno; - } - - if (madvise(addr, size, MADV_NOHUGEPAGE) != 0) { - int saved_errno = errno; - (void) munmap(addr, size); - return -saved_errno; - } - - vrtd_prefault_mapping(addr, size); - *addr_out = addr; - return 0; -} - /* - * Issue a single contiguous transfer of [buf_offset, buf_offset + size) on one - * qpair fd. The QDMA transfer ioctl operates on signed ssize_t lengths, so the - * range is chunked to stay within SSIZE_MAX while preserving step alignment. + * Issue a buffer transfer of [offset, offset + size) as a single batched ioctl + * per round, fanning the range across the fd's queue pairs (channels) according + * to the placement policy so both NoC channels run concurrently in-kernel. + * + * The QDMA transfer descriptor's length is a 32-bit byte count, so each + * segment is chunked to stay within that limit while preserving step alignment; + * every chunk round issues one ioctl covering all active channels. */ -static int vrtd_transfer_segment( - int qpair_fd, - uint32_t buf_id, - uint64_t buf_offset, - uint64_t phys_addr, - uint64_t size, - uint64_t step, - uint32_t direction -) { - uint64_t max_chunk = (uint64_t)SSIZE_MAX - ((uint64_t)SSIZE_MAX % step); - uint64_t done = 0; - - if (max_chunk == 0) { - return -EINVAL; - } - - while (done < size) { - uint64_t remaining = size - done; - uint64_t chunk = remaining > max_chunk ? 
max_chunk : remaining; - uint64_t xfer_offset = buf_offset + done; - uint64_t dev_offset = phys_addr + xfer_offset; - ssize_t ret = slash_qdma_qpair_transfer( - qpair_fd, buf_id, xfer_offset, dev_offset, chunk, direction); - - if (ret <= 0) { - return -EIO; - } - done += (uint64_t) ret; - } - - return 0; -} - static int vrtd_transfer_registered( - const int *qpair_fds, - uint32_t qpair_fd_count, + int qpair_fd, + uint32_t qpair_count, enum slash_qdma_transfer_hint transfer_hint, - uint32_t buf_id, + int buf_fd, uint64_t phys_addr, uint64_t offset, uint64_t size, @@ -177,7 +112,7 @@ static int vrtd_transfer_registered( return 0; } - if (qpair_fds == NULL || qpair_fd_count == 0 || qpair_fds[0] < 0) { + if (qpair_fd < 0 || qpair_count == 0) { return -EINVAL; } @@ -186,68 +121,101 @@ static int vrtd_transfer_registered( } /* - * Decide how the transfer maps onto the available queues. V80 applies the - * placement-aware policy (DDR halved, HBM routed by the half-memory + * Decide how the transfer maps onto the available queue pairs. V80 applies + * the placement-aware policy (DDR halved, HBM routed by the half-memory * boundary); any other hint keeps everything on the primary qpair. */ struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; uint32_t nseg; if (transfer_hint == SLASH_QDMA_TRANSFER_HINT_V80) { - nseg = vrtd_plan_v80(phys_addr, offset, size, step, qpair_fd_count, segs); + nseg = vrtd_plan_v80(phys_addr, offset, size, step, qpair_count, segs); } else { - segs[0].fd_index = 0; + segs[0].qpair_index = 0; segs[0].offset = offset; segs[0].size = size; nseg = 1; } + /* Clamp any planned qpair_index to the qpairs the fd actually owns. */ for (uint32_t i = 0; i < nseg; ++i) { - uint32_t fd_index = segs[i].fd_index; + if (segs[i].qpair_index >= qpair_count) { + segs[i].qpair_index = 0; + } + } + + /* Per-channel descriptor length is 32-bit; keep chunks step-aligned. 
*/ + uint64_t max_chunk = 0xFFFFF000ULL; + max_chunk -= max_chunk % step; + if (max_chunk == 0) { + return -EINVAL; + } - /* The plan only references fds[0]/fds[1]; fall back to the primary - * qpair if a planned fd is somehow unavailable. */ - if (fd_index >= qpair_fd_count || qpair_fds[fd_index] < 0) { - fd_index = 0; + uint64_t done[VRTD_V80_MAX_SEGS] = {0}; + for (;;) { + struct slash_qdma_subxfer xfers[VRTD_V80_MAX_SEGS]; + uint32_t map_seg[VRTD_V80_MAX_SEGS]; + uint32_t count = 0; + + for (uint32_t i = 0; i < nseg; ++i) { + uint64_t remaining = segs[i].size - done[i]; + uint64_t chunk; + uint64_t xfer_offset; + + if (remaining == 0) { + continue; + } + chunk = remaining > max_chunk ? max_chunk : remaining; + xfer_offset = segs[i].offset + done[i]; + + memset(&xfers[count], 0, sizeof(xfers[count])); + xfers[count].qpair_index = segs[i].qpair_index; + xfers[count].direction = direction; + xfers[count].buf_fd = buf_fd; + xfers[count].buf_offset = xfer_offset; + xfers[count].dev_addr = phys_addr + xfer_offset; + xfers[count].length = chunk; + map_seg[count] = i; + count++; } - int ret = vrtd_transfer_segment( - qpair_fds[fd_index], buf_id, segs[i].offset, - phys_addr, segs[i].size, step, direction); - if (ret != 0) { - return ret; + if (count == 0) { + break; + } + + ssize_t ret = slash_qdma_qpair_transfer_batch(qpair_fd, xfers, count); + if (ret < 0) { + return -EIO; + } + + for (uint32_t c = 0; c < count; ++c) { + done[map_seg[c]] += xfers[c].length; } } return 0; } -static int vrtd_transfer_temporary_mapping( +/* + * Transfer [0, size) of a separate kernel buffer (@bounce) against the device + * starting at @phys_addr. Used for partial-range read-modify-write staging. 
+ */ +static int vrtd_bounce_transfer( const struct vrtd_buffer *buffer, - void *mapping, + const struct slash_qdma_buffer *bounce, uint64_t phys_addr, uint64_t size, bool to_device ) { - uint32_t buf_id = 0; - enum slash_qdma_transfer_hint hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; - int ret; - - if (buffer == NULL || mapping == NULL || buffer->qpair_fd_count == 0) { + if (buffer == NULL || bounce == NULL || buffer->qpair_count == 0 || + buffer->qpair_fd < 0) { return -EINVAL; } - if (slash_qdma_qpair_buffer_register(buffer->qpair_fds[0], mapping, size, - &buf_id, &hint) != 0) { - return -EIO; - } - - ret = vrtd_transfer_registered(buffer->qpair_fds, buffer->qpair_fd_count, - hint, buf_id, phys_addr, 0, size, - BASE_TRANSFER_STEP_SIZE, to_device); - - (void)slash_qdma_qpair_buffer_unregister(buffer->qpair_fds[0], buf_id); - return ret; + return vrtd_transfer_registered(buffer->qpair_fd, buffer->qpair_count, + buffer->transfer_hint, bounce->fd, + phys_addr, 0, size, + BASE_TRANSFER_STEP_SIZE, to_device); } enum vrtd_ret vrtd_buffer_create_raw( @@ -258,8 +226,8 @@ enum vrtd_ret vrtd_buffer_create_raw( uint64_t alloc_arg, uint64_t size, uint64_t phys_addr, - const int *qpair_fds, - uint32_t qpair_fd_count, + int qpair_fd, + uint32_t qpair_count, struct vrtd_buffer **buffer_out ) { if (buffer_out == NULL) { @@ -271,35 +239,38 @@ enum vrtd_ret vrtd_buffer_create_raw( return VRTD_RET_INTERNAL_ERROR; } - buffer->buf = MAP_FAILED; + buffer->buf = NULL; buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; - buffer->qpair_fds[0] = -1; - buffer->qpair_fds[1] = -1; - buffer->qpair_fd_count = 0; - buffer->buf_id = 0; + buffer->qpair_fd = -1; + buffer->qpair_count = 0; + buffer->buffer_fd = -1; buffer->transfer_hint = SLASH_QDMA_TRANSFER_HINT_SINGLE_QPAIR; - if (qpair_fds == NULL || qpair_fd_count == 0 || qpair_fd_count > 2 || qpair_fds[0] < 0) { + if (qpair_fd < 0 || qpair_count == 0 || qpair_count > 2) { free(buffer); return VRTD_RET_BAD_LIB_CALL; } /* - * Host staging 
buffer is always 4 KiB base pages. Do not use MAP_POPULATE - * before MADV_NOHUGEPAGE: THP=always can fault compound pages before the - * advice takes effect, and the kernel QDMA base-page path intentionally - * rejects those pages (vrtd_mmap_regular_base_pages handles this). + * The kernel owns the DMA buffer: it allocates 4 KiB base pages, builds the + * SGL, and DMA-maps everything once at create time, then hands back a + * mappable fd. We mmap that fd for CPU access (buffer->buf). */ - int mmap_ret = vrtd_mmap_regular_base_pages(size, &buffer->buf); - if (mmap_ret != 0) { + struct slash_qdma_buffer sbuf; + memset(&sbuf, 0, sizeof(sbuf)); + if (slash_qdma_qpair_buffer_create(qpair_fd, size, &sbuf) != 0) { free(buffer); return VRTD_RET_INTERNAL_ERROR; } + + buffer->buf = sbuf.addr; + buffer->buffer_fd = sbuf.fd; + buffer->transfer_hint = sbuf.transfer_hint; buffer->transfer_step_size = BASE_TRANSFER_STEP_SIZE; #if SLASH_QDMA_TIMING syslog( LOG_INFO, - "libvrtd: buffer host mapping path=regular-4k size=%llu phys_addr=0x%llx step=%llu", + "libvrtd: buffer kernel mapping size=%llu phys_addr=0x%llx step=%llu", (unsigned long long)size, (unsigned long long)phys_addr, (unsigned long long)buffer->transfer_step_size @@ -313,18 +284,8 @@ enum vrtd_ret vrtd_buffer_create_raw( buffer->alloc_arg = alloc_arg; buffer->size = size; buffer->phys_addr = phys_addr; - buffer->qpair_fd_count = qpair_fd_count; - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - buffer->qpair_fds[i] = qpair_fds[i]; - } - - if (slash_qdma_qpair_buffer_register( - buffer->qpair_fds[0], buffer->buf, buffer->size, - &buffer->buf_id, &buffer->transfer_hint) != 0) { - (void) munmap(buffer->buf, buffer->size); - free(buffer); - return VRTD_RET_INTERNAL_ERROR; - } + buffer->qpair_fd = qpair_fd; + buffer->qpair_count = qpair_count; *buffer_out = buffer; @@ -393,22 +354,19 @@ enum vrtd_ret vrtd_buffer_destroy( return VRTD_RET_BAD_LIB_CALL; } - for (uint32_t i = 0; i < buffer->qpair_fd_count && i < 2; ++i) { - 
if (buffer->qpair_fds[i] >= 0) { - (void) slash_qdma_qpair_buffer_unregister(buffer->qpair_fds[i], buffer->buf_id); - break; - } + if (buffer->buf != NULL && buffer->size != 0) { + (void) munmap(buffer->buf, buffer->size); + buffer->buf = NULL; } - for (uint32_t i = 0; i < buffer->qpair_fd_count && i < 2; ++i) { - if (buffer->qpair_fds[i] >= 0) { - (void) close(buffer->qpair_fds[i]); - buffer->qpair_fds[i] = -1; - } + if (buffer->buffer_fd >= 0) { + (void) close(buffer->buffer_fd); + buffer->buffer_fd = -1; } - if (buffer->buf != NULL) { - (void) munmap(buffer->buf, buffer->size); + if (buffer->qpair_fd >= 0) { + (void) close(buffer->qpair_fd); + buffer->qpair_fd = -1; } free(buffer); @@ -462,8 +420,8 @@ enum vrtd_ret vrtd_buffer_sync_to_device( return VRTD_RET_INVALID_ARGUMENT; } - assert(buffer->qpair_fd_count > 0); - assert(buffer->qpair_fds[0] >= 0); + assert(buffer->qpair_count > 0); + assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); uint64_t aligned_offset = 0; uint64_t aligned_size = 0; @@ -484,26 +442,27 @@ enum vrtd_ret vrtd_buffer_sync_to_device( int transfer_ret; if (needs_bounce && buffer->alloc_dir == VRTD_ALLOC_DIR_BIDIRECTIONAL) { - void *bounce = NULL; - int mmap_ret = vrtd_mmap_regular_base_pages(aligned_size, &bounce); - if (mmap_ret != 0) { + struct slash_qdma_buffer bounce; + memset(&bounce, 0, sizeof(bounce)); + if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size, + &bounce) != 0) { return VRTD_RET_INTERNAL_ERROR; } - transfer_ret = vrtd_transfer_temporary_mapping( - buffer, bounce, buffer->phys_addr + aligned_offset, + transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, aligned_size, false); if (transfer_ret == 0) { memcpy( - (uint8_t *)bounce + (offset - aligned_offset), + (uint8_t *)bounce.addr + (offset - aligned_offset), (uint8_t *)buffer->buf + offset, size ); - transfer_ret = vrtd_transfer_temporary_mapping( - buffer, bounce, buffer->phys_addr + aligned_offset, + 
transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, aligned_size, true); } - (void) munmap(bounce, aligned_size); + (void) slash_qdma_buffer_destroy(&bounce); } else { /* * Host-to-device-only buffers cannot read the surrounding device @@ -511,8 +470,8 @@ enum vrtd_ret vrtd_buffer_sync_to_device( * expand partial syncs to the backing DMA granule. */ transfer_ret = vrtd_transfer_registered( - buffer->qpair_fds, buffer->qpair_fd_count, buffer->transfer_hint, - buffer->buf_id, buffer->phys_addr, + buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint, + buffer->buffer_fd, buffer->phys_addr, aligned_offset, aligned_size, step, true); } if (transfer_ret != 0) { @@ -549,8 +508,8 @@ enum vrtd_ret vrtd_buffer_sync_from_device( return VRTD_RET_INVALID_ARGUMENT; } - assert(buffer->qpair_fd_count > 0); - assert(buffer->qpair_fds[0] >= 0); + assert(buffer->qpair_count > 0); + assert(buffer->qpair_fd >= 0); assert(buffer->buf != NULL); uint64_t aligned_offset = 0; uint64_t aligned_size = 0; @@ -571,27 +530,28 @@ enum vrtd_ret vrtd_buffer_sync_from_device( int transfer_ret; if (needs_bounce) { - void *bounce = NULL; - int mmap_ret = vrtd_mmap_regular_base_pages(aligned_size, &bounce); - if (mmap_ret != 0) { + struct slash_qdma_buffer bounce; + memset(&bounce, 0, sizeof(bounce)); + if (slash_qdma_qpair_buffer_create(buffer->qpair_fd, aligned_size, + &bounce) != 0) { return VRTD_RET_INTERNAL_ERROR; } - transfer_ret = vrtd_transfer_temporary_mapping( - buffer, bounce, buffer->phys_addr + aligned_offset, + transfer_ret = vrtd_bounce_transfer( + buffer, &bounce, buffer->phys_addr + aligned_offset, aligned_size, false); if (transfer_ret == 0) { memcpy( (uint8_t *)buffer->buf + offset, - (uint8_t *)bounce + (offset - aligned_offset), + (uint8_t *)bounce.addr + (offset - aligned_offset), size ); } - (void) munmap(bounce, aligned_size); + (void) slash_qdma_buffer_destroy(&bounce); } else { transfer_ret = vrtd_transfer_registered( - 
buffer->qpair_fds, buffer->qpair_fd_count, buffer->transfer_hint, - buffer->buf_id, buffer->phys_addr, + buffer->qpair_fd, buffer->qpair_count, buffer->transfer_hint, + buffer->buffer_fd, buffer->phys_addr, aligned_offset, aligned_size, step, false); } if (transfer_ret != 0) { diff --git a/vrt/vrtd/libvrtd/src/requests.c b/vrt/vrtd/libvrtd/src/requests.c index 1cb43997..d56c2a47 100644 --- a/vrt/vrtd/libvrtd/src/requests.c +++ b/vrt/vrtd/libvrtd/src/requests.c @@ -572,23 +572,21 @@ enum vrtd_ret vrtd_buffer_open( }; struct vrtd_resp_buffer_open resp = {0}; - int qpair_fds[2] = {-1, -1}; - uint32_t qpair_fd_count = 0; + /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. */ + int qpair_fd = -1; + uint32_t fd_count = 0; int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN, &req, sizeof(req), &resp, sizeof(resp), - qpair_fds, 2, &qpair_fd_count, NULL); + &qpair_fd, 1, &fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd_count == 0 || qpair_fd_count > 2 || - resp.qpair_fd_count == 0 || resp.qpair_fd_count > qpair_fd_count || - qpair_fds[0] < 0) { - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - if (qpair_fds[i] >= 0) { - (void) close(qpair_fds[i]); - } + if (fd_count != 1 || qpair_fd < 0 || + resp.qpair_count == 0 || resp.qpair_count > 2) { + if (qpair_fd >= 0) { + (void) close(qpair_fd); } return VRTD_RET_INTERNAL_ERROR; } @@ -601,16 +599,12 @@ enum vrtd_ret vrtd_buffer_open( alloc_arg, resp.size, resp.phys_addr, - qpair_fds, - resp.qpair_fd_count, + qpair_fd, + resp.qpair_count, buffer_out ); if (ret != VRTD_RET_OK) { - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - if (qpair_fds[i] >= 0) { - (void) close(qpair_fds[i]); - } - } + (void) close(qpair_fd); return ret; } @@ -641,23 +635,21 @@ enum vrtd_ret vrtd_buffer_open_raw( }; struct vrtd_resp_buffer_open_raw resp = {0}; - int qpair_fds[2] = {-1, -1}; - uint32_t qpair_fd_count = 0; + /* The daemon sends a single transfer fd that owns resp.qpair_count qpairs. 
*/ + int qpair_fd = -1; + uint32_t fd_count = 0; int ret = vrtd_raw_request_fds(fd, VRTD_REQ_BUFFER_OPEN_RAW, &req, sizeof(req), &resp, sizeof(resp), - qpair_fds, 2, &qpair_fd_count, NULL); + &qpair_fd, 1, &fd_count, NULL); if (ret != VRTD_RET_OK) { return ret; } - if (qpair_fd_count == 0 || qpair_fd_count > 2 || - resp.qpair_fd_count == 0 || resp.qpair_fd_count > qpair_fd_count || - qpair_fds[0] < 0) { - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - if (qpair_fds[i] >= 0) { - (void) close(qpair_fds[i]); - } + if (fd_count != 1 || qpair_fd < 0 || + resp.qpair_count == 0 || resp.qpair_count > 2) { + if (qpair_fd >= 0) { + (void) close(qpair_fd); } return VRTD_RET_INTERNAL_ERROR; } @@ -670,16 +662,12 @@ enum vrtd_ret vrtd_buffer_open_raw( 0, /* alloc_arg: not used for raw buffers */ size, phys_addr, - qpair_fds, - resp.qpair_fd_count, + qpair_fd, + resp.qpair_count, buffer_out ); if (ret != VRTD_RET_OK) { - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - if (qpair_fds[i] >= 0) { - (void) close(qpair_fds[i]); - } - } + (void) close(qpair_fd); return ret; } diff --git a/vrt/vrtd/libvrtd/src/v80_policy.h b/vrt/vrtd/libvrtd/src/v80_policy.h index 8d70b85e..2205aacf 100644 --- a/vrt/vrtd/libvrtd/src/v80_policy.h +++ b/vrt/vrtd/libvrtd/src/v80_policy.h @@ -36,8 +36,8 @@ * - HBM at/above the half-boundary: channel 1 only. * - HBM spanning the boundary: split there (below -> ch0, above -> ch1). * - * The fd-to-channel mapping is the wire contract from vrtd: fds[0] is pinned to - * channel 0 and fds[1] to channel 1 (see vrtd_resp_buffer_open). + * The qpair-to-channel mapping is the wire contract from vrtd: qpair_index 0 is + * pinned to channel 0 and qpair_index 1 to channel 1 (see vrtd_resp_buffer_open). */ #ifndef VRTD_V80_POLICY_H @@ -60,39 +60,40 @@ /** @brief Maximum segments a transfer is split into (one per mm-channel). */ #define VRTD_V80_MAX_SEGS 2u -/** @brief One contiguous sub-transfer routed to a specific qpair fd. 
*/ +/** @brief One contiguous sub-transfer routed to a specific qpair. */ struct vrtd_xfer_seg { - uint32_t fd_index; /**< Index into the qpair_fds array. */ - uint64_t offset; /**< Buffer-relative byte offset. */ - uint64_t size; /**< Byte count. */ + uint32_t qpair_index; /**< Index into the fd's bound qpairs (== channel). */ + uint64_t offset; /**< Buffer-relative byte offset. */ + uint64_t size; /**< Byte count. */ }; /** * @brief Compute the V80 transfer plan for a buffer range. * * Plans the transfer of [@p offset, @p offset + @p size) within a buffer based - * at device address @p phys_addr across @p qpair_fd_count available queues - * (fds[0] == channel 0, fds[1] == channel 1). Split points are aligned down to - * @p step so every emitted segment stays page-aligned. With fewer than two - * queues (or a zero step) the whole range is assigned to fds[0]. + * at device address @p phys_addr across @p qpair_count available queue pairs + * (qpair_index 0 == channel 0, qpair_index 1 == channel 1). Split points are + * aligned down to @p step so every emitted segment stays page-aligned. With + * fewer than two queue pairs (or a zero step) the whole range is assigned to + * qpair_index 0. * - * @param phys_addr Device base address of the buffer. - * @param offset Buffer-relative start of the transfer. - * @param size Transfer length in bytes (assumed a multiple of step). - * @param step Transfer/page granule used to align split points. - * @param qpair_fd_count Number of available qpair fds (1 or 2). - * @param segs [out] Receives up to VRTD_V80_MAX_SEGS segments. + * @param phys_addr Device base address of the buffer. + * @param offset Buffer-relative start of the transfer. + * @param size Transfer length in bytes (assumed a multiple of step). + * @param step Transfer/page granule used to align split points. + * @param qpair_count Number of available queue pairs (1 or 2). + * @param segs [out] Receives up to VRTD_V80_MAX_SEGS segments. 
* @return Number of segments written to @p segs (1 or 2). */ static inline uint32_t vrtd_plan_v80(uint64_t phys_addr, uint64_t offset, uint64_t size, uint64_t step, - uint32_t qpair_fd_count, + uint32_t qpair_count, struct vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]) { - if (qpair_fd_count < 2u || step == 0u) { - segs[0].fd_index = 0u; + if (qpair_count < 2u || step == 0u) { + segs[0].qpair_index = 0u; segs[0].offset = offset; segs[0].size = size; return 1u; @@ -111,9 +112,9 @@ static inline uint32_t vrtd_plan_v80(uint64_t phys_addr, /* HBM: route by the 16 GiB half-memory boundary (NSU split). */ uint64_t boundary = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF; if (end <= boundary) { - lo_len = size; /* entirely in the lower half -> ch0 */ + lo_len = size; /* entirely in the lower half -> ch0 */ } else if (start >= boundary) { - segs[0].fd_index = 1u; /* entirely in the upper half -> ch1 */ + segs[0].qpair_index = 1u; /* entirely in the upper half -> ch1 */ segs[0].offset = offset; segs[0].size = size; return 1u; @@ -125,16 +126,16 @@ static inline uint32_t vrtd_plan_v80(uint64_t phys_addr, lo_len -= lo_len % step; /* keep both segments page-aligned */ if (lo_len == 0u || lo_len >= size) { - segs[0].fd_index = 0u; + segs[0].qpair_index = 0u; segs[0].offset = offset; segs[0].size = size; return 1u; } - segs[0].fd_index = 0u; + segs[0].qpair_index = 0u; segs[0].offset = offset; segs[0].size = lo_len; - segs[1].fd_index = 1u; + segs[1].qpair_index = 1u; segs[1].offset = offset + lo_len; segs[1].size = size - lo_len; return 2u; diff --git a/vrt/vrtd/libvrtdpp/src/buffer.cpp b/vrt/vrtd/libvrtdpp/src/buffer.cpp index 9242c6b2..756170a6 100644 --- a/vrt/vrtd/libvrtdpp/src/buffer.cpp +++ b/vrt/vrtd/libvrtdpp/src/buffer.cpp @@ -123,7 +123,7 @@ void *Buffer::data() noexcept int Buffer::getFd() const noexcept { - return buffer ? buffer->qpair_fds[0] : -1; + return buffer ? 
buffer->qpair_fd : -1; } int Buffer::releaseFd() noexcept @@ -131,8 +131,8 @@ int Buffer::releaseFd() noexcept if (buffer == nullptr) { return -1; } - int ret = buffer->qpair_fds[0]; - buffer->qpair_fds[0] = -1; + int ret = buffer->qpair_fd; + buffer->qpair_fd = -1; return ret; } diff --git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c index bf2c5d13..aa86cfab 100644 --- a/vrt/vrtd/src/buffer.c +++ b/vrt/vrtd/src/buffer.c @@ -134,7 +134,7 @@ static int buffer_init(struct buffer *buf, .size = 0, .qpair_count = 0, .qids = {0}, - .fds = {-1, -1}, + .fd = -1, .allocation_valid = false, .qpair_created = false, }; @@ -239,13 +239,17 @@ static int buffer_init(struct buffer *buf, LOG(LOG_ERR, "Failed to start buffer qpair %u: %m", qpair.qid); goto fail; } + } - int fd = slash_qdma_qpair_get_fd(qdma, qpair.qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "Failed to get fd for buffer qpair %u: %m", qpair.qid); - goto fail; - } - buf->fds[i] = fd; + /* Step 5: bind every started qpair into a single transfer fd so one + * transfer ioctl can fan across both channels. The qids array index + * becomes the qpair_index used by the client's sub-transfers. 
*/ + buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count, + O_CLOEXEC); + if (buf->fd < 0) { + LOG(LOG_ERR, "Failed to get combined fd for %u buffer qpairs: %m", + (unsigned int)buf->qpair_count); + goto fail; } LOG(LOG_DEBUG, "Buffer initialized addr=0x%llx size=%llu qpairs=%u", @@ -346,7 +350,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, .size = size, .qpair_count = 0, .qids = {0}, - .fds = {-1, -1}, + .fd = -1, .allocation_valid = false, /* no allocator reservation to free */ .qpair_created = false, }; @@ -380,14 +384,16 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, cleanup_buffer(buf); return NULL; } + } - int fd = slash_qdma_qpair_get_fd(qdma, qpair.qid, O_CLOEXEC); - if (fd < 0) { - LOG(LOG_ERR, "buffer_create_raw: failed to get fd for qpair %u: %m", qpair.qid); - cleanup_buffer(buf); - return NULL; - } - buf->fds[i] = fd; + /* Bind every started qpair into a single transfer fd. */ + buf->fd = slash_qdma_qpair_get_fd_multi(qdma, buf->qids, buf->qpair_count, + O_CLOEXEC); + if (buf->fd < 0) { + LOG(LOG_ERR, "buffer_create_raw: failed to get combined fd for %u qpairs: %m", + (unsigned int)buf->qpair_count); + cleanup_buffer(buf); + return NULL; } LOG(LOG_DEBUG, "Raw buffer created phys_addr=0x%llx size=%llu qpairs=%u", @@ -419,12 +425,10 @@ void cleanup_buffer(struct buffer *buf) (unsigned long long)buf->addr, (unsigned long long)buf->size, (unsigned int)buf->qpair_count); - /* Close the QDMA queue fds first, before stopping the queues. */ - for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { - if (buf->fds[i] >= 0) { - (void) close(buf->fds[i]); - buf->fds[i] = -1; - } + /* Close the combined QDMA transfer fd first, before stopping the queues. */ + if (buf->fd >= 0) { + (void) close(buf->fd); + buf->fd = -1; } /* Stop and delete the QDMA queue pairs. 
Errors are logged but @@ -473,9 +477,7 @@ void cleanup_buffer(struct buffer *buf) buf->size = 0; buf->qpair_count = 0; memset(buf->qids, 0, sizeof(buf->qids)); - for (uint32_t i = 0; i < VRTD_BUFFER_MAX_QPAIR_FDS; ++i) { - buf->fds[i] = -1; - } + buf->fd = -1; free(buf); } diff --git a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h index 4f59759f..e5abba79 100644 --- a/vrt/vrtd/src/buffer.h +++ b/vrt/vrtd/src/buffer.h @@ -74,13 +74,14 @@ struct buffer { uint64_t addr; /** @brief Size of the allocated memory region in bytes (rounded up to subregion granularity). */ uint64_t size; - /** @brief Number of QDMA queue pairs created for this buffer. */ + /** @brief Number of QDMA queue pairs created for this buffer (1 or 2). */ uint32_t qpair_count; /** @brief QDMA queue IDs assigned to this buffer's queue pairs. */ uint32_t qids[VRTD_BUFFER_MAX_QPAIR_FDS]; - /** @brief File descriptors for the QDMA queue pairs. - * Passed to the client via SCM_RIGHTS for direct data transfer. */ - int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; + /** @brief Single transfer fd that owns all @qpair_count queue pairs. + * Passed to the client via SCM_RIGHTS for direct data transfer; the client + * selects a channel per sub-transfer by qpair_index. -1 when not created. */ + int fd; /** @brief True if the address-space allocation in the memory map is valid and must be freed. */ bool allocation_valid; /** @brief True if the QDMA queue pair has been created and must be torn down on cleanup. 
*/ diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c index 5839efe7..d4ea1eb2 100644 --- a/vrt/vrtd/src/serve.c +++ b/vrt/vrtd/src/serve.c @@ -2025,18 +2025,15 @@ static uint16_t client_handle_request_buffer_open( } if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || - buf->fds[0] < 0) { + buf->fd < 0) { LOG(LOG_ERR, "Buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } uint64_t real_size = buf->size; uint64_t phys_addr = buf->addr; - uint32_t qpair_fd_count = buf->qpair_count; - int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - fds[i] = buf->fds[i]; - } + uint32_t qpair_count = buf->qpair_count; + int fd = buf->fd; /* * Transfer ownership of the buffer into the device's buffer list. @@ -2050,12 +2047,12 @@ static uint16_t client_handle_request_buffer_open( resp_body->size = real_size; resp_body->phys_addr = phys_addr; - resp_body->qpair_fd_count = qpair_fd_count; - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - client->out_fds[i] = fds[i]; - } - client->out_fd_count = qpair_fd_count; - *out_fd = fds[0]; + resp_body->qpair_count = qpair_count; + /* A single transfer fd owns all qpairs; the client selects channels by + * qpair_index per sub-transfer. 
*/ + client->out_fds[0] = fd; + client->out_fd_count = 1; + *out_fd = fd; *have_out_fd = true; *resp_size = sizeof(*resp_body); @@ -2159,28 +2156,23 @@ static uint16_t client_handle_request_buffer_open_raw( } if (buf->qpair_count == 0 || buf->qpair_count > VRTD_BUFFER_MAX_QPAIR_FDS || - buf->fds[0] < 0) { + buf->fd < 0) { LOG(LOG_ERR, "Raw buffer created without valid qpair fd"); return VRTD_RET_INTERNAL_ERROR; } - uint32_t qpair_fd_count = buf->qpair_count; - int fds[VRTD_BUFFER_MAX_QPAIR_FDS]; - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - fds[i] = buf->fds[i]; - } + uint32_t qpair_count = buf->qpair_count; + int fd = buf->fd; if (buffer_ptr_array_push_move(&d->buffers, &buf) != 0) { LOG(LOG_ERR, "Failed to add raw buffer to device buffer list"); return VRTD_RET_INTERNAL_ERROR; } - resp_body->qpair_fd_count = qpair_fd_count; - for (uint32_t i = 0; i < qpair_fd_count; ++i) { - client->out_fds[i] = fds[i]; - } - client->out_fd_count = qpair_fd_count; - *out_fd = fds[0]; + resp_body->qpair_count = qpair_count; + client->out_fds[0] = fd; + client->out_fd_count = 1; + *out_fd = fd; *have_out_fd = true; *resp_size = sizeof(*resp_body); diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp index e2fbfd59..39651c07 100644 --- a/vrt/vrtd/tests/buffer_test.cpp +++ b/vrt/vrtd/tests/buffer_test.cpp @@ -38,26 +38,23 @@ static constexpr uint64_t CLIENT_ID = 42; static void qpair_fd_round_trip(int fd, uint64_t addr, const uint8_t *src, uint8_t *dst) { - uint8_t write_buf[XFER_SIZE]; - uint8_t read_buf[XFER_SIZE]{}; - std::memcpy(write_buf, src, XFER_SIZE); - - uint32_t write_id = 0; - uint32_t read_id = 0; - ASSERT_EQ(slash_qdma_qpair_buffer_register(fd, write_buf, XFER_SIZE, &write_id, nullptr), 0); - ASSERT_EQ(slash_qdma_qpair_buffer_register(fd, read_buf, XFER_SIZE, &read_id, nullptr), 0); + struct slash_qdma_buffer write_buf{}; + struct slash_qdma_buffer read_buf{}; + ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &write_buf), 0); + 
ASSERT_EQ(slash_qdma_qpair_buffer_create(fd, XFER_SIZE, &read_buf), 0); + std::memcpy(write_buf.addr, src, XFER_SIZE); ssize_t written = slash_qdma_qpair_transfer( - fd, write_id, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_H2C); + fd, write_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_H2C); EXPECT_EQ(written, static_cast(XFER_SIZE)); ssize_t read_bytes = slash_qdma_qpair_transfer( - fd, read_id, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_C2H); + fd, read_buf.fd, 0, addr, XFER_SIZE, SLASH_QDMA_XFER_C2H); EXPECT_EQ(read_bytes, static_cast(XFER_SIZE)); - std::memcpy(dst, read_buf, XFER_SIZE); + std::memcpy(dst, read_buf.addr, XFER_SIZE); - EXPECT_EQ(slash_qdma_qpair_buffer_unregister(fd, write_id), 0); - EXPECT_EQ(slash_qdma_qpair_buffer_unregister(fd, read_id), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&write_buf), 0); + EXPECT_EQ(slash_qdma_buffer_destroy(&read_buf), 0); } // ─── Null / argument validation (no hardware needed, always run) ────────────── @@ -181,14 +178,14 @@ TEST_P(BufferTest, LifecycleBidirectional) { XFER_SIZE, 0, CLIENT_ID, SLASH_QDMA_MM_CHANNEL_AUTO, nullptr); ASSERT_NE(buf, nullptr); ASSERT_GE(buf->qpair_count, 1u); - EXPECT_GE(buf->fds[0], 0); + EXPECT_GE(buf->fd, 0); uint8_t src[XFER_SIZE]; for (size_t i = 0; i < XFER_SIZE; ++i) src[i] = static_cast(i & 0xFF); uint8_t dst[XFER_SIZE]{}; - qpair_fd_round_trip(buf->fds[0], buf->addr, src, dst); + qpair_fd_round_trip(buf->fd, buf->addr, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); @@ -199,14 +196,14 @@ TEST_P(BufferTest, RawCreateAndIO) { VRTD_ALLOC_DIR_BIDIRECTIONAL, SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); ASSERT_GE(buf->qpair_count, 1u); - EXPECT_GE(buf->fds[0], 0); + EXPECT_GE(buf->fd, 0); EXPECT_EQ(buf->addr, DDR_START_ADDRESS); EXPECT_FALSE(buf->allocation_valid); uint8_t src[XFER_SIZE]; std::memset(src, 0xCD, sizeof(src)); uint8_t dst[XFER_SIZE]{}; - qpair_fd_round_trip(buf->fds[0], DDR_START_ADDRESS, src, dst); + qpair_fd_round_trip(buf->fd, 
DDR_START_ADDRESS, src, dst); EXPECT_EQ(std::memcmp(src, dst, XFER_SIZE), 0); cleanup_buffer(buf); diff --git a/vrt/vrtd/tests/v80_policy_test.cpp b/vrt/vrtd/tests/v80_policy_test.cpp index f9050a83..068a724b 100644 --- a/vrt/vrtd/tests/v80_policy_test.cpp +++ b/vrt/vrtd/tests/v80_policy_test.cpp @@ -35,7 +35,7 @@ TEST(V80Plan, SingleQueueIsWhole) { vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 1, segs); ASSERT_EQ(n, 1u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].offset, 0u); EXPECT_EQ(segs[0].size, 512 * MiB); } @@ -45,10 +45,10 @@ TEST(V80Plan, DdrSplitsInHalf) { vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, 512 * MiB, STEP, 2, segs); ASSERT_EQ(n, 2u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].offset, 0u); EXPECT_EQ(segs[0].size, 256 * MiB); - EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].qpair_index, 1u); EXPECT_EQ(segs[1].offset, 256 * MiB); EXPECT_EQ(segs[1].size, 256 * MiB); } @@ -58,7 +58,7 @@ TEST(V80Plan, DdrTinyTransferStaysOnPrimary) { vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; uint32_t n = vrtd_plan_v80(VRTD_V80_DDR_BASE, 0, STEP, STEP, 2, segs); ASSERT_EQ(n, 1u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].size, STEP); } @@ -67,7 +67,7 @@ TEST(V80Plan, HbmLowerHalfChannel0) { vrtd_xfer_seg segs[VRTD_V80_MAX_SEGS]; uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, 0, 512 * MiB, STEP, 2, segs); ASSERT_EQ(n, 1u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].offset, 0u); EXPECT_EQ(segs[0].size, 512 * MiB); } @@ -78,7 +78,7 @@ TEST(V80Plan, HbmUpperHalfChannel1) { uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF + 4 * GiB; uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); ASSERT_EQ(n, 1u); - EXPECT_EQ(segs[0].fd_index, 1u); + 
EXPECT_EQ(segs[0].qpair_index, 1u); EXPECT_EQ(segs[0].offset, 0u); EXPECT_EQ(segs[0].size, 512 * MiB); } @@ -89,7 +89,7 @@ TEST(V80Plan, HbmOnBoundaryIsUpperHalf) { uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF; uint32_t n = vrtd_plan_v80(base, 0, 256 * MiB, STEP, 2, segs); ASSERT_EQ(n, 1u); - EXPECT_EQ(segs[0].fd_index, 1u); + EXPECT_EQ(segs[0].qpair_index, 1u); } // An HBM range straddling the boundary splits exactly at it. @@ -98,10 +98,10 @@ TEST(V80Plan, HbmSpanningSplitsAtBoundary) { uint64_t base = VRTD_V80_HBM_BASE + VRTD_V80_HBM_HALF - 256 * MiB; uint32_t n = vrtd_plan_v80(base, 0, 512 * MiB, STEP, 2, segs); ASSERT_EQ(n, 2u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].offset, 0u); EXPECT_EQ(segs[0].size, 256 * MiB); - EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].qpair_index, 1u); EXPECT_EQ(segs[1].offset, 256 * MiB); EXPECT_EQ(segs[1].size, 256 * MiB); } @@ -113,10 +113,10 @@ TEST(V80Plan, HbmSpanningWithOffset) { uint64_t offset = VRTD_V80_HBM_HALF - STEP; // crosses boundary STEP into the range uint32_t n = vrtd_plan_v80(VRTD_V80_HBM_BASE, offset, 2 * STEP, STEP, 2, segs); ASSERT_EQ(n, 2u); - EXPECT_EQ(segs[0].fd_index, 0u); + EXPECT_EQ(segs[0].qpair_index, 0u); EXPECT_EQ(segs[0].offset, offset); EXPECT_EQ(segs[0].size, STEP); - EXPECT_EQ(segs[1].fd_index, 1u); + EXPECT_EQ(segs[1].qpair_index, 1u); EXPECT_EQ(segs[1].offset, offset + STEP); EXPECT_EQ(segs[1].size, STEP); } From 6599ce7f7d323db9a01426404009f8b17ddd9635 Mon Sep 17 00:00:00 2001 From: Vlad-Gabriel Serbu Date: Wed, 17 Jun 2026 15:55:56 +0100 Subject: [PATCH 23/23] vrtd: track client ownership on raw buffers and close per-owner Signed-off-by: Vlad-Gabriel Serbu --- vrt/vrtd/src/buffer.c | 3 ++- vrt/vrtd/src/buffer.h | 3 +++ vrt/vrtd/src/serve.c | 44 ++++++++++++++++++++++++---------- vrt/vrtd/tests/buffer_test.cpp | 16 +++++++++---- vrt/vrtd/tests/device_test.cpp | 3 ++- 5 files changed, 50 insertions(+), 19 deletions(-) diff 
--git a/vrt/vrtd/src/buffer.c b/vrt/vrtd/src/buffer.c index aa86cfab..a5194fc7 100644 --- a/vrt/vrtd/src/buffer.c +++ b/vrt/vrtd/src/buffer.c @@ -309,6 +309,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, enum vrtd_alloc_dir alloc_dir, + uint64_t client_id, uint32_t mm_channel) { if (qdma == NULL || size == 0) { @@ -345,7 +346,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, .alloc_type = 0, .alloc_arg = 0, .alloc_dir = alloc_dir, - .client_id = 0, + .client_id = client_id, .addr = phys_addr, .size = size, .qpair_count = 0, diff --git a/vrt/vrtd/src/buffer.h b/vrt/vrtd/src/buffer.h index e5abba79..167bf98a 100644 --- a/vrt/vrtd/src/buffer.h +++ b/vrt/vrtd/src/buffer.h @@ -127,6 +127,8 @@ struct buffer *buffer_create(struct slash_qdma *qdma, * @param phys_addr Caller-specified device physical address. * @param size Size in bytes. * @param alloc_dir DMA transfer direction. + * @param client_id Connection ID of the owning client (for ownership checks + * and automatic cleanup on disconnect; must be non-zero). * @param mm_channel AXI-MM/NoC channel selection (enum slash_qdma_mm_channel). * @return Heap-allocated buffer on success, NULL on failure (errno set). 
*/ @@ -134,6 +136,7 @@ struct buffer *buffer_create_raw(struct slash_qdma *qdma, uint64_t phys_addr, uint64_t size, enum vrtd_alloc_dir alloc_dir, + uint64_t client_id, uint32_t mm_channel); /** diff --git a/vrt/vrtd/src/serve.c b/vrt/vrtd/src/serve.c index d4ea1eb2..4559f82a 100644 --- a/vrt/vrtd/src/serve.c +++ b/vrt/vrtd/src/serve.c @@ -2138,12 +2138,19 @@ static uint16_t client_handle_request_buffer_open_raw( return VRTD_RET_NOEXIST; } + uint64_t client_id = client->conn_id; + if (client_id == 0) { + LOG(LOG_ERR, "Invalid client connection id"); + return VRTD_RET_INTERNAL_ERROR; + } + _cleanup_(cleanup_bufferp) struct buffer *buf = buffer_create_raw( d->qdma, req_body->phys_addr, req_body->size, (enum vrtd_alloc_dir) req_body->alloc_dir, + client_id, req_body->mm_channel ); if (buf == NULL) { @@ -2244,8 +2251,15 @@ static uint16_t client_handle_request_buffer_close( return VRTD_RET_NOEXIST; } - /* Search for the buffer by physical address. */ + /* + * Search for the caller's buffer by physical address. Raw buffers bypass + * the allocator and use caller-specified addresses, so distinct clients can + * hold buffers at the same address; scan all matches and pick the one owned + * by this connection rather than rejecting on the first address match. + */ struct buffer *found = NULL; + bool addr_size_match_foreign = false; /* same addr+size, owned by another conn */ + bool addr_match_size_mismatch = false; /* same addr, different size */ for (size_t i = 0; i < d->buffers.len; ++i) { struct buffer *buf = d->buffers.d[i]; if (buf == NULL) { @@ -2254,15 +2268,20 @@ static uint16_t client_handle_request_buffer_close( if (buf->addr != req_body->phys_addr) { continue; } - /* Found a buffer at the right address -- verify size. 
*/ if (buf->size != req_body->size) { - LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (expected %llu, got %llu)", - (unsigned long long)req_body->phys_addr, - (unsigned long long)buf->size, (unsigned long long)req_body->size); - return VRTD_RET_INVALID_ARGUMENT; + addr_match_size_mismatch = true; + continue; } - /* Verify ownership: only the client that opened the buffer may close it. */ if (buf->client_id != client->conn_id) { + addr_size_match_foreign = true; + continue; + } + found = buf; + break; + } + + if (found == NULL) { + if (addr_size_match_foreign) { char pwbuf[1024]; LOG( LOG_WARNING, @@ -2272,11 +2291,12 @@ static uint16_t client_handle_request_buffer_close( ); return VRTD_RET_AUTH_ERROR; } - found = buf; - break; - } - - if (found == NULL) { + if (addr_match_size_mismatch) { + LOG(LOG_WARNING, "buffer_close: size mismatch at addr=0x%llx (got %llu)", + (unsigned long long)req_body->phys_addr, + (unsigned long long)req_body->size); + return VRTD_RET_INVALID_ARGUMENT; + } LOG(LOG_NOTICE, "buffer_close: no buffer at addr=0x%llx on device %u", (unsigned long long)req_body->phys_addr, (unsigned int)req_body->dev_number); return VRTD_RET_NOEXIST; diff --git a/vrt/vrtd/tests/buffer_test.cpp b/vrt/vrtd/tests/buffer_test.cpp index 39651c07..1038678b 100644 --- a/vrt/vrtd/tests/buffer_test.cpp +++ b/vrt/vrtd/tests/buffer_test.cpp @@ -124,7 +124,8 @@ TEST(BufferNullTest, CleanupNull) { TEST(BufferNullTest, RawNullQdma) { struct buffer *buf = buffer_create_raw(nullptr, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); } @@ -133,7 +134,8 @@ TEST(BufferNullTest, RawZeroSize) { struct slash_qdma *qdma = slash_qdma_open("@mock"); ASSERT_NE(qdma, nullptr); struct buffer *buf = buffer_create_raw(qdma, DDR_START_ADDRESS, 0, - VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); + 
VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(buf, nullptr); EXPECT_EQ(errno, EINVAL); slash_qdma_close(qdma); @@ -193,12 +195,14 @@ TEST_P(BufferTest, LifecycleBidirectional) { TEST_P(BufferTest, RawCreateAndIO) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS, XFER_SIZE, - VRTD_ALLOC_DIR_BIDIRECTIONAL, SLASH_QDMA_MM_CHANNEL_AUTO); + VRTD_ALLOC_DIR_BIDIRECTIONAL, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); ASSERT_GE(buf->qpair_count, 1u); EXPECT_GE(buf->fd, 0); EXPECT_EQ(buf->addr, DDR_START_ADDRESS); EXPECT_FALSE(buf->allocation_valid); + EXPECT_EQ(buf->client_id, CLIENT_ID); uint8_t src[XFER_SIZE]; std::memset(src, 0xCD, sizeof(src)); @@ -223,14 +227,16 @@ TEST_P(BufferTest, QueueExhaustion) { for (int i = 0; i < MAX_BUFFERS; ++i) { struct buffer *buf = buffer_create_raw(qdma_, DDR_START_ADDRESS + i * XFER_SIZE, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr) << "Expected success for buffer " << i; bufs.push_back(buf); } /* 33rd allocation needs queues 65/66 and must fail. */ struct buffer *overflow = buffer_create_raw(qdma_, DDR_START_ADDRESS, - XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); + XFER_SIZE, VRTD_ALLOC_DIR_HOST_TO_DEVICE, CLIENT_ID, + SLASH_QDMA_MM_CHANNEL_AUTO); EXPECT_EQ(overflow, nullptr); EXPECT_EQ(errno, ENOSPC); diff --git a/vrt/vrtd/tests/device_test.cpp b/vrt/vrtd/tests/device_test.cpp index 93dda772..c66fd9f6 100644 --- a/vrt/vrtd/tests/device_test.cpp +++ b/vrt/vrtd/tests/device_test.cpp @@ -149,7 +149,8 @@ TEST(DeviceCleanupTest, CleanupWithBuffers) { /* Allocate a raw buffer on the mock QDMA and hand ownership to d->buffers. 
*/ struct buffer *buf = buffer_create_raw(d->qdma, DDR_START_ADDRESS, 4096, - VRTD_ALLOC_DIR_HOST_TO_DEVICE, SLASH_QDMA_MM_CHANNEL_AUTO); + VRTD_ALLOC_DIR_HOST_TO_DEVICE, /*client_id=*/1, + SLASH_QDMA_MM_CHANNEL_AUTO); ASSERT_NE(buf, nullptr); int ret = buffer_ptr_array_push_move(&d->buffers, &buf);