diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index fb3966ce6..7f1c8192d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }
 
-# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
-# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
-glm5.1-fp4-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/GLM-5.1-MXFP4
@@ -744,7 +705,7 @@ glm5.1-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
 kimik2.5-int4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi355x
@@ -763,7 +724,7 @@ kimik2.5-int4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi325x
@@ -782,7 +743,7 @@ kimik2.5-int4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi300x
@@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
-kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
-
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: amd/Kimi-K2.5-MXFP4
@@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
 
-# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi355x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
-  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
-  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
-  # cpu-offload sweep points to use the same offload path as the NVIDIA
-  # agentic-coding configs.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
-
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: MiniMaxAI/MiniMax-M2.5
@@ -994,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi300x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
-    # KV cliff ~52. Compute saturates first.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 minimaxm2.5-fp8-mi325x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -1058,32 +937,8 @@ minimaxm2.5-fp8-mi325x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi325x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
-    # similar HBM profile). Compute saturates first; cpu-offload window
-    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
-    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.17.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -1524,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_MTP_SIZE=2"
 
 kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x-disagg
@@ -1578,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           - "DECODE_NODES=2"
 
 minimaxm2.5-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x-disagg
@@ -1971,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
   
-
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2082,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
         conc-list: [ 128 ]
@@ -2140,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
 
-
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
 dsv4-fp4-mi355x-sglang:
   image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2201,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp:
       - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
 
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2408,44 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
-
-qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
-
-dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
-
 dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2674,20 +2465,145 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
       
+qwen3.5-fp8-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+glm5.1-fp4-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+kimik2.5-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
+      # TP=4 probe: half-node layout doubles per-GPU weight footprint
+      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
+      # cliff-region concurrencies on both offload modes so we can directly
+      # compare TP=4 vs TP=8 at the same conc points.
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+
+minimaxm2.5-fp8-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first; cpu offload likely won't help, but worth confirming.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
+
+minimaxm2.5-fp8-mi300x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
+    # KV cliff ~52. Compute saturates first.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
+minimaxm2.5-fp8-mi325x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
+    # similar HBM profile). Compute saturates first; cpu-offload window
+    # exercises the CPU-offload connector path on the pinned v0.22.0 ROCm
+    # image. Mirrors the MI300X conc grid for cross-GPU comparability.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
+dsv4-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
 
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
-
-# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
-# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
-# comparability. Offload sweep is none-only (SGLang has no equivalent of
-# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
 dsv4-fp4-mi355x-sglang-agentic:
   image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2702,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
-
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d3b1b6729..380c799e1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt:
           ep: 8
           dp-attn: true
 
-    agentic-coding:
-    - duration: 300
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
-          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: false
-
 dsr1-fp8-b200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
 
-# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200-dsv4' -> 'b200-dgxc'
-dsv4-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
-      # Re-add when investigating regressions in offload=none.
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1845,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp:
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp4-b200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -2105,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml
-# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads
-# as `bmk_agentic_*`). Re-enable once that workflow is aligned.
-# qwen3.5-bf16-b200-sglang-agentic:
-#   image: lmsysorg/sglang:v0.5.12-cu130
-#   model: Qwen/Qwen3.5-397B-A17B
-#   model-prefix: qwen3.5
-#   runner: b200
-#   precision: bf16
-#   framework: sglang
-#   multinode: false
-#   scenarios:
-#     agentic-coding:
-#     - duration: 1800
-#       search-space:
-#       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2143,25 +2085,6 @@ qwen3.5-fp8-b200-sglang:
       - { tp: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
-# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
@@ -2245,26 +2168,6 @@ glm5-fp8-b200-sglang-mtp:
   # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1
   # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8
   # B200 SGLang recipe as-is until B300-specific tuning is available.
-# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main.
-glm5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:v0.5.12-cu130
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
-
 glm5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
@@ -2411,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
     
-
 qwen3.5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2553,39 +2455,8 @@ kimik2.5-int4-b200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-int4-b200-vllm-agentic:
-  # Bumped from v0.19.1 — that release tripped a bug in
-  # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to')
-  # during warmup `profile_run` on the agentic-coding path
-  # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
-  # flashinfer fix.
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-int4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b300
@@ -2624,29 +2495,6 @@ kimik2.5-int4-h200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'h200' -> 'h200-dgxc'
-kimik2.5-int4-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
-  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
-  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
-  # don't have that mount and would re-materialize 65 GB to /tmp every job.
-  runner: h200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
-      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
-
 kimik2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
@@ -2668,40 +2516,8 @@ kimik2.5-fp4-b200-vllm:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-fp4-b200-vllm-agentic:
-  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
-  # cleared the agentic-coding warmup crash on max_model_len=131072 +
-  # prefix caching.
-  image: vllm/vllm-openai:v0.20.2
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
-      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
-      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b300
@@ -2763,34 +2579,6 @@ dsr1-fp8-b300-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp }
 
-# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130'
-#   - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4'
-#   - model-prefix: 'dsr1' -> 'kimik2.5'
-#   - precision: 'fp8' -> 'fp4'
-#   - framework: 'sglang' -> 'vllm'
-kimik2.5-fp4-b300-vllm-agentic:
-  # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
-  # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
-  # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
-  # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
-  # INT4 B300 sister already uses successfully.
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2880,7 +2668,7 @@ dsr1-fp8-h200-sglang-mtp:
 # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
 # flag is omitted. Max-model-len is pinned at 800k per the recipe.
 dsv4-fp8-h200-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2904,7 +2692,7 @@ dsv4-fp8-h200-vllm:
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp8-h200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2924,31 +2712,6 @@ dsv4-fp8-h200-vllm-mtp:
       - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
-# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
-# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
-# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up.
-# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
-dsv4-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:deepseekv4-cu129
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
-
-# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
-# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
-# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
-
 dsv4-fp8-h200-sglang:
   image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3024,30 +2787,6 @@ dsv4-fp4-b300-vllm:
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
-# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
-dsv4-fp4-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs. Re-add when investigating regressions in offload=none.
-      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
-
 dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3095,7 +2834,7 @@ dsv4-fp4-b300-trt-mtp:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp }
 
 dsv4-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -4284,27 +4023,6 @@ gptoss-fp4-b200-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
-# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
-gptoss-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-
 minimaxm2.5-fp8-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -4330,35 +4048,8 @@ minimaxm2.5-fp8-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-#   - runner: 'b200' -> 'b200-dgxc'
-minimaxm2.5-fp8-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b200-dgxc
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
-    # Push none past the KV cliff (96, 128) to make the no-offload throughput
-    # collapse visible; cpu range overlaps fully for same-conc comparison.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 minimaxm2.5-fp8-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b300
@@ -4381,31 +4072,6 @@ minimaxm2.5-fp8-b300-vllm:
       - { tp: 2, conc-start: 64, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 8 }
 
-# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-minimaxm2.5-fp8-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b300
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
-    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
-    # collapse is visible; cpu range overlaps fully so each high-conc point
-    # has a same-conc no-offload counterpart for direct comparison.
-    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
-    # observed in v6 cpu data right past conc=96.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-
 minimaxm2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -4438,34 +4104,11 @@ minimaxm2.5-fp4-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.0-cu130
+minimaxm2.5-fp4-b300-vllm:
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
-minimaxm2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
-  model: nvidia/MiniMax-M2.5-NVFP4
-  model-prefix: minimaxm2.5
-  runner: b300
+  runner: b300
   precision: fp4
   framework: vllm
   multinode: false
@@ -4489,7 +4132,7 @@ minimaxm2.5-fp4-b300-vllm:
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
 gptoss-fp4-h100-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h100
@@ -4530,29 +4173,6 @@ minimaxm2.5-fp8-h100-vllm:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
 
-# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h100-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h100
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
-    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
-    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
-
 dsr1-fp8-h100-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
@@ -4757,28 +4377,6 @@ minimaxm2.5-fp8-h200-vllm:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
-    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
-
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: nvidia/DeepSeek-R1-0528-NVFP4-v2
@@ -8267,7 +7865,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           dp-attn: true
 
 kimik2.5-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.18.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: gb200
@@ -8369,7 +7967,7 @@ kimik2.5-fp4-gb200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-b200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-multinode
@@ -8425,7 +8023,7 @@ dsv4-fp4-b200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8525,7 +8123,7 @@ dsv4-fp4-gb200-dynamo-vllm:
 # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image
 # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
 dsv4-fp4-gb200-dynamo-vllm-mtp2:
-  image: vllm/vllm-openai:v0.20.1-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8605,7 +8203,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           dp-attn: true
 
 dsv4-fp4-b300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -8661,7 +8259,7 @@ dsv4-fp4-b300-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-nv
@@ -9202,27 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp:
           ep: 8
           dp-attn: true
 
-
-kimik2.5-int4-h100-vllm:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: h100
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    # New entry, agentic-coding only: this PR intentionally does NOT add
-    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
-    # fixed-seq-len test surface identical to origin/main.
-    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
-    # early. Sweep saturates conc=20 to keep total HBM headroom.
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
-      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
-
 qwen3.5-fp8-h100-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9681,12 +9258,307 @@ glm5-fp8-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
+dsv4-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # cpu offload only this iteration — none entries already validated in
+      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
+      # Re-add when investigating regressions in offload=none.
+      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
+
+qwen3.5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+glm5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
+
+kimik2.5-int4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
+
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so kimik2.5-int4-b300-vllm reuses the existing
+# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+kimik2.5-int4-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
+  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
+  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
+  # don't have that mount and would re-materialize 65 GB to /tmp every job.
+  runner: h200-dgxc
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
+      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
+
+kimik2.5-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
+      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so this config reuses the existing
+# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+kimik2.5-fp4-b300-vllm-agentic:
+  # History: v0.20.2 (cu129) lacked the flashinfer kernels for B300's reported
+  # SM (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
+  # trtllm_fp4_block_scale_moe path, so this entry was previously pinned to
+  # v0.20.0-cu130 (Blackwell-targeted build with the sm_10x/sm_11x/sm_12x kernels).
+  # NOTE(review): image is now v0.22.0 — confirm it carries the same kernel set.
+  image: vllm/vllm-openai:v0.22.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
+dsv4-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
+
+# NOTE: the MTP variant of dsv4-fp8-h200-vllm is dsv4-fp8-h200-vllm-mtp (defined
+# earlier in this file, not the entry that follows this note); it adds
+# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
+
+dsv4-fp4-b300-vllm-agentic:
+  # image: vllm/vllm-openai:v0.22.0
+  # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045
+  image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # cpu-offload shapes first, then the same shapes with offloading: none (re-added
+      # after an earlier cpu-only iteration) so both modes cover identical conc points.
+      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
+      - { tp: 4, offloading: none,  conc-list: [16, 32, 64] }
+      - { tp: 8, offloading: none,  conc-list: [16, 32, 64] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [64, 128, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [128, 256, 512] }
+
+gptoss-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+
+minimaxm2.5-fp8-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b200-dgxc
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
+    # Push none past the KV cliff (96, 128) to make the no-offload throughput
+    # collapse visible; cpu range overlaps fully for same-conc comparison.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
+
+  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so this config reuses the existing
+  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+minimaxm2.5-fp8-b300-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b300
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
+    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
+    # collapse is visible; cpu range overlaps fully so each high-conc point
+    # has a same-conc no-offload counterpart for direct comparison.
+    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
+    # observed in v6 cpu data right past conc=96.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+
+minimaxm2.5-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so minimaxm2.5-fp4-b300-vllm reuses the existing
+  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+minimaxm2.5-fp8-h100-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h100
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
+    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
+    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
+
+minimaxm2.5-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
+    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
+
+kimik2.5-int4-h100-vllm:
+  image: vllm/vllm-openai:v0.22.0
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: h100
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    # New entry, agentic-coding only: this PR intentionally does NOT add
+    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
+    # fixed-seq-len test surface identical to origin/main.
+    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
+    # early. Sweep saturates conc=20 to keep total HBM headroom.
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
+      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
 
 qwen3.5-fp8-b300-sglang-agentic-hicache:
   image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
@@ -9704,7 +9576,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache:
       - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
 
 kimik2.5-fp4-b200-vllm-agentic-lmcache:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -9724,17 +9596,8 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache:
 # does not have a B300-specific recipe, so this config reuses the existing
 # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
-# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
-# origin/main so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape
-#     mirroring the conc=192 point in the base entry's fixed-seq-len sweep.
-#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
-#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
-#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
-#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
 dsv4-fp4-gb300-dynamo-vllm-agentic:
-  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   # gb300-nv (not generic gb300) — the generic label is shared by both NV
@@ -9823,7 +9686,7 @@ dsv4-fp4-gb300-dynamo-vllm-agentic:
 # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe
 # applies to both clusters with no duplication.
 dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
-  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-cw
@@ -9881,16 +9744,6 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
           ep: 8
           dp-attn: true
 
-# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below;
-# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main
-# so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-#   - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster).
-# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130).
-# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with-
-# subagents corpus. hicache arm capped at conc 16 since high-conc + hicache
-# tends to flake on first runs and conc 16 covers the cliff. The bench script
-# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant.
 qwen3.5-fp8-h100-sglang-agentic:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9905,3 +9758,33 @@ qwen3.5-fp8-h100-sglang-agentic:
       search-space:
       - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 12, 14, 16] }
       - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
+
+# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only.
+dsr1-fp4-b200-dynamo-trt-agentic:
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  runner: b200-multinode
+  precision: fp4
+  framework: dynamo-trt
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 300
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
+          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e3080b4bf..e062b42f1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -924,8 +924,19 @@ resolve_trace_source() {
     # public-dataset loader names allowed by the inferencex-agentx-mvp
     # scenario. Used by recipes whose servers have non-default context
     # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
-    # unfiltered 052726 corpus and switches to the 256k-capped variant).
-    local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
+    # unfiltered corpus and switches to the 256k-capped variant), or
+    # by recipes that want to pin a specific corpus generation rather
+    # than ride the model-prefix-aware default below.
+    #
+    # Default (no override) is model-prefix-aware:
+    #   DSv4 recipes      -> 052726 (v5 corpus, the original baseline)
+    #   everything else   -> 060226 (v6 corpus, newer CC versions)
+    # DSv4 stays on 052726 for continuity with prior published baselines.
+    local default_loader="semianalysis_cc_traces_weka_with_subagents_060226"
+    if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        default_loader="semianalysis_cc_traces_weka_with_subagents"
+    fi
+    local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
     local dataset
     case "$loader" in
         semianalysis_cc_traces_weka_with_subagents)
@@ -934,13 +945,19 @@ resolve_trace_source() {
         semianalysis_cc_traces_weka_with_subagents_256k)
             dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
             ;;
+        semianalysis_cc_traces_weka_with_subagents_060226)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060226_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
+            ;;
         *)
-            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
+            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2
             exit 1
             ;;
     esac
     TRACE_SOURCE_FLAG="--public-dataset $loader"
-    echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
+    echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
     # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
     # for model weights) so subsequent runs read from cache instead of
     # re-downloading every job.
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index f9955adc7..16dc3bfd5 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -33,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path $MODEL \
+--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index ff76b768d..3b2561fe2 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 amd-smi || true
 
@@ -34,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 108347479..e80008f71 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -38,7 +38,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -221,7 +231,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --trust-remote-code
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index f6748a5f8..fdb7a49b6 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -32,16 +32,35 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# Opt this recipe out of the DSv4 052726 default; use the v6 corpus.
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
 resolve_trace_source
 install_agentic_deps
 
 # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
+# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient
+# sliding-window allocations don't evict useful prefix entries. 32k matches
+# the trace-replay tuning the PR author validated (0% -> 74% hit rate).
+# Requires the custom image (cquil/vllm-openai:*-7ead0a0f...) that carries
+# the patch; on stock images the env var is ignored.
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
+
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
@@ -113,7 +132,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve "$MODEL" \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port "$PORT" \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 99aec25fe..029c8ea7f 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -135,7 +145,7 @@ fi
 
 echo "Starting sglang server..."
 python3 -m sglang.launch_server \
-    --model-path "$MODEL" \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index 0a0177983..799c2bf26 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -40,7 +50,7 @@ export PYTHONNOUSERSITE=1
 
 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is
 # used for GPU allocation by the runner and as the DP size.
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 500b456f5..3b85a31cd 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -42,7 +52,7 @@ echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 259c19586..b3597cf52 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
index 6e921db58..80d70e724 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -53,7 +63,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
index 557986b0d..13e32d315 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -57,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
index 1592a8d5c..e0d967246 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -57,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
index eb1883ff1..ff597c9a4 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 amd-smi || true
 
@@ -65,7 +75,7 @@ esac
 
 echo "Starting vllm server..."
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --attention-backend ROCM_AITER_UNIFIED_ATTN \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
index 99e29c819..1f8c29351 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 
 # If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory.
@@ -64,7 +74,7 @@ esac
 
 echo "Starting vllm server..."
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --attention-backend ROCM_AITER_UNIFIED_ATTN \
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index ad0b4495a..34b45c9ec 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -178,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index 8cebe4f20..9667003e1 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -85,7 +95,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index fd0ce3677..139b12256 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -786,7 +796,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index 697d3fa45..5685f098c 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -45,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index 2fd3b381c..cb6c67f4b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -45,7 +55,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index 97929e43e..1bfa0c33b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# MODEL_PATH set: download into it only when the dir is missing or empty.
+# MODEL_PATH unset (stand-alone runs): pull into the HF hub cache, unless MODEL
+# is an absolute local path. Either way the server is launched with MODEL_PATH.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -56,7 +66,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
index 38ef72b56..b4a63eff3 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -58,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 $PARALLEL_ARGS \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
index 4ce131cba..0724aba5b 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
index 9f2d83a0b..c291a2ceb 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -62,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
index d21690da6..516bc4696 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -58,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
index ed59991cb..e6343b8ba 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -58,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
index 260bbdc68..8988316d3 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -64,7 +74,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
index edac27a45..caa70de63 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -61,7 +71,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index 39dd63293..cd114fe96 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -65,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index 4ba87976b..d06d82ec8 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 3432af5c9..ad49b2b67 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -39,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
index 9d9c1d7d5..4f9b12659 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -85,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH"
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
index 95f0397a0..b280fff8b 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
@@ -98,7 +108,7 @@ fi
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH"
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index aef9650ca..ff901b674 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -36,7 +46,7 @@ export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
index 5427d0d31..cdded8860 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# MODEL_PATH unset (stand-alone runs): fall back to the HF_HUB_CACHE, or use an
+# absolute-path MODEL as-is. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    [[ "$MODEL" == /* ]] || hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
@@ -110,7 +120,7 @@ export PYTHONNOUSERSITE=1
 SGLANG_CMD=(
     python3 -m sglang.launch_server
     --attention-backend triton
-    --model-path "$MODEL"
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
     --host=0.0.0.0
     --port "$PORT"
     --tensor-parallel-size "$TP"
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 67e8b48cc..cb4a634c3 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -387,6 +387,7 @@ else
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
+        --container-remap-root \
         --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash "$BENCH_SCRIPT"
diff --git a/utils/aiperf b/utils/aiperf
index 062a5de92..47e6e2060 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9
+Subproject commit 47e6e206001a85a3cc4c6212a1e0425f045bbcb3