From f632aa42c2872eecaa0089d119e6f1fea1a5c2ec Mon Sep 17 00:00:00 2001
From: Cam Quilici <cameron@semianalysis.com>
Date: Tue, 2 Jun 2026 12:21:55 -0500
Subject: [PATCH 01/15] agentic(trace-source): default non-DSv4 to v6 (060226)
 corpus

resolve_trace_source() now picks a model-prefix-aware default:

  MODEL_PREFIX == dsv4  -> semianalysis_cc_traces_weka_with_subagents
                           (052726, the v5 baseline, unchanged for
                           continuity with prior DSv4 published runs)
  everything else       -> semianalysis_cc_traces_weka_with_subagents_060226
                           (060226, newer v6 corpus with fresher CC
                           recording windows)

WEKA_LOADER_OVERRIDE still wins. Allowed values widened from the
two 052726 loaders to all four:

  semianalysis_cc_traces_weka_with_subagents          (052726)
  semianalysis_cc_traces_weka_with_subagents_256k     (052726-256k)
  semianalysis_cc_traces_weka_with_subagents_060226   (060226)
  semianalysis_cc_traces_weka_with_subagents_060226_256k (060226-256k)

Bumps utils/aiperf submodule to de3ad1c1, which registers the two
060226 plugin entries those new loader names resolve through.

The pre-cache log line now also includes MODEL_PREFIX so it's obvious
in CI which default fired.

Signed-off-by: Cam Quilici <cameron@semianalysis.com>
---
 benchmarks/benchmark_lib.sh | 25 +++++++++++++++++++++----
 utils/aiperf                |  2 +-
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index e3080b4bf..e062b42f1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -924,8 +924,19 @@ resolve_trace_source() {
     # public-dataset loader names allowed by the inferencex-agentx-mvp
     # scenario. Used by recipes whose servers have non-default context
     # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the
-    # unfiltered 052726 corpus and switches to the 256k-capped variant).
-    local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}"
+    # unfiltered corpus and switches to the 256k-capped variant), or
+    # by recipes that want to pin a specific corpus generation rather
+    # than ride the model-prefix-aware default below.
+    #
+    # Default (no override) is model-prefix-aware:
+    #   DSv4 recipes      -> 052726 (v5 corpus, the original baseline)
+    #   everything else   -> 060226 (v6 corpus, newer CC versions)
+    # DSv4 stays on 052726 for continuity with prior published baselines.
+    local default_loader="semianalysis_cc_traces_weka_with_subagents_060226"
+    if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then
+        default_loader="semianalysis_cc_traces_weka_with_subagents"
+    fi
+    local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}"
     local dataset
     case "$loader" in
         semianalysis_cc_traces_weka_with_subagents)
@@ -934,13 +945,19 @@ resolve_trace_source() {
         semianalysis_cc_traces_weka_with_subagents_256k)
             dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k"
             ;;
+        semianalysis_cc_traces_weka_with_subagents_060226)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226"
+            ;;
+        semianalysis_cc_traces_weka_with_subagents_060226_256k)
+            dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k"
+            ;;
         *)
-            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2
+            echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2
             exit 1
             ;;
     esac
     TRACE_SOURCE_FLAG="--public-dataset $loader"
-    echo "Loading traces via aiperf public-dataset: $loader ($dataset)"
+    echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]"
     # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used
     # for model weights) so subsequent runs read from cache instead of
     # re-downloading every job.
diff --git a/utils/aiperf b/utils/aiperf
index 062a5de92..de3ad1c18 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9
+Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1

From 5544a448d594a5ff3b8b83a25d714a8635adc3b7 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 13:16:10 -0500
Subject: [PATCH 02/15] configs(master): consolidate agentic recipes at end +
 split combined dsr1-trt entry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reorganizes both master YAMLs so all pure-agentic (agentic-coding-only)
recipes sit at the bottom of the file behind an "# Agentic configs"
divider, separated from fixed-seq-len / synthetic / prefix-share entries
above. No functional change to any non-agentic recipe.

nvidia-master.yaml: splits dsr1-fp4-b200-dynamo-trt — which previously
mixed fixed-seq-len + agentic-coding in one entry — into the original
entry (fixed-seq-len only) plus a new sibling dsr1-fp4-b200-dynamo-trt-agentic
carrying the agentic-coding scenario. 22 pure-agentic entries moved.

amd-master.yaml: no split needed (no combined entries); 8 pure-agentic
entries moved to the end, ahead of the agentic entries already there.

Verified via deep YAML parse: nvidia adds 1 key (the split sibling) and
modifies the source key's scenarios from [agentic-coding, fixed-seq-len]
to [fixed-seq-len]; amd has 0 keys added/removed/modified. All other
entries are byte-equal after round-trip.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml    | 358 ++++++------
 .github/configs/nvidia-master.yaml | 893 +++++++++++++++--------------
 2 files changed, 637 insertions(+), 614 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index fb3966ce6..0495ebf16 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp:
       - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp }
 
-# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang:
       - { tp: 2, conc-start: 4, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 16 }
 
-# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
-glm5.1-fp4-mi355x-sglang-agentic:
-  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
-  model: amd/GLM-5.1-MXFP4
-  model-prefix: glm5.1
-  runner: mi355x
-  precision: fp4
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 glm5.1-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: amd/GLM-5.1-MXFP4
@@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
-kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
-      # CPU offload only above the KV cliff. Lower concurrencies fit
-      # entirely on-GPU, so paying the offload-path overhead there would
-      # just slow them down without measuring anything new.
-      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
-      # TP=4 probe: half-node layout doubles per-GPU weight footprint
-      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
-      # cliff-region concurrencies on both offload modes so we can directly
-      # compare TP=4 vs TP=8 at the same conc points.
-      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
-
 kimik2.5-fp4-mi355x-atom:
   image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511
   model: amd/Kimi-K2.5-MXFP4
@@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm:
       - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 }
       - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 }
 
-# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi355x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
-  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
-  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
-  # cpu-offload sweep points to use the same offload path as the NVIDIA
-  # agentic-coding configs.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
-    # Compute saturates first; cpu offload likely won't help, but worth confirming.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
-
 minimaxm2.5-fp8-mi355x-atom:
   image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post
   model: MiniMaxAI/MiniMax-M2.5
@@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
-# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi300x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi300x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
-    # KV cliff ~52. Compute saturates first.
-    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 minimaxm2.5-fp8-mi325x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -1058,30 +937,6 @@ minimaxm2.5-fp8-mi325x-vllm:
       - { tp: 2, conc-start: 4, conc-end: 64 }
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
-minimaxm2.5-fp8-mi325x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: mi325x
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
-    # similar HBM profile). Compute saturates first; cpu-offload window
-    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
-    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
-
 gptoss-fp4-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
@@ -2415,37 +2270,6 @@ glm5-fp8-mi325x-sglang-mtp:
 # brought in here.
 # ============================================================================
 
-qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: mi355x
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
-
-dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.21.0
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
-
 dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2679,6 +2503,188 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
 # image tag, so bumping sglang is just an image tag bump here. Sweeps
 # DP-attention on/off and EP=8.
+# =============================================================================
+# Agentic configs
+# -----------------------------------------------------------------------------
+# All entries below run the agentic-coding scenario (Weka trace replay).
+# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only.
+# =============================================================================
+
+# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
+qwen3.5-fp8-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
+glm5.1-fp4-mi355x-sglang-agentic:
+  image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
+  model: amd/GLM-5.1-MXFP4
+  model-prefix: glm5.1
+  runner: mi355x
+  precision: fp4
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
+kimik2.5-fp4-mi355x-vllm-agentic:
+  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
+  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
+  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
+  # includes all subsequent ROCm offload work.
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] }
+      # CPU offload only above the KV cliff. Lower concurrencies fit
+      # entirely on-GPU, so paying the offload-path overhead there would
+      # just slow them down without measuring anything new.
+      - { tp: 8, offloading: cpu,  conc-list: [32, 40, 48, 56] }
+      # TP=4 probe: half-node layout doubles per-GPU weight footprint
+      # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to
+      # cliff-region concurrencies on both offload modes so we can directly
+      # compare TP=4 vs TP=8 at the same conc points.
+      - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
+
+# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi355x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
+  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
+  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
+  # cpu-offload sweep points to use the same offload path as the NVIDIA
+  # agentic-coding configs.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical).
+    # Compute saturates first; cpu offload likely won't help, but worth confirming.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
+
+# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi300x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi300x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200);
+    # KV cliff ~52. Compute saturates first.
+    # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector).
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
+# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
+minimaxm2.5-fp8-mi325x-vllm-agentic:
+  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
+  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: mi325x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # MI325X tp=4: cloned from MI300X recipe (slightly faster compute,
+    # similar HBM profile). Compute saturates first; cpu-offload window
+    # exercises the SimpleCPUOffloadConnector path enabled by the rocm
+    # nightly. Mirror MI300X conc grid for cross-vendor comparability.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
+
+qwen3.5-fp8-mi355x-sglang-agentic-hicache:
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
+dsv4-fp4-mi355x-vllm-agentic:
+  image: vllm/vllm-openai-rocm:v0.21.0
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4] }
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
 
 # Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
 # the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d3b1b6729..04764831c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt:
           ep: 8
           dp-attn: true
 
-    agentic-coding:
-    - duration: 300
-      search-space:
-      - spec-decoding: "none"
-        conc-list: [ 1, 2, 4, 8, 16, 32 ]
-        prefill:
-          num-worker: 1
-          tp: 4
-          ep: 4
-          dp-attn: true
-          additional-settings:
-          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
-          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
-        decode:
-          num-worker: 1
-          tp: 8
-          ep: 8
-          dp-attn: false
-
 dsr1-fp8-b200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: deepseek-ai/DeepSeek-R1-0528
@@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm:
       - { tp: 8, conc-start: 1, conc-end: 32 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
 
-# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200-dsv4' -> 'b200-dgxc'
-dsv4-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
-      # Re-add when investigating regressions in offload=none.
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-
 dsv4-fp4-b200-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2143,25 +2102,6 @@ qwen3.5-fp8-b200-sglang:
       - { tp: 8, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
 
-# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main.
-qwen3.5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
-  model: Qwen/Qwen3.5-397B-A17B-FP8
-  model-prefix: qwen3.5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
@@ -2245,26 +2185,6 @@ glm5-fp8-b200-sglang-mtp:
   # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1
   # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8
   # B200 SGLang recipe as-is until B300-specific tuning is available.
-# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main.
-glm5-fp8-b200-sglang-agentic:
-  image: lmsysorg/sglang:v0.5.12-cu130
-  model: zai-org/GLM-5-FP8
-  model-prefix: glm5
-  runner: b200
-  precision: fp8
-  framework: sglang
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
-
 glm5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
@@ -2553,37 +2473,6 @@ kimik2.5-int4-b200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-int4-b200-vllm-agentic:
-  # Bumped from v0.19.1 — that release tripped a bug in
-  # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to')
-  # during warmup `profile_run` on the agentic-coding path
-  # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
-  # flashinfer fix.
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-int4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: moonshotai/Kimi-K2.5
@@ -2624,29 +2513,6 @@ kimik2.5-int4-h200-vllm:
       search-space:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'h200' -> 'h200-dgxc'
-kimik2.5-int4-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
-  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
-  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
-  # don't have that mount and would re-materialize 65 GB to /tmp every job.
-  runner: h200-dgxc
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
-      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
-
 kimik2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
@@ -2668,38 +2534,6 @@ kimik2.5-fp4-b200-vllm:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
-#   - runner: 'b200' -> 'b200-dgxc'
-kimik2.5-fp4-b200-vllm-agentic:
-  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
-  # cleared the agentic-coding warmup crash on max_model_len=131072 +
-  # prefix caching.
-  image: vllm/vllm-openai:v0.20.2
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b200-dgxc
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
-      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
-      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 kimik2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/Kimi-K2.5-NVFP4
@@ -2763,34 +2597,6 @@ dsr1-fp8-b300-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp }
 
-# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130'
-#   - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4'
-#   - model-prefix: 'dsr1' -> 'kimik2.5'
-#   - precision: 'fp8' -> 'fp4'
-#   - framework: 'sglang' -> 'vllm'
-kimik2.5-fp4-b300-vllm-agentic:
-  # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
-  # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
-  # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
-  # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
-  # INT4 B300 sister already uses successfully.
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: nvidia/Kimi-K2.5-NVFP4
-  model-prefix: kimik2.5
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
-
 dsr1-fp8-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2924,31 +2730,6 @@ dsv4-fp8-h200-vllm-mtp:
       - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp }
 
-# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
-# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
-# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up.
-# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
-dsv4-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:deepseekv4-cu129
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
-
-# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image
-# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
-# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
-
 dsv4-fp8-h200-sglang:
   image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3024,30 +2805,6 @@ dsv4-fp4-b300-vllm:
       - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
 
-# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
-dsv4-fp4-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  runner: b300
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      # cpu offload only this iteration — none entries already validated in
-      # earlier runs. Re-add when investigating regressions in offload=none.
-      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
-      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
-      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
-
 dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -4284,31 +4041,10 @@ gptoss-fp4-b200-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
-# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
-gptoss-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: openai/gpt-oss-120b
-  model-prefix: gptoss
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
-      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
-
-minimaxm2.5-fp8-b200-vllm:
-  image: vllm/vllm-openai:v0.22.0
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
+minimaxm2.5-fp8-b200-vllm:
+  image: vllm/vllm-openai:v0.22.0
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
   runner: b200
   precision: fp8
   framework: vllm
@@ -4330,33 +4066,6 @@ minimaxm2.5-fp8-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-#   - runner: 'b200' -> 'b200-dgxc'
-minimaxm2.5-fp8-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b200-dgxc
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
-    # Push none past the KV cliff (96, 128) to make the no-offload throughput
-    # collapse visible; cpu range overlaps fully for same-conc comparison.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 minimaxm2.5-fp8-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -4381,31 +4090,6 @@ minimaxm2.5-fp8-b300-vllm:
       - { tp: 2, conc-start: 64, conc-end: 256 }
       - { tp: 4, conc-start: 4, conc-end: 8 }
 
-# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-minimaxm2.5-fp8-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: b300
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
-    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
-    # collapse is visible; cpu range overlaps fully so each high-conc point
-    # has a same-conc no-offload counterpart for direct comparison.
-    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
-    # observed in v6 cpu data right past conc=96.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
-
 minimaxm2.5-fp4-b200-vllm:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -4438,29 +4122,6 @@ minimaxm2.5-fp4-b200-vllm:
   # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: nvidia/MiniMax-M2.5-NVFP4
-  model-prefix: minimaxm2.5
-  runner: b200
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
-  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
-  # does not have a B300-specific recipe, so this config reuses the existing
-  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
 minimaxm2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -4530,29 +4191,6 @@ minimaxm2.5-fp8-h100-vllm:
       search-space:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
 
-# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h100-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h100
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
-    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
-    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
-      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
-
 dsr1-fp8-h100-dynamo-sglang:
   image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
@@ -4757,28 +4395,6 @@ minimaxm2.5-fp8-h200-vllm:
       search-space:
       - { tp: 4, conc-start: 1, conc-end: 256 }
 
-# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
-minimaxm2.5-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
-  model: MiniMaxAI/MiniMax-M2.5
-  model-prefix: minimaxm2.5
-  runner: h200
-  precision: fp8
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
-    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
-    - duration: 1800
-      search-space:
-      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
-      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
-
 dsr1-fp4-gb200-dynamo-trt:
   image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2
   model: nvidia/DeepSeek-R1-0528-NVFP4-v2
@@ -9203,26 +8819,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp:
           dp-attn: true
 
 
-kimik2.5-int4-h100-vllm:
-  image: vllm/vllm-openai:v0.20.2
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: h100
-  precision: int4
-  framework: vllm
-  multinode: false
-  scenarios:
-    # New entry, agentic-coding only: this PR intentionally does NOT add
-    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
-    # fixed-seq-len test surface identical to origin/main.
-    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
-    # early. Sweep saturates conc=20 to keep total HBM headroom.
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
-      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
-
 qwen3.5-fp8-h100-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9687,12 +9283,45 @@ glm5-fp8-gb300-dynamo-sglang:
 # to preserve main behavior; PR-branch modifications to those recipes are NOT
 # brought in here.
 # ============================================================================
+# =============================================================================
+# Agentic configs
+# -----------------------------------------------------------------------------
+# All entries below run the agentic-coding scenario (Weka trace replay).
+# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only.
+# =============================================================================
 
-qwen3.5-fp8-b300-sglang-agentic-hicache:
-  image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
+# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below;
+# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - runner: 'b200-dsv4' -> 'b200-dgxc'
+dsv4-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.20.0-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # cpu offload only this iteration — none entries already validated in
+      # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%).
+      # Re-add when investigating regressions in offload=none.
+      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
+
+# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main.
+qwen3.5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:nightly-dev-20260422-de962f32
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
-  runner: b300
+  runner: b200
   precision: fp8
   framework: sglang
   multinode: false
@@ -9700,46 +9329,404 @@ qwen3.5-fp8-b300-sglang-agentic-hicache:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
-kimik2.5-fp4-b200-vllm-agentic-lmcache:
-  image: vllm/vllm-openai:v0.21.0
-  model: nvidia/Kimi-K2.5-NVFP4
+# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is
+# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main.
+glm5-fp8-b200-sglang-agentic:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: zai-org/GLM-5-FP8
+  model-prefix: glm5
+  runner: b200
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
+
+# NOTE (kimik2.5-int4-b300-vllm, not the entry below): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so that config reuses the existing
+# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
+# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - runner: 'b200' -> 'b200-dgxc'
+kimik2.5-int4-b200-vllm-agentic:
+  # Bumped from v0.19.1 — that release tripped a bug in
+  # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to')
+  # during warmup `profile_run` on the agentic-coding path
+  # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
+  # flashinfer fix.
+  image: vllm/vllm-openai:v0.20.2
+  model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b200-dgxc
-  precision: fp4
+  precision: int4
   framework: vllm
   multinode: false
   scenarios:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
-      - { tp: 8, ep: 1, offloading: lmcache,  conc-list: [16, 24, 32, 36] }
-      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
-      - { tp: 4, ep: 1, offloading: lmcache,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, offloading: cpu,  conc-list: [32, 64, 96, 128] }
 
 # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
 # does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
-# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
-# origin/main so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape
-#     mirroring the conc=192 point in the base entry's fixed-seq-len sweep.
-#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
-#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
-#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
-#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
-dsv4-fp4-gb300-dynamo-vllm-agentic:
-  image: vllm/vllm-openai:v0.21.0-ubuntu2404
-  model: deepseek-ai/DeepSeek-V4-Pro
-  model-prefix: dsv4
-  # gb300-nv (not generic gb300) — the generic label is shared by both NV
-  # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards.
-  # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml
+# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - runner: 'h200' -> 'h200-dgxc'
+kimik2.5-int4-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.20.2
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
+  # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's
+  # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb)
+  # don't have that mount and would re-materialize 65 GB to /tmp every job.
+  runner: h200-dgxc
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
+      - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
+
+# NOTE (kimik2.5-fp4-b300-vllm, not the entry below): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so that config reuses the existing
+# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
+# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
+#   - runner: 'b200' -> 'b200-dgxc'
+kimik2.5-fp4-b200-vllm-agentic:
+  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
+  # cleared the agentic-coding warmup crash on max_model_len=131072 +
+  # prefix caching.
+  image: vllm/vllm-openai:v0.20.2
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [16, 24, 32, 36] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
+      - { tp: 4, ep: 1, offloading: cpu,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+
+# NOTE (kimik2.5-fp4-b300-vllm): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so that config reuses the existing
+# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling); the original
+# kimik2.5-fp4-b300-vllm entry is left identical to origin/main so its
+# fixed-seq-len sweep is unaffected.
+# NOTE(review): the generated reason list recorded here (image 'lmsysorg/sglang:v0.5.10.post1-cu130',
+# model 'deepseek-ai/DeepSeek-R1-0528', model-prefix 'dsr1', precision 'fp8', framework 'sglang' as the
+# "before" values) looks like it was diffed against a dsr1 SGLang entry, not the sibling — confirm.
+# Against the sibling as it appears in this file, the visible difference is:
+#   - image: 'vllm/vllm-openai:v0.21.0' -> 'vllm/vllm-openai:v0.20.0-cu130'
+kimik2.5-fp4-b300-vllm-agentic:
+  # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
+  # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
+  # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
+  # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
+  # INT4 B300 sister already uses successfully.
+  image: vllm/vllm-openai:v0.20.0-cu130
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+      - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
+
+# NOTE(review): the following appears to describe dsv4-fp8-h200-sglang, not the vLLM entry below —
+# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only), pinned to the h200-dgxc-slurm runner
+# pool because the deepseek-v4-hopper image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up.
+# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below;
+# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
+dsv4-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:deepseekv4-cu129
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] }
+
+# NOTE(review): orphaned comment — it appears to describe dsv4-fp8-h200-vllm-mtp, the
+# MTP variant of dsv4-fp8-h200-vllm: uses the canonical v0.20.1 image (the non-MTP entry above is
+# still on the deepseekv4-cu129 tag) and adds --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
+
+# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
+# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
+dsv4-fp4-b300-vllm-agentic:
+  image: vllm/vllm-openai:v0.20.0-cu130
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      # cpu offload only this iteration — none entries already validated in
+      # earlier runs. Re-add when investigating regressions in offload=none.
+      - { tp: 4, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
+
+# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below;
+# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
+gptoss-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.19.1
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] }
+      - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+      - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
+
+# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
+#   - runner: 'b200' -> 'b200-dgxc'
+minimaxm2.5-fp8-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.19.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b200-dgxc
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical).
+    # Push none past the KV cliff (96, 128) to make the no-offload throughput
+    # collapse visible; cpu range overlaps fully for same-conc comparison.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 56, 64, 96, 128] }
+
+  # NOTE (minimaxm2.5-fp8-b300-vllm): At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so that config reuses the existing
+  # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
+# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
+# its fixed-seq-len sweep is unaffected.
+#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
+minimaxm2.5-fp8-b300-vllm-agentic:
+  image: vllm/vllm-openai:v0.19.1
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: b300
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical).
+    # Push none past the KV cliff (96, 128, 192) so the no-offload throughput
+    # collapse is visible; cpu range overlaps fully so each high-conc point
+    # has a same-conc no-offload counterpart for direct comparison.
+    # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff
+    # observed in v6 cpu data right past conc=96.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+      - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
+
+# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is
+# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
+minimaxm2.5-fp4-b200-vllm-agentic:
+  image: vllm/vllm-openai:v0.19.0-cu130
+  model: nvidia/MiniMax-M2.5-NVFP4
+  model-prefix: minimaxm2.5
+  runner: b200
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+
+  # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html
+  # does not have a B300-specific recipe, so this config reuses the existing
+  # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
+# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
+minimaxm2.5-fp8-h100-vllm-agentic:
+  image: vllm/vllm-openai:v0.20.2
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h100
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical).
+    # Best cpu-offload demo SKU — 4-conc-point window between cliffs.
+    # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau.
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
+      - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
+
+# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
+# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
+# PR adds an agentic-coding scenarios block that differs from main
+# (either main had none or had a different conc/offload sweep).
+# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
+minimaxm2.5-fp8-h200-vllm-agentic:
+  image: vllm/vllm-openai:v0.20.2
+  model: MiniMaxAI/MiniMax-M2.5
+  model-prefix: minimaxm2.5
+  runner: h200
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical).
+    # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs.
+    - duration: 1800
+      search-space:
+      - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] }
+      - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
+
+kimik2.5-int4-h100-vllm:
+  image: vllm/vllm-openai:v0.20.2
+  model: moonshotai/Kimi-K2.5
+  model-prefix: kimik2.5
+  runner: h100
+  precision: int4
+  framework: vllm
+  multinode: false
+  scenarios:
+    # New entry, agentic-coding only: this PR intentionally does NOT add
+    # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the
+    # fixed-seq-len test surface identical to origin/main.
+    # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives
+    # early. Sweep saturates conc=20 to keep total HBM headroom.
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] }
+      - { tp: 8, offloading: cpu,  conc-list: [1, 2, 4, 8, 12, 16, 20] }
+
+qwen3.5-fp8-b300-sglang-agentic-hicache:
+  image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  runner: b300
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+
+kimik2.5-fp4-b200-vllm-agentic-lmcache:
+  image: vllm/vllm-openai:v0.21.0
+  model: nvidia/Kimi-K2.5-NVFP4
+  model-prefix: kimik2.5
+  runner: b200-dgxc
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] }
+      - { tp: 8, ep: 1, offloading: lmcache,  conc-list: [16, 24, 32, 36] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] }
+      - { tp: 4, ep: 1, offloading: lmcache,  conc-list: [12, 14, 16, 18, 20, 22, 24, 32] }
+
+# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+# does not have a B300-specific recipe, so this config reuses the existing
+# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
+
+# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
+# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
+# origin/main so its fixed-seq-len sweep is unaffected.
+#   - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape
+#     mirroring the conc=192 point in the base entry's fixed-seq-len sweep.
+#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
+#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
+#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
+#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
+dsv4-fp4-gb300-dynamo-vllm-agentic:
+  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  # gb300-nv (not generic gb300) — the generic label is shared by both NV
+  # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards.
+  # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml
   # + actual runner label listings). Pins agentic to the NVIDIA cluster
   # for initial validation. Drop -nv suffix to widen later.
   runner: gb300-nv
@@ -9905,3 +9892,33 @@ qwen3.5-fp8-h100-sglang-agentic:
       search-space:
       - { tp: 8, ep: 8, offloading: none,    conc-list: [1, 2, 4, 8, 12, 14, 16] }
       - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }
+
+# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only.
+dsr1-fp4-b200-dynamo-trt-agentic:
+  image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  runner: b200-multinode
+  precision: fp4
+  framework: dynamo-trt
+  multinode: true
+  disagg: true
+  scenarios:
+    agentic-coding:
+    - duration: 300
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml
+          - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: false

From 76aedd65780ddaabfb2cb0d630081a42e6cb72ac Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 13:20:16 -0500
Subject: [PATCH 03/15] configs(master): bump all vllm images to v0.22.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps every non-comment vLLM `image:` line in both master configs to
the unsuffixed v0.22.0 tag (non-vLLM images, e.g. sglang, are untouched):
  - vllm/vllm-openai:*           -> vllm/vllm-openai:v0.22.0
  - vllm/vllm-openai-rocm:*      -> vllm/vllm-openai-rocm:v0.22.0

Covers all prior variants: v0.17–v0.21 numbered releases, the -cu130 /
-ubuntu2404 / deepseekv4-cu129 build-variant tags, and the nightly-<sha>
ROCm pins (which were holding DSv4 ROCm support that has since landed in
the tagged release). Comment-line tag references in the agentic
divergence change-log blocks are intentionally untouched so their
"X -> Y" history reads correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml    | 24 +++++------
 .github/configs/nvidia-master.yaml | 64 +++++++++++++++---------------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 0495ebf16..ee4276a26 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -705,7 +705,7 @@ glm5.1-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 256 }
 
 kimik2.5-int4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi355x
@@ -724,7 +724,7 @@ kimik2.5-int4-mi355x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi325x
@@ -743,7 +743,7 @@ kimik2.5-int4-mi325x-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: mi300x
@@ -896,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm:
       - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -938,7 +938,7 @@ minimaxm2.5-fp8-mi325x-vllm:
       - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 }
 
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.17.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -1379,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
           - "DECODE_MTP_SIZE=2"
 
 kimik2.5-fp4-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x-disagg
@@ -1433,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg:
           - "DECODE_NODES=2"
 
 minimaxm2.5-fp8-mi355x-vllm-disagg:
-  image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x-disagg
@@ -2558,7 +2558,7 @@ kimik2.5-fp4-mi355x-vllm-agentic:
   # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
   # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
   # includes all subsequent ROCm offload work.
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
   runner: mi355x
@@ -2591,7 +2591,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
   # which enables SimpleCPUOffloadConnector on ROCm. Required for the
   # cpu-offload sweep points to use the same offload path as the NVIDIA
   # agentic-coding configs.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi355x
@@ -2614,7 +2614,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
 #   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
 minimaxm2.5-fp8-mi300x-vllm-agentic:
   # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi300x
@@ -2637,7 +2637,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic:
 #   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
 minimaxm2.5-fp8-mi325x-vllm-agentic:
   # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
-  image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: mi325x
@@ -2671,7 +2671,7 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache:
       - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
 
 dsv4-fp4-mi355x-vllm-agentic:
-  image: vllm/vllm-openai-rocm:v0.21.0
+  image: vllm/vllm-openai-rocm:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 04764831c..d7791fa11 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1804,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp:
 # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp4-b200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dsv4
@@ -2474,7 +2474,7 @@ kimik2.5-int4-b200-vllm:
       - { tp: 8, conc-start: 4, conc-end: 64 }
 
 kimik2.5-int4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b300
@@ -2535,7 +2535,7 @@ kimik2.5-fp4-b200-vllm:
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
 
 kimik2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b300
@@ -2686,7 +2686,7 @@ dsr1-fp8-h200-sglang-mtp:
 # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache
 # flag is omitted. Max-model-len is pinned at 800k per the recipe.
 dsv4-fp8-h200-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2710,7 +2710,7 @@ dsv4-fp8-h200-vllm:
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 dsv4-fp8-h200-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -2852,7 +2852,7 @@ dsv4-fp4-b300-trt-mtp:
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp }
 
 dsv4-fp4-b300-vllm-mtp:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -4067,7 +4067,7 @@ minimaxm2.5-fp8-b200-vllm:
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
 minimaxm2.5-fp8-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b300
@@ -4123,7 +4123,7 @@ minimaxm2.5-fp4-b200-vllm:
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 minimaxm2.5-fp4-b300-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
   runner: b300
@@ -4150,7 +4150,7 @@ minimaxm2.5-fp4-b300-vllm:
       - { tp: 8, conc-start: 4, conc-end: 4 }
 
 gptoss-fp4-h100-vllm:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h100
@@ -7883,7 +7883,7 @@ kimik2.5-fp4-gb200-dynamo-trt:
           dp-attn: true
 
 kimik2.5-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.18.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: gb200
@@ -7985,7 +7985,7 @@ kimik2.5-fp4-gb200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-b200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-multinode
@@ -8041,7 +8041,7 @@ dsv4-fp4-b200-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb200-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8141,7 +8141,7 @@ dsv4-fp4-gb200-dynamo-vllm:
 # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image
 # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
 dsv4-fp4-gb200-dynamo-vllm-mtp2:
-  image: vllm/vllm-openai:v0.20.1-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb200
@@ -8221,7 +8221,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2:
           dp-attn: true
 
 dsv4-fp4-b300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.1
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -8277,7 +8277,7 @@ dsv4-fp4-b300-dynamo-vllm:
           dp-attn: true
 
 dsv4-fp4-gb300-dynamo-vllm:
-  image: vllm/vllm-openai:v0.20.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-nv
@@ -9295,7 +9295,7 @@ glm5-fp8-gb300-dynamo-sglang:
 # its fixed-seq-len sweep is unaffected.
 #   - runner: 'b200-dsv4' -> 'b200-dgxc'
 dsv4-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200-dgxc
@@ -9364,7 +9364,7 @@ kimik2.5-int4-b200-vllm-agentic:
   # during warmup `profile_run` on the agentic-coding path
   # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
   # flashinfer fix.
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -9387,7 +9387,7 @@ kimik2.5-int4-b200-vllm-agentic:
 # its fixed-seq-len sweep is unaffected.
 #   - runner: 'h200' -> 'h200-dgxc'
 kimik2.5-int4-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with
@@ -9417,7 +9417,7 @@ kimik2.5-fp4-b200-vllm-agentic:
   # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
   # cleared the agentic-coding warmup crash on max_model_len=131072 +
   # prefix caching.
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -9451,7 +9451,7 @@ kimik2.5-fp4-b300-vllm-agentic:
   # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted
   # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the
   # INT4 B300 sister already uses successfully.
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b300
@@ -9473,7 +9473,7 @@ kimik2.5-fp4-b300-vllm-agentic:
 # its fixed-seq-len sweep is unaffected.
 #   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
 dsv4-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:deepseekv4-cu129
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: h200
@@ -9496,7 +9496,7 @@ dsv4-fp8-h200-vllm-agentic:
 # (either main had none or had a different conc/offload sweep).
 # The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
 dsv4-fp4-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
@@ -9519,7 +9519,7 @@ dsv4-fp4-b300-vllm-agentic:
 # its fixed-seq-len sweep is unaffected.
 #   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
 gptoss-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
+  image: vllm/vllm-openai:v0.22.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: b200
@@ -9541,7 +9541,7 @@ gptoss-fp4-b200-vllm-agentic:
 #   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
 #   - runner: 'b200' -> 'b200-dgxc'
 minimaxm2.5-fp8-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b200-dgxc
@@ -9567,7 +9567,7 @@ minimaxm2.5-fp8-b200-vllm-agentic:
 # its fixed-seq-len sweep is unaffected.
 #   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
 minimaxm2.5-fp8-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.1
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: b300
@@ -9593,7 +9593,7 @@ minimaxm2.5-fp8-b300-vllm-agentic:
 # (either main had none or had a different conc/offload sweep).
 # The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp4-b200-vllm-agentic:
-  image: vllm/vllm-openai:v0.19.0-cu130
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
   runner: b200
@@ -9616,7 +9616,7 @@ minimaxm2.5-fp4-b200-vllm-agentic:
 # (either main had none or had a different conc/offload sweep).
 # The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp8-h100-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h100
@@ -9639,7 +9639,7 @@ minimaxm2.5-fp8-h100-vllm-agentic:
 # (either main had none or had a different conc/offload sweep).
 # The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp8-h200-vllm-agentic:
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
   runner: h200
@@ -9656,7 +9656,7 @@ minimaxm2.5-fp8-h200-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [24, 28, 32, 36, 40, 48] }
 
 kimik2.5-int4-h100-vllm:
-  image: vllm/vllm-openai:v0.20.2
+  image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
   runner: h100
@@ -9691,7 +9691,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache:
       - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
 
 kimik2.5-fp4-b200-vllm-agentic-lmcache:
-  image: vllm/vllm-openai:v0.21.0
+  image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
   runner: b200-dgxc
@@ -9721,7 +9721,7 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache:
 #     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
 #     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
 dsv4-fp4-gb300-dynamo-vllm-agentic:
-  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   # gb300-nv (not generic gb300) — the generic label is shared by both NV
@@ -9810,7 +9810,7 @@ dsv4-fp4-gb300-dynamo-vllm-agentic:
 # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe
 # applies to both clusters with no duplication.
 dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
-  image: vllm/vllm-openai:v0.21.0-ubuntu2404
+  image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: gb300-cw

From 6dede7b24c94f68a74acd537c552950ef74531af Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 13:46:25 -0500
Subject: [PATCH 04/15] configs(master): strip stale narrative comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes ~240 lines of slop comments that no longer earn their keep
(250 deletions in total once the adjacent blank lines are counted):

  - "Diverged from X (agentic-coding sibling)..." rationale blocks
    (24 occurrences) — the sibling split is now durable and the
    "preserved on main" framing isn't meaningful on a branch
  - "Net-new agentic recipes from chore/agentx-v0.3" PR-context headers
  - "agentic-coding sibling — temporarily disabled" + the entire
    commented-out qwen3.5-bf16-b200-sglang-agentic placeholder block
  - Orphan boundary comments ("# DSv4-Pro FP4 on MI355X via SGLang.
    Uses a rocm720..." / "# DSv4 on MI355X via vLLM, using the official
    vllm/vllm-openai-rocm nightly...") that were stranded by prior
    entry moves
  - Inline image-bump rationale that's now stale ("# Bumped from
    v0.19.1...", "# Same image as the INT4 sibling: v0.20.x...",
    "# Nightly carrying vllm-project/vllm@20cac26b...", "# v0.21.0
    (released 2026-05-14)...") since everything is on v0.22.0

Verified via YAML deep-equal: 0 keys added/removed/modified in either
file — purely comment removal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml    | 110 -----------------------
 .github/configs/nvidia-master.yaml | 140 -----------------------------
 2 files changed, 250 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ee4276a26..7f1c8192d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1826,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
   
-
 dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -1937,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=3"
 
-
       # 1*DEP8 + 1*DEP8
       - spec-decoding: "mtp"
         conc-list: [ 128 ]
@@ -1995,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
 
-
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
 dsv4-fp4-mi355x-sglang:
   image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2056,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp:
       - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp }
 
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2263,13 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
-
 dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -2498,23 +2465,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=1"
           - "DECODE_MTP_SIZE=1"
       
-
-# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the
-# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the
-# image tag, so bumping sglang is just an image tag bump here. Sweeps
-# DP-attention on/off and EP=8.
-# =============================================================================
-# Agentic configs
-# -----------------------------------------------------------------------------
-# All entries below run the agentic-coding scenario (Weka trace replay).
-# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only.
-# =============================================================================
-
-# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main.
 qwen3.5-fp8-mi355x-sglang-agentic:
   image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2529,11 +2479,6 @@ qwen3.5-fp8-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
-# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main.
 glm5.1-fp4-mi355x-sglang-agentic:
   image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415
   model: amd/GLM-5.1-MXFP4
@@ -2549,15 +2494,7 @@ glm5.1-fp4-mi355x-sglang-agentic:
       # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
-# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0'
 kimik2.5-fp4-mi355x-vllm-agentic:
-  # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin
-  # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm
-  # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and
-  # includes all subsequent ROCm offload work.
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/Kimi-K2.5-MXFP4
   model-prefix: kimik2.5
@@ -2581,16 +2518,7 @@ kimik2.5-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
 
-# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
 minimaxm2.5-fp8-mi355x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector]
-  # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"),
-  # which enables SimpleCPUOffloadConnector on ROCm. Required for the
-  # cpu-offload sweep points to use the same offload path as the NVIDIA
-  # agentic-coding configs.
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
@@ -2608,12 +2536,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [48, 56, 64, 72, 96] }
 
-# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
 minimaxm2.5-fp8-mi300x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
@@ -2631,12 +2554,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
-# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf'
 minimaxm2.5-fp8-mi325x-vllm-agentic:
-  # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above.
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
@@ -2686,14 +2604,6 @@ dsv4-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] }
       - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] }
 
-# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-# Image is identical to the base entry (rocm/sgl-dev DSv4 build).
-# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware
-# comparability. Offload sweep is none-only (SGLang has no equivalent of
-# vLLM's SimpleCPUOffloadConnector path that we exercise on b200).
 dsv4-fp4-mi355x-sglang-agentic:
   image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -2708,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic:
       search-space:
       - { tp: 8, offloading: none, conc-list: [16, 32, 64] }
       - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] }
-
-# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
-# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged
-# on 2026-05-05, so any nightly built after that includes the
-# DeepseekV4ForCausalLM model class.
-#
-# IMPORTANT: pin to a digest-suffixed nightly tag rather than the
-# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs
-# files keyed on the image string and short-circuits re-import if the
-# file already exists, so the floating tag silently keeps a stale build
-# even after Docker Hub updates `:nightly`.
-#
-# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the
-# rest); InferenceX classifies this as fp4 — same as the sister sglang
-# and atom DSv4 mi355x entries below. Image and serving flags follow the
-# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp
-# executor, triton_unfused MoE (required for the FP4 expert format),
-# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192,
-# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
-# probe to validate the ROCm DP+EP path.
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index d7791fa11..77c5d17ce 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2064,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp:
       search-space:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
 
-# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml
-# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads
-# as `bmk_agentic_*`). Re-enable once that workflow is aligned.
-# qwen3.5-bf16-b200-sglang-agentic:
-#   image: lmsysorg/sglang:v0.5.12-cu130
-#   model: Qwen/Qwen3.5-397B-A17B
-#   model-prefix: qwen3.5
-#   runner: b200
-#   precision: bf16
-#   framework: sglang
-#   multinode: false
-#   scenarios:
-#     agentic-coding:
-#     - duration: 1800
-#       search-space:
-#       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-
 qwen3.5-fp8-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2331,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
     
-
 qwen3.5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -8818,7 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp:
           ep: 8
           dp-attn: true
 
-
 qwen3.5-fp8-h100-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9277,23 +9258,6 @@ glm5-fp8-gb300-dynamo-sglang:
           ep: 1
           dp-attn: false
 
-# ============================================================================
-# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
-# Recipes that ALREADY existed on main were intentionally left at main's version
-# to preserve main behavior; PR-branch modifications to those recipes are NOT
-# brought in here.
-# ============================================================================
-# =============================================================================
-# Agentic configs
-# -----------------------------------------------------------------------------
-# All entries below run the agentic-coding scenario (Weka trace replay).
-# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only.
-# =============================================================================
-
-# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200-dsv4' -> 'b200-dgxc'
 dsv4-fp4-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -9312,11 +9276,6 @@ dsv4-fp4-b200-vllm-agentic:
       - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
 
-# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main.
 qwen3.5-fp8-b200-sglang-agentic:
   image: lmsysorg/sglang:nightly-dev-20260422-de962f32
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -9331,11 +9290,6 @@ qwen3.5-fp8-b200-sglang-agentic:
       search-space:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
 
-# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is
-# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main.
 glm5-fp8-b200-sglang-agentic:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
@@ -9351,19 +9305,7 @@ glm5-fp8-b200-sglang-agentic:
       # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'b200' -> 'b200-dgxc'
 kimik2.5-int4-b200-vllm-agentic:
-  # Bumped from v0.19.1 — that release tripped a bug in
-  # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to')
-  # during warmup `profile_run` on the agentic-coding path
-  # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the
-  # flashinfer fix.
   image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
@@ -9382,10 +9324,6 @@ kimik2.5-int4-b200-vllm-agentic:
 # does not have a B300-specific recipe, so this config reuses the existing
 # Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - runner: 'h200' -> 'h200-dgxc'
 kimik2.5-int4-h200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: moonshotai/Kimi-K2.5
@@ -9405,18 +9343,7 @@ kimik2.5-int4-h200-vllm-agentic:
       - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] }
       - { tp: 8, offloading: cpu,  conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
-# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2'
-#   - runner: 'b200' -> 'b200-dgxc'
 kimik2.5-fp4-b200-vllm-agentic:
-  # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that
-  # cleared the agentic-coding warmup crash on max_model_len=131072 +
-  # prefix caching.
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
@@ -9437,14 +9364,6 @@ kimik2.5-fp4-b200-vllm-agentic:
 # does not have a B300-specific recipe, so this config reuses the existing
 # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130'
-#   - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4'
-#   - model-prefix: 'dsr1' -> 'kimik2.5'
-#   - precision: 'fp8' -> 'fp4'
-#   - framework: 'sglang' -> 'vllm'
 kimik2.5-fp4-b300-vllm-agentic:
   # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
   # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
@@ -9465,13 +9384,6 @@ kimik2.5-fp4-b300-vllm-agentic:
       - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
       - { tp: 8, ep: 1, offloading: cpu,  conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] }
 
-# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only).
-# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper
-# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up.
-# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below;
-# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129'
 dsv4-fp8-h200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -9490,11 +9402,6 @@ dsv4-fp8-h200-vllm-agentic:
 # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 
-# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
 dsv4-fp4-b300-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -9514,10 +9421,6 @@ dsv4-fp4-b300-vllm-agentic:
       - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
 
-# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below;
-# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1'
 gptoss-fp4-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: openai/gpt-oss-120b
@@ -9535,11 +9438,6 @@ gptoss-fp4-b200-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
       - { tp: 8, offloading: cpu,  conc-list: [64, 96, 128, 192, 256] }
 
-# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
-#   - runner: 'b200' -> 'b200-dgxc'
 minimaxm2.5-fp8-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -9562,10 +9460,6 @@ minimaxm2.5-fp8-b200-vllm-agentic:
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
 minimaxm2.5-fp8-b300-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -9587,11 +9481,6 @@ minimaxm2.5-fp8-b300-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
       - { tp: 4, offloading: cpu,  conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] }
 
-# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp4-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: nvidia/MiniMax-M2.5-NVFP4
@@ -9610,11 +9499,6 @@ minimaxm2.5-fp4-b200-vllm-agentic:
   # does not have a B300-specific recipe, so this config reuses the existing
   # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp8-h100-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -9633,11 +9517,6 @@ minimaxm2.5-fp8-h100-vllm-agentic:
       - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] }
       - { tp: 4, ep: 4, offloading: cpu,  conc-list: [5, 6, 7, 8, 10, 12] }
 
-# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main.
 minimaxm2.5-fp8-h200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -9711,15 +9590,6 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache:
 # does not have a B300-specific recipe, so this config reuses the existing
 # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available.
 
-# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons
-# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to
-# origin/main so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape
-#     mirroring the conc=192 point in the base entry's fixed-seq-len sweep.
-#   - additional-settings.CONFIG_FILE: points at the new agentic recipe under
-#     recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh
-#     overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC
-#     branch). Local-overlay pattern mirrors the existing 8k1k overlay.
 dsv4-fp4-gb300-dynamo-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -9868,16 +9738,6 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic:
           ep: 8
           dp-attn: true
 
-# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below;
-# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main
-# so its fixed-seq-len sweep is unaffected.
-#   - scenarios: replaced fixed-seq-len with agentic-coding.
-#   - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster).
-# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130).
-# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with-
-# subagents corpus. hicache arm capped at conc 16 since high-conc + hicache
-# tends to flake on first runs and conc 16 covers the cliff. The bench script
-# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant.
 qwen3.5-fp8-h100-sglang-agentic:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8

From 32572755524d98283c5339350a049fd7c6aad43d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 14:42:39 -0500
Subject: [PATCH 05/15] chore(aiperf): bump submodule for 060226 loader
 allowlist fix

Picks up SemiAnalysisAI/aiperf@47e6e206, which adds the 060226 and
060226_256k loader names to the inferencex-agentx-mvp scenario's
require_loader allowlist. Without this bump, dispatching any non-DSv4
agentic run on this branch fails preflight because benchmark_lib.sh
now defaults the loader to the 060226 corpus.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 utils/aiperf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/aiperf b/utils/aiperf
index de3ad1c18..47e6e2060 160000
--- a/utils/aiperf
+++ b/utils/aiperf
@@ -1 +1 @@
-Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1
+Subproject commit 47e6e206001a85a3cc4c6212a1e0425f045bbcb3

From 321fd445c301c5c52901b8f37e295ee38a10f39f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 14:58:48 -0500
Subject: [PATCH 06/15] (testing) b300 dsv4 simple offloading

---
 .github/configs/nvidia-master.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 77c5d17ce..5b0792d08 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9403,7 +9403,9 @@ dsv4-fp8-h200-vllm-agentic:
 # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
 
 dsv4-fp4-b300-vllm-agentic:
-  image: vllm/vllm-openai:v0.22.0
+  # image: vllm/vllm-openai:v0.22.0
+  # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f
+  image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 32839349559d13a51537879b32dd05e8f60e0661 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 15:22:03 -0500
Subject: [PATCH 07/15] runners(b300-nv): remap container UID to root for
 apt-get install

Same root cause as 967c50ca (h200-dgxc-slurm fix): vllm/vllm-openai
images ship as non-root, and on b300-nv the pyxis/enroot config does
NOT implicitly remap the calling user to UID 0 inside the container.
benchmark_lib.sh::install_agentic_deps runs apt-get install -y git,
which fails with "dpkg: error: requested operation requires superuser
privilege" (see run 26844610474 / dsv4 b300 simple offloading).

Adding --container-remap-root to the srun line matches b200-dgxc and
h200-dgxc-slurm behavior; benchmark_lib.sh stays untouched.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 67e8b48cc..cb4a634c3 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -387,6 +387,7 @@ else
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
+        --container-remap-root \
         --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash "$BENCH_SCRIPT"

From 360bcf089130808b0f2a3a249dfdd38e19772c1b Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 15:45:01 -0500
Subject: [PATCH 08/15] benchmarks(agentic): skip hf download when MODEL_PATH
 is pre-staged

Replaces the simple unguarded download in every agentic recipe:

  - if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

with the same MODEL_PATH-aware logic that the fixed-seq-len B300 recipes
already use:

  if [[ -n "${MODEL_PATH:-}" ]]; then
      if [[ ! -d "$MODEL_PATH" || empty ]]; then
          hf download "$MODEL" --local-dir "$MODEL_PATH"
      fi
  else
      hf download "$MODEL"
      export MODEL_PATH="$MODEL"
  fi

Effect: on clusters where launch_*.sh exports MODEL_PATH pointing at a
pre-staged on-node copy (e.g. b300-nv sets it to
/scratch/models/<basename>), the agentic recipe now correctly short-
circuits the hf-download instead of re-pulling 700 GB of DSv4-Pro
into $HOME/.cache/huggingface every run.

Touches 33 scripts; same edit in each.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsr1_fp4_b200.sh      | 12 +++++++++++-
 benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 12 +++++++++++-
 benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 12 +++++++++++-
 .../single_node/agentic/dsv4_fp4_mi355x_sglang.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/dsv4_fp8_h200.sh      | 12 +++++++++++-
 benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh  | 12 +++++++++++-
 benchmarks/single_node/agentic/glm5_fp8_b200.sh      | 12 +++++++++++-
 benchmarks/single_node/agentic/gptoss_fp4_b200.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/gptoss_fp4_h100.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/gptoss_fp4_h200.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh  | 12 +++++++++++-
 benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh  | 12 +++++++++++-
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh  | 12 +++++++++++-
 benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh  | 12 +++++++++++-
 .../single_node/agentic/kimik2.5_fp4_mi355x.sh       | 12 +++++++++++-
 benchmarks/single_node/agentic/kimik2.5_int4_b200.sh | 12 +++++++++++-
 benchmarks/single_node/agentic/kimik2.5_int4_h100.sh | 12 +++++++++++-
 benchmarks/single_node/agentic/kimik2.5_int4_h200.sh | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp4_b200.sh      | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_b200.sh      | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_b300.sh      | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_h100.sh      | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_h200.sh      | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_mi300x.sh    | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_mi325x.sh    | 12 +++++++++++-
 .../single_node/agentic/minimaxm2.5_fp8_mi355x.sh    | 12 +++++++++++-
 benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh  | 12 +++++++++++-
 benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh   | 12 +++++++++++-
 .../single_node/agentic/qwen3.5_fp8_b300_sglang.sh   | 12 +++++++++++-
 benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh   | 12 +++++++++++-
 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh | 12 +++++++++++-
 .../single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 12 +++++++++++-
 33 files changed, 363 insertions(+), 33 deletions(-)

diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index f9955adc7..23cf71e7d 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE and set MODEL_PATH to $MODEL.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index ff76b768d..c67fc7ebf 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE and set MODEL_PATH to $MODEL.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 108347479..7bc18ce22 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -38,7 +38,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE and set MODEL_PATH to $MODEL.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index f6748a5f8..7a130673d 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -32,7 +32,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE and set MODEL_PATH to $MODEL.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index 99aec25fe..ab2897d88 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index 0a0177983..c1e2f50b3 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 500b456f5..5987a789e 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 259c19586..3d601193f 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
index 6e921db58..ec8c4c9f8 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
index 557986b0d..443bc8bcc 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
index 1592a8d5c..7a93c71c5 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
index eb1883ff1..8ca6d805c 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
index 99e29c819..6e41756a0 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi
 
 # If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory.
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index ad0b4495a..e5c87b14a 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index 8cebe4f20..8ab9672af 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index fd0ce3677..734f63766 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index 697d3fa45..ab91c99c5 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index 2fd3b381c..fa867d976 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index 97929e43e..08549e93a 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
index 38ef72b56..195b285c6 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
index 4ce131cba..af7c7a216 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
index 9f2d83a0b..d3ea641ef 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
index d21690da6..48f2ab388 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
index ed59991cb..15e5798c6 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
index 260bbdc68..add2a8fa0 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
index edac27a45..57746eef6 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index 39dd63293..eac820aa0 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index 4ba87976b..ee40e1855 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 3432af5c9..4d39f2c81 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
index 9d9c1d7d5..d926288ae 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
index 95f0397a0..9db72e569 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index aef9650ca..a78ee87b9 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
index 5427d0d31..f5e2d2e6f 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF_HUB_CACHE download.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 

From 57d4adb4fb5fbebc478f628c522a0a49cec9e072 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 16:23:34 -0500
Subject: [PATCH 09/15] benchmarks(agentic): launch server from MODEL_PATH, not
 the HF id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Companion to 360bcf08. That commit made the agentic recipes skip
hf-download when MODEL_PATH was already pre-staged — but the recipes
still invoked the server with the HF id ("vllm serve $MODEL" /
"--model-path $MODEL"), so the engine looked up the HF cache (now
empty, because we just skipped the download) and tried to download from
scratch itself. With the model not in cache, vllm/sglang would deadlock
in the auto-download path rather than fall through to a clean error.

This commit aligns every agentic recipe with the fixed-seq-len B300
pattern verbatim:

  vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
  python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL"

Net effect: server loads weights directly from /scratch/models/<name>/
(or wherever the launch script staged the model) and reports the HF id
as the served-model-name for downstream tooling.

Touches all 33 agentic scripts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsr1_fp4_b200.sh             | 2 +-
 benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh           | 2 +-
 benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh        | 2 +-
 benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh        | 2 +-
 benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh    | 2 +-
 benchmarks/single_node/agentic/dsv4_fp8_h200.sh             | 2 +-
 benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh         | 2 +-
 benchmarks/single_node/agentic/glm5_fp8_b200.sh             | 2 +-
 benchmarks/single_node/agentic/gptoss_fp4_b200.sh           | 2 +-
 benchmarks/single_node/agentic/gptoss_fp4_h100.sh           | 2 +-
 benchmarks/single_node/agentic/gptoss_fp4_h200.sh           | 2 +-
 benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh         | 2 +-
 benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh         | 2 +-
 benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh         | 2 +-
 benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh         | 2 +-
 benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh       | 2 +-
 benchmarks/single_node/agentic/kimik2.5_int4_b200.sh        | 2 +-
 benchmarks/single_node/agentic/kimik2.5_int4_h100.sh        | 2 +-
 benchmarks/single_node/agentic/kimik2.5_int4_h200.sh        | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh      | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh      | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh      | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh      | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh      | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh    | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh    | 2 +-
 benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh    | 2 +-
 benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh         | 2 +-
 benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh          | 2 +-
 benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh   | 2 +-
 benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh          | 2 +-
 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh        | 2 +-
 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 2 +-
 33 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
index 23cf71e7d..16dc3bfd5 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh
@@ -43,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path $MODEL \
+--model-path "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
index c67fc7ebf..3b2561fe2 100755
--- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh
@@ -44,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
index 7bc18ce22..e80008f71 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh
@@ -231,7 +231,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --trust-remote-code
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index 7a130673d..88f4b38f5 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -123,7 +123,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve "$MODEL" \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port "$PORT" \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
index ab2897d88..029c8ea7f 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh
@@ -145,7 +145,7 @@ fi
 
 echo "Starting sglang server..."
 python3 -m sglang.launch_server \
-    --model-path "$MODEL" \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
index c1e2f50b3..799c2bf26 100755
--- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh
@@ -50,7 +50,7 @@ export PYTHONNOUSERSITE=1
 
 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is
 # used for GPU allocation by the runner and as the DP size.
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
index 5987a789e..3b85a31cd 100755
--- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh
@@ -52,7 +52,7 @@ echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
index 3d601193f..b3597cf52 100755
--- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh
@@ -49,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --trust-remote-code \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
index ec8c4c9f8..80d70e724 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh
@@ -63,7 +63,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
index 443bc8bcc..13e32d315 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh
@@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
index 7a93c71c5..e0d967246 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh
@@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 export VLLM_MXFP4_USE_MARLIN=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --config "$RESULT_DIR/config.yaml" \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
index 8ca6d805c..ff597c9a4 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh
@@ -75,7 +75,7 @@ esac
 
 echo "Starting vllm server..."
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --attention-backend ROCM_AITER_UNIFIED_ATTN \
diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
index 6e41756a0..1f8c29351 100755
--- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh
@@ -74,7 +74,7 @@ esac
 
 echo "Starting vllm server..."
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --attention-backend ROCM_AITER_UNIFIED_ATTN \
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
index e5c87b14a..34b45c9ec 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh
@@ -188,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
index 8ab9672af..9667003e1 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh
@@ -95,7 +95,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index 734f63766..139b12256 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -796,7 +796,7 @@ export PYTHONNOUSERSITE=1
 
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
index ab91c99c5..5685f098c 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh
@@ -55,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
index fa867d976..cb6c67f4b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh
@@ -55,7 +55,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
index 08549e93a..1bfa0c33b 100755
--- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh
@@ -66,7 +66,7 @@ echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 export VLLM_USE_FLASHINFER_MOE_INT4=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --gpu-memory-utilization 0.95 \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
index 195b285c6..b4a63eff3 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh
@@ -68,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 $PARALLEL_ARGS \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
index af7c7a216..0724aba5b 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh
@@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
index d3ea641ef..c291a2ceb 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh
@@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0"
 export PYTHONNOUSERSITE=1
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
index 48f2ab388..516bc4696 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh
@@ -68,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
index 15e5798c6..e6343b8ba 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh
@@ -68,7 +68,7 @@ echo "Starting vllm server..."
 export TORCH_CUDA_ARCH_LIST="9.0"
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
index add2a8fa0..8988316d3 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh
@@ -74,7 +74,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
index 57746eef6..caa70de63 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh
@@ -71,7 +71,7 @@ echo "Starting vllm server..."
 export VLLM_ROCM_USE_AITER=1
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
index eac820aa0..cd114fe96 100755
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh
@@ -75,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export PYTHONNOUSERSITE=1
 
-vllm serve $MODEL \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \
 --host 0.0.0.0 \
 --port $PORT \
 --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
index ee40e1855..d06d82ec8 100755
--- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh
@@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
index 4d39f2c81..ad49b2b67 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh
@@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 
 python3 -m sglang.launch_server \
---model-path=$MODEL \
+--model-path="$MODEL_PATH" --served-model-name="$MODEL" \
 --host=0.0.0.0 \
 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
index d926288ae..4f9b12659 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh
@@ -95,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH" --served-model-name="$MODEL"
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
index 9db72e569..b280fff8b 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh
@@ -108,7 +108,7 @@ fi
 { set +x; } 2>/dev/null
 SGLANG_CMD=(
     python3 -m sglang.launch_server
-    --model-path="$MODEL"
+    --model-path="$MODEL_PATH" --served-model-name="$MODEL"
     --host=0.0.0.0
     --port="$PORT"
     --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index a78ee87b9..ff901b674 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -46,7 +46,7 @@ export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
index f5e2d2e6f..cdded8860 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -120,7 +120,7 @@ export PYTHONNOUSERSITE=1
 SGLANG_CMD=(
     python3 -m sglang.launch_server
     --attention-backend triton
-    --model-path "$MODEL"
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
     --host=0.0.0.0
     --port "$PORT"
     --tensor-parallel-size "$TP"

From 1bccc5cacf281a1221dac8f0558248f220786311 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 17:40:48 -0500
Subject: [PATCH 10/15] benchmarks(dsv4-b300): enable
 VLLM_PREFIX_CACHE_RETENTION_INTERVAL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The custom cquil/vllm-openai image integrates vllm-project/vllm#43447,
which fixes the DSv4 sliding-window prefix-cache eviction issue. But the
fix is opt-in via VLLM_PREFIX_CACHE_RETENTION_INTERVAL — without setting
it, vllm falls back to the legacy cache-every-segment path that this PR
was written to repair, so the trace-replay cache hit rate stays near 0%
even though the patched code is loaded.

Sets the env var to 32768 (32k tokens), matching the value the PR author
validated to take cache hit rate from 0% -> 74% on a comparable agentic
trace-replay benchmark.

On stock vllm images that don't carry the patch, the env var is simply
ignored — safe to land.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml                   | 4 ++++
 benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 5b0792d08..4d7785c2a 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9422,6 +9422,10 @@ dsv4-fp4-b300-vllm-agentic:
       - { tp: 8, offloading: cpu,  conc-list: [16, 32, 64] }
       - { tp: 4, ep: 4, dp-attn: true, offloading: cpu,  conc-list: [64, 128, 256] }
       - { tp: 8, ep: 8, dp-attn: true, offloading: cpu,  conc-list: [128, 256, 512] }
+      - { tp: 4, offloading: none,  conc-list: [16, 32, 64] }
+      - { tp: 8, offloading: none,  conc-list: [16, 32, 64] }
+      - { tp: 4, ep: 4, dp-attn: true, offloading: none,  conc-list: [64, 128, 256] }
+      - { tp: 8, ep: 8, dp-attn: true, offloading: none,  conc-list: [128, 256, 512] }
 
 gptoss-fp4-b200-vllm-agentic:
   image: vllm/vllm-openai:v0.22.0
diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index 88f4b38f5..837345423 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -52,6 +52,13 @@ install_agentic_deps
 # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s.
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 
+# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient
+# sliding-window allocations don't evict useful prefix entries. 32k matches
+# the trace-replay tuning the PR author validated (0% -> 74% hit rate).
+# Requires the custom image (cquil/vllm-openai:*-7ead0a0f...) that carries
+# the patch; on stock images the env var is ignored.
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768
+
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"

From 094610734aa2e88a4ccbfad503002d7810bc8f8f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 23:04:36 -0500
Subject: [PATCH 11/15] configs(dsv4-b300-vllm-agentic): bump cquil image to
 6c529f30 for retention-interval env
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

7ead0a0 only carried the "Prepend uncached blocks in SWA free()" hunk
of PR vllm-project/vllm#43447 — it did NOT modify vllm/envs.py to
register the VLLM_PREFIX_CACHE_RETENTION_INTERVAL env var. That
registration didn't land until commit 7c909f8 in the PR, and 6c529f30
is the latest merge of main into the PR branch.

Effect: the export in dsv4_fp4_b300_vllm.sh (1bccc5ca) finally takes
effect — vllm stops logging "Unknown vLLM environment variable detected"
and actually activates the SWA prefix-cache retention path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4d7785c2a..380c799e1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9404,8 +9404,8 @@ dsv4-fp8-h200-vllm-agentic:
 
 dsv4-fp4-b300-vllm-agentic:
   # image: vllm/vllm-openai:v0.22.0
-  # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f
-  image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f
+  # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045
+  image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300

From 38c365c77bf0cd1214ea6d1b81f7f7ed2c56b750 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Tue, 2 Jun 2026 23:09:58 -0500
Subject: [PATCH 12/15] benchmarks(dsv4-b300-vllm): override trace loader to
 060226 (v6)

DSv4 recipes inherit the benchmark_lib carveout that defaults to the
052726 corpus for backward-compat with prior published baselines. This
recipe is opting out to ride the v6 060226 corpus that all non-DSv4
recipes already use, exercising the newer CC versions / longer-tail
trace mix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
index 837345423..fdb7a49b6 100755
--- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh
@@ -46,6 +46,8 @@ fi
 nvidia-smi
 
 # ---- Resolve traces and install deps ----------------------------------------
+# Opt this recipe out of the DSv4 052726 default; use the v6 corpus.
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226
 resolve_trace_source
 install_agentic_deps
 

From ee8d74391ba7674ba77f67c1e764fc200be1956d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 3 Jun 2026 14:33:00 +0900
Subject: [PATCH 13/15] [AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic
 entries, update Qwen hicache config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml | 36 +++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 7f1c8192d..134af929a 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -872,6 +872,21 @@ minimaxm2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
+minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/MiniMax-M2.5-MXFP4
+  model-prefix: minimaxm2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
+      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] }
+
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/MiniMax-M2.5-MXFP4
@@ -2518,6 +2533,21 @@ kimik2.5-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
 
+kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
+  image: vllm/vllm-openai-rocm:v0.22.0
+  model: amd/Kimi-K2.5-MXFP4
+  model-prefix: kimik2.5
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-coding:
+    - duration: 1800
+      search-space:
+      - { tp: 4, ep: 1, offloading: none, conc-list:    [4, 8, 16, 32, 40, 48, 56, 64, 72] }
+      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] }
+
 minimaxm2.5-fp8-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -2574,19 +2604,19 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
+  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
   precision: fp8
   framework: sglang
   multinode: false
   scenarios:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
-      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
+      - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] }
+      - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] }
 
 dsv4-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0

From 616f4db634e13e57b3244853dd317bd3f8a5bd1c Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 3 Jun 2026 14:40:55 +0900
Subject: [PATCH 14/15] [AMD] agentx-v0.4: add MiniMax agentic script, refactor
 Kimi/Qwen scripts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../agentic/kimik2.5_fp4_mi355x.sh            | 674 ++----------------
 .../agentic/minimaxm2.5_fp4_mi355x.sh         | 256 +++++++
 .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 ++-
 .../agentic/qwen3.5_fp8_mi355x_sglang.sh      | 152 ----
 4 files changed, 397 insertions(+), 797 deletions(-)
 create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
 delete mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh

diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index 139b12256..d05b27253 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -14,15 +14,11 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
-
-# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
-# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
-# script we need the concrete value so AgentX filters prompt+max_tokens against
-# the same limit vLLM enforces.
-if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
-    MAX_MODEL_LEN=262144
-fi
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -33,557 +29,22 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-# `hf download` creates the target dir if missing and is itself idempotent.
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
-# Either way, MODEL_PATH is what the server is launched with.
-if [[ -n "${MODEL_PATH:-}" ]]; then
-    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
-        hf download "$MODEL" --local-dir "$MODEL_PATH"
-    fi
-else
-    hf download "$MODEL"
-    export MODEL_PATH="$MODEL"
-fi
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 rocm-smi || true
 amd-smi || true
 
+# ---- Select trace corpus ----------------------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered corpus, whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency. (TODO: confirm the 470 count for 060226.)
+# Previous 052726 loader: semianalysis_cc_traces_weka_with_subagents_256k
+# Current 060226 loader (256k-capped):
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
-pip install amd-quark
-
-# Disable AITER RMSNorm for TP < 8 due to accuracy issues
-if [ "${TP}" -lt 8 ]; then
-  export VLLM_ROCM_USE_AITER_RMSNORM=0
-fi
-
-write_lmcache_rocm_mp_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/sitecustomize.py" <<'PY'
-"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
-
-import os
-import threading
-
-if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
-    import builtins
-    import sys
-
-    _orig_import = builtins.__import__
-
-    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
-        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
-
-        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
-            return
-
-        _orig_init = _LazyMemoryAllocator.__init__
-        _orig_allocate = _LazyMemoryAllocator.allocate
-        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
-
-        def _expand_to(self, target_size: int) -> None:
-            target_size = min(
-                self._final_size,
-                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
-            )
-            lock = self._agentic_rocm_demand_expand_lock
-            with lock:
-                if target_size <= self._curr_size:
-                    return
-
-                start_size = self._curr_size
-                while self._curr_size < target_size:
-                    commit_start = self._curr_size
-                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
-                    while self._curr_size < commit_target:
-                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
-                        self._curr_size += self.PIN_CHUNK_SIZE
-                    self._commit_expansion(self._curr_size - commit_start)
-
-                self._log_expansion_progress(self._curr_size - start_size)
-
-        def _retry_with_demand_expansion(self, allocate_once):
-            obj = allocate_once()
-            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
-            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
-
-            while obj is None and self._curr_size < self._final_size:
-                _expand_to(self, self._curr_size + step_bytes)
-                obj = allocate_once()
-
-            return obj
-
-        def _patched_init(self, *args, **kwargs):
-            _orig_init(self, *args, **kwargs)
-            self._agentic_rocm_demand_expand_lock = threading.Lock()
-
-            # LMCache MP's upstream LazyMemoryAllocator currently expands to
-            # the final pinned size in a background thread. On ROCm Kimi TP4,
-            # vLLM reaches KV-cache registration only after that 2.5 TB pool
-            # is fully pinned, and the server-side IPC open path can stall
-            # before acknowledging register_kv_caches. Keep the same final
-            # capacity, but pin/commit extra host memory only when L1
-            # allocations actually need it.
-            self._stop_expand.set()
-            self._expand_thread.join()
-            _lazy_memory_allocator.logger.info(
-                "Agentic ROCm patch: using demand-driven LMCache pinned "
-                "memory expansion; final capacity remains %s MB",
-                self._final_size >> 20,
-            )
-
-        def _patched_allocate(
-            self,
-            shapes,
-            dtypes,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
-            )
-
-        def _patched_batched_allocate(
-            self,
-            shapes,
-            dtypes,
-            batch_size,
-            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
-            allocator_type=None,
-        ):
-            return _retry_with_demand_expansion(
-                self,
-                lambda: _orig_batched_allocate(
-                    self, shapes, dtypes, batch_size, fmt, allocator_type
-                ),
-            )
-
-        _LazyMemoryAllocator.__init__ = _patched_init
-        _LazyMemoryAllocator.allocate = _patched_allocate
-        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
-        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
-
-    def _patch_l1_memory_manager(_memory_manager) -> None:
-        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
-        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
-        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
-            return
-        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
-            return
-
-        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
-
-        def _patched_get_memory_usage(self):
-            allocator = getattr(self, "_allocator", None)
-            if isinstance(allocator, _LazyMemoryAllocator):
-                address_manager = allocator.get_address_manager()
-                used_size = (
-                    address_manager.get_heap_size() - address_manager.get_free_size()
-                )
-                return used_size, allocator._final_size
-            return _orig_get_memory_usage(self)
-
-        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
-        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
-
-    def _maybe_patch_lazy_memory_allocator() -> None:
-        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
-        if module is not None and hasattr(module, "LazyMemoryAllocator"):
-            _patch_lazy_memory_allocator(module)
-
-    def _maybe_patch_l1_memory_manager() -> None:
-        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
-        if module is not None and hasattr(module, "L1MemoryManager"):
-            _patch_l1_memory_manager(module)
-
-    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
-        module = _orig_import(name, globals, locals, fromlist, level)
-        if name == "lmcache.v1.lazy_memory_allocator" or (
-            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
-        ):
-            _maybe_patch_lazy_memory_allocator()
-        if name == "lmcache.v1.distributed.memory_manager" or (
-            name.startswith("lmcache")
-            and "lmcache.v1.distributed.memory_manager" in sys.modules
-        ):
-            _maybe_patch_l1_memory_manager()
-        return module
-
-    builtins.__import__ = _agentic_rocm_import
-    _maybe_patch_lazy_memory_allocator()
-    _maybe_patch_l1_memory_manager()
-
-if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
-    import torch
-    import lmcache.non_cuda_equivalents as lmc
-
-    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
-        _DTYPE_BY_NAME = {
-            "bfloat16": torch.bfloat16,
-            "float16": torch.float16,
-            "float32": torch.float32,
-        }
-
-        def _dtype_from_env() -> torch.dtype:
-            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
-            try:
-                return _DTYPE_BY_NAME[name]
-            except KeyError as exc:
-                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
-
-        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            block_stride = shape_desc.block_stride_elems or (
-                shape_desc.bs * shape_desc.nh * shape_desc.hs
-            )
-            base = lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.nb * block_stride,),
-                dtype,
-                device,
-            )
-            return torch.as_strided(
-                base,
-                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
-                (block_stride, shape_desc.nh * shape_desc.hs, 1),
-            )
-
-        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
-            return lmc._tensor_from_ptr(
-                ptr,
-                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
-                dtype,
-                device,
-            )
-
-        def multi_layer_block_kv_transfer(
-            group_kv_pointers,
-            tmp_buffer_ptrs,
-            block_ids,
-            paged_memory_device,
-            direction,
-            shape_desc,
-            lmcache_chunk_size,
-            gpu_kv_format,
-            skip_blocks=0,
-        ) -> None:
-            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
-            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
-            # fallback has no block-transfer entrypoint yet, so implement the
-            # same gather/scatter contract with torch indexing on ROCm.
-            if shape_desc.kv_size != 1:
-                raise NotImplementedError(
-                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
-                )
-
-            dtype = _dtype_from_env()
-            device = (
-                paged_memory_device
-                if isinstance(paged_memory_device, torch.device)
-                else torch.device(paged_memory_device)
-            )
-            num_layers = int(group_kv_pointers.numel())
-            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
-            direction_name = getattr(direction, "name", str(direction))
-
-            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
-                start = chunk_idx * blocks_per_chunk
-                end = start + blocks_per_chunk
-                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
-
-                dest_slot_offset = 0
-                if skip_blocks and chunk_idx == 0:
-                    chunk_blocks = chunk_blocks[int(skip_blocks):]
-                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
-                if chunk_blocks.numel() == 0:
-                    continue
-
-                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
-                tmp = _tmp_view(
-                    int(tmp_ptr),
-                    shape_desc,
-                    num_layers,
-                    lmcache_chunk_size,
-                    dtype,
-                    device,
-                )
-
-                for layer_idx in range(num_layers):
-                    paged = _paged_view(
-                        int(group_kv_pointers[layer_idx].item()),
-                        shape_desc,
-                        dtype,
-                        device,
-                    )
-                    tmp_slice = tmp[
-                        0,
-                        layer_idx,
-                        dest_slot_offset : dest_slot_offset + num_slots,
-                        :,
-                    ]
-                    if direction_name == "D2H":
-                        gathered = paged.index_select(0, chunk_blocks).reshape(
-                            num_slots, shape_desc.nh * shape_desc.hs
-                        )
-                        tmp_slice.copy_(gathered)
-                    elif direction_name == "H2D":
-                        src = tmp_slice.reshape(
-                            int(chunk_blocks.numel()),
-                            shape_desc.bs,
-                            shape_desc.nh * shape_desc.hs,
-                        )
-                        paged.index_copy_(0, chunk_blocks, src)
-                    else:
-                        raise ValueError(f"Unsupported transfer direction: {direction}")
-
-        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
-
-# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ----
-if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0":
-    import chunked_connector_patch  # noqa: F401
-
-# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ----
-import scheduler_assertion_patch  # noqa: F401
-PY
-}
-
-write_chunked_connector_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/chunked_connector_patch.py" <<'PY'
-"""
-Monkey-patch for LMCacheMPConnector to add chunked KV loading.
-
-Fixes GPU block exhaustion deadlock at high concurrency by capping
-the number of external tokens reported AND retrieved per scheduling step.
-
-Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=<tokens> and import this
-module from sitecustomize.py before LMCache is loaded.
-"""
-
-import logging
-import os
-import sys
-import builtins
-
-logger = logging.getLogger("chunked_lmcache_patch")
-
-_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768"))
-
-# Per-request chunk tracking (module-level, survives across calls)
-_chunk_state: dict[str, dict] = {}
-
-
-def _apply_patch():
-    """Patch LMCacheMPConnector in-place."""
-    mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector")
-    if mod is None:
-        return
-    cls = getattr(mod, "LMCacheMPConnector", None)
-    if cls is None or getattr(cls, "_chunked_patch_applied", False):
-        return
-
-    LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None)
-    _orig_get_matched = cls.get_num_new_matched_tokens
-    _orig_get_finished = cls.get_finished
-
-    def _get_blocks_per_chunk(self):
-        block_size = getattr(self, "block_size", 1)
-        return max(1, _MAX_TOKENS // block_size)
-
-    def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens):
-        full_match = _orig_get_matched(self, request, num_computed_tokens)
-        if full_match <= 0 or _MAX_TOKENS <= 0:
-            return full_match
-
-        req_id = request.request_id
-        block_size = getattr(self, "block_size", 1)
-        blocks_per_chunk = _get_blocks_per_chunk(self)
-        full_match_blocks = full_match // block_size
-
-        state = _chunk_state.get(req_id)
-        if state is None or state.get("num_computed_at_start") != num_computed_tokens:
-            state = {
-                "full_match_blocks": full_match_blocks,
-                "chunk_end_blocks": 0,
-                "num_computed_at_start": num_computed_tokens,
-                "lookup_done": False,
-            }
-            _chunk_state[req_id] = state
-
-        if state["lookup_done"]:
-            return 0
-
-        remaining = state["full_match_blocks"] - state["chunk_end_blocks"]
-        if remaining <= 0:
-            state["lookup_done"] = True
-            return 0
-
-        this_chunk = min(remaining, blocks_per_chunk)
-        state["chunk_end_blocks"] += this_chunk
-        if state["chunk_end_blocks"] >= state["full_match_blocks"]:
-            state["lookup_done"] = True
-
-        capped = this_chunk * block_size
-        if capped < full_match:
-            logger.debug(
-                "Chunked LMCache: req %s capped %d -> %d tokens "
-                "(chunk %d/%d blocks)",
-                req_id, full_match, capped, this_chunk, full_match_blocks,
-            )
-
-        # Cap the tracker's hit blocks to match what we report
-        tracker = getattr(request, "kv_transfer_params", None)
-        if tracker is not None:
-            orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0)
-            if orig_hits > this_chunk:
-                tracker.num_lmcache_hit_blocks = this_chunk
-
-        return capped
-
-    def _patched_get_finished(self, scheduler_output):
-        result = _orig_get_finished(self, scheduler_output)
-        # Clean up chunk state for finished requests.
-        # vLLM passes scheduler_output as a set of request-ID strings
-        # (not a SchedulerOutput object), so iterate directly when it
-        # is a set/frozenset; fall back to the attribute path for
-        # forward compatibility.
-        if isinstance(scheduler_output, (set, frozenset)):
-            finished = scheduler_output
-        else:
-            finished = getattr(scheduler_output, "finished_req_ids", [])
-        for req in finished:
-            _chunk_state.pop(req, None)
-        return result
-
-    cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens
-    cls.get_finished = _patched_get_finished
-    cls._chunked_patch_applied = True
-    logger.info(
-        "Chunked LMCache connector patch applied "
-        "(max_tokens_per_load=%d)", _MAX_TOKENS,
-    )
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "lmcache.integration.vllm.lmcache_mp_connector"
-        or (
-            name.startswith("lmcache")
-            and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-write_scheduler_assertion_patch() {
-    local patch_dir="$1"
-    mkdir -p "$patch_dir"
-    cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY'
-"""
-Patch vLLM scheduler to handle stale finished_recving gracefully.
-
-The assertion at scheduler.py crashes when a KV transfer reports
-"finished recving" but the request is already in RUNNING state.
-This happens when transfers complete asynchronously and the scheduler
-has already moved the request forward.
-
-Fix: Instead of asserting, log a warning and skip.
-"""
-
-import logging
-import sys
-import builtins
-
-logger = logging.getLogger("scheduler_assertion_patch")
-
-
-def _apply_patch():
-    """Patch vLLM scheduler's _update_from_kv_xfer_finished."""
-    sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler")
-    if sched_mod is None:
-        return
-    req_mod = sys.modules.get("vllm.v1.request")
-    if req_mod is None:
-        return
-    Scheduler = getattr(sched_mod, "Scheduler", None)
-    RequestStatus = getattr(req_mod, "RequestStatus", None)
-    if Scheduler is None or RequestStatus is None:
-        return
-    if getattr(Scheduler, "_kv_xfer_patch_applied", False):
-        return
-
-    _orig_update = Scheduler._update_from_kv_xfer_finished
-
-    def _patched_update(self, kv_connector_output):
-        if self.connector is not None:
-            self.connector.update_connector_output(kv_connector_output)
-        for req_id in kv_connector_output.finished_recving or ():
-            if req_id not in self.requests:
-                continue
-            req = self.requests[req_id]
-            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
-                self.finished_recving_kv_req_ids.add(req_id)
-            elif RequestStatus.is_finished(req.status):
-                self._free_blocks(self.requests[req_id])
-            else:
-                logger.warning(
-                    "Stale finished_recving for req %s in status %s; skipping.",
-                    req_id, req.status.name,
-                )
-        for req_id in kv_connector_output.finished_sending or ():
-            if req_id not in self.requests:
-                continue
-            self._free_blocks(self.requests[req_id])
-
-    Scheduler._update_from_kv_xfer_finished = _patched_update
-    Scheduler._kv_xfer_patch_applied = True
-    logger.info("Scheduler KV transfer assertion patch applied")
-
-
-_orig_import = builtins.__import__
-
-
-def _patching_import(name, *args, **kwargs):
-    module = _orig_import(name, *args, **kwargs)
-    if (
-        name == "vllm.v1.core.sched.scheduler"
-        or (
-            name.startswith("vllm")
-            and "vllm.v1.core.sched.scheduler" in sys.modules
-        )
-    ):
-        _apply_patch()
-    return module
-
-
-builtins.__import__ = _patching_import
-_apply_patch()
-PY
-}
-
-# Workaround for MEC FW <177 RCCL memory reclaim issue
-version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
-if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
-    export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
@@ -591,6 +52,8 @@ mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
 PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
 LMCACHE_PID=""
 
 cleanup_lmcache_server() {
@@ -648,7 +111,9 @@ case "$OFFLOADING" in
         # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
         # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
         # worker RSS / page cache / slurm cgroup).
-        TOTAL_CPU_DRAM_GB=2500
+        # TODO: 3000 GB disagrees with the 2.5 TB pool described above; reconcile.
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"  # TP/8 share; multiply first so TP>8 cannot divide by zero
         # Use vLLM's regular native KV-offload path (OffloadingConnector),
         # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
         # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
@@ -659,7 +124,7 @@ case "$OFFLOADING" in
         # (vllm/config/vllm.py:662).
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
-            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
             --disable-hybrid-kv-cache-manager
         )
         ;;
@@ -667,74 +132,20 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        agentic_pip_install --quiet --no-cache-dir lmcache
-        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
-        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
-        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
-        # that module is importable, even when this run is not using EP/NIXL
-        # kernels. The CUDA extension then fails immediately on AMD nodes with
-        # "ImportError: libcuda.so.1".
-        #
-        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
-        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
-        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
-        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
-        python3 -m pip uninstall -y \
-            nixl nixl-cu12 nixl-cu13 nixl_ep \
-            >/dev/null 2>&1 || true
-        python3 -m pip uninstall -y \
-            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
-            >/dev/null 2>&1 || true
-        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
-        python3 - <<'PY'
-import importlib.util
-import sys
-
-spec = importlib.util.find_spec("nixl_ep")
-if spec is not None:
-    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
-    print(
-        "Error: nixl_ep is still importable after LMCache install; "
-        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
-        f"location={locations}",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-
-try:
-    from cupy_backends.cuda.api import runtime as cupy_runtime
-except Exception as exc:
-    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
-    sys.exit(1)
-
-if not getattr(cupy_runtime, "is_hip", False):
-    print(
-        "Error: CuPy is still using the CUDA backend after installing "
-        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
-        file=sys.stderr,
-    )
-    sys.exit(1)
-PY
-        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
-        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR"
-        write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR"
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
-        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
-        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
-        # Cap external KV tokens loaded per scheduling step to prevent GPU
-        # block exhaustion deadlock at high concurrency (c>=32).  Default
-        # 32768 keeps peak block demand within the GPU KV pool.  Set to 0 to
-        # disable chunking (only safe at low concurrency).
-        export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}"
-        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
+        [[ -d LMCache/.git ]] || git clone https://github.com/LMCache/LMCache.git  # skip when a checkout already exists
+        (  # subshell: the caller's working directory is untouched even if a step fails
+            cd LMCache
+            pip install -r requirements/build.txt
+            CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation
+        )
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
         # pool, but let the external MP server own that pool so vLLM does not
         # split --kv-offloading-size across TP ranks through the integrated
         # LMCache backend.
-        TOTAL_CPU_DRAM_GB=2500
+        # TODO: 3000 GB disagrees with the 2.5 TB pool described above; reconcile.
+        TOTAL_CPU_DRAM_GB=3000
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
@@ -742,7 +153,7 @@ PY
         # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
         # ZMQ-style host string.
         LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB * TP / 8))}"  # TP/8 share; multiply first so TP>8 cannot divide by zero
         LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
         # LMCache read locks are leases on chunks that lookup has promised
         # vLLM can retrieve. The default 300s TTL is too short for this
@@ -750,10 +161,11 @@ PY
         # lookup and retrieve while GPU KV is saturated, which leaves the
         # object present in L1 but no longer readable. Keep the 2.5 TB pool
         # size unchanged and only extend the lookup-to-retrieve lease.
-        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
         LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
         LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
         export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS="${LMCACHE_BLOCKING_TIMEOUT_SECS:-120}"  # overridable, like the other LMCACHE_* knobs
 
         echo "Starting LMCache MP server..."
         LMCACHE_CMD=(
@@ -786,6 +198,7 @@ PY
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
+# ---- LLM server config ----------------------------------------------------------
 EP_ARGS=()
 if [ "$EP_SIZE" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
@@ -794,17 +207,34 @@ fi
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install amd-quark
+
+# Disable AITER RMSNorm for TP < 8 due to accuracy issues
+if [ "${TP}" -lt 8 ]; then
+  export VLLM_ROCM_USE_AITER_RMSNORM=0
+fi
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
+    vllm serve "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
     "${EP_ARGS[@]}"
     --gpu-memory-utilization 0.90
+    --kv-cache-dtype fp8
     --block-size=1
     --trust-remote-code
-    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     --mm-encoder-tp-mode data
     "${PREFIX_CACHE_ARGS[@]}"
@@ -821,4 +251,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
new file mode 100755
index 000000000..f36fc59e9
--- /dev/null
+++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
@@ -0,0 +1,256 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
+#
+# OFFLOADING values:
+#   none    - vLLM GPU KV only.
+#   cpu     - vLLM native CPU offload.
+#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# ROCR/HIP visibility for vLLM 0.14+
+if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
+# corpus has requests up to ~1M proxy tokens that would be rejected.
+# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
+mkdir -p "$RESULT_DIR"
+
+OFFLOAD_ARGS=()
+PREFIX_CACHE_ARGS=()
+
+# ---- Lmcache config ----------------------------------------------------------
+LMCACHE_PID=""
+
+cleanup_lmcache_server() {
+    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
+        kill "$LMCACHE_PID" 2>/dev/null || true
+        wait "$LMCACHE_PID" 2>/dev/null || true
+    fi
+}
+
+trap cleanup_lmcache_server EXIT
+
+wait_for_lmcache_ready() {
+    { set +x; } 2>/dev/null
+    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
+    local tail_pid=""
+
+    while [ ! -f "$LMCACHE_LOG" ]; do
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before creating log file. Exiting." >&2
+            exit 1
+        fi
+        sleep 1
+    done
+
+    tail -f -n +1 "$LMCACHE_LOG" &
+    tail_pid=$!
+
+    for ((i = 1; i <= attempts; i++)); do
+        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            return 0
+        fi
+        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
+            echo "LMCache server died before becoming healthy. Log follows:" >&2
+            kill "$tail_pid" 2>/dev/null || true
+            wait "$tail_pid" 2>/dev/null || true
+            cat "$LMCACHE_LOG" >&2 || true
+            exit 1
+        fi
+        sleep 1
+    done
+
+    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
+    kill "$tail_pid" 2>/dev/null || true
+    wait "$tail_pid" 2>/dev/null || true
+    cat "$LMCACHE_LOG" >&2 || true
+    exit 1
+}
+
+case "$OFFLOADING" in
+    none) ;;
+    cpu)
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+        # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
+        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
+        # worker RSS / page cache / slurm cgroup).
+        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        # Use vLLM's regular native KV-offload path (OffloadingConnector),
+        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
+        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
+        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
+        # that env var UNSET here so the regular OffloadingConnector path is
+        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
+        # form constructs the KVTransferConfig at engine startup
+        # (vllm/config/vllm.py:662).
+
+        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
+        # This gives extra cache hit than disabling hybrid kv cache manager
+        # srok,
+        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv_offloading_backend native
+            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    lmcache)
+        { set +x; } 2>/dev/null
+        unset VLLM_USE_SIMPLE_KV_OFFLOAD
+
+        git clone https://github.com/LMCache/LMCache.git
+        cd LMCache
+        pip install -r requirements/build.txt 
+        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
+        cd ..
+
+        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
+
+        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
+        # pool, but let the external MP server own that pool so vLLM does not
+        # split --kv-offloading-size across TP ranks through the integrated
+        # LMCache backend.
+        TOTAL_CPU_DRAM_GB=3000
+        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
+        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
+        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
+        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
+        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
+        # ZMQ-style host string.
+        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
+        # LMCache read locks are leases on chunks that lookup has promised
+        # vLLM can retrieve. The default 300s TTL is too short for this
+        # long-context agentic queue: TP8/conc32 can spend >300s between
+        # lookup and retrieve while GPU KV is saturated, which leaves the
+        # object present in L1 but no longer readable. Keep the 2.5 TB pool
+        # size unchanged and only extend the lookup-to-retrieve lease.
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
+        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
+        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
+        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
+
+        set -x
+        echo "Starting LMCache MP server..."
+        LMCACHE_CMD=(
+            lmcache server
+            --host "$LMCACHE_HOST"
+            --port "$LMCACHE_PORT"
+            --http-host "$LMCACHE_HOST"
+            --http-port "$LMCACHE_HTTP_PORT"
+            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
+            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
+            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
+            --chunk-size "$LMCACHE_CHUNK_SIZE"
+            --max-workers "$LMCACHE_MAX_WORKERS"
+            --eviction-policy LRU
+        )
+        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
+        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
+        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
+        LMCACHE_PID=$!
+        echo "LMCache server PID: $LMCACHE_PID"
+        wait_for_lmcache_ready
+
+        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
+        # srok,
+        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
+        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
+        OFFLOAD_ARGS=(
+            --kv-transfer-config
+            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
+            --disable-hybrid-kv-cache-manager
+        )
+        ;;
+    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
+esac
+
+# ---- LLM server config ----------------------------------------------------------
+EP_ARGS=()
+if [ "$EP_SIZE" -gt 1 ]; then
+    EP_ARGS=(--enable-expert-parallel)
+fi
+
+echo "Starting vllm server..."
+export PYTHONNOUSERSITE=1
+
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install -q amd-quark
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+{ set +x; } 2>/dev/null
+VLLM_CMD=(
+    vllm serve "$MODEL"
+    --host 0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size="$TP"
+    "${EP_ARGS[@]}"
+    --gpu-memory-utilization 0.95
+    --kv-cache-dtype fp8 \
+    --block-size=32
+    --trust-remote-code
+    --attention-backend "ROCM_AITER_FA" 
+    --max-num-seqs "$CONC"
+    "${PREFIX_CACHE_ARGS[@]}"
+    "${OFFLOAD_ARGS[@]}"
+)
+printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
+"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index ff901b674..656e924dc 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -2,51 +2,117 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
+#
+# Base server recipe follows the upstream MI300X reference
+# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
+# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
+# The agentic harness (resolve_trace_source / build_replay_cmd /
+# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
+# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
 #
 # Required env vars:
-#   MODEL, TP, CONC, RESULT_DIR
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
+#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
 
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
+PORT=${PORT:-8888}
+DURATION=${DURATION:-1800}
+EP_SIZE=${EP_SIZE:-1}
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-# `hf download` creates the target dir if missing and is itself idempotent.
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
-# Either way, MODEL_PATH is what the server is launched with.
-if [[ -n "${MODEL_PATH:-}" ]]; then
-    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
-        hf download "$MODEL" --local-dir "$MODEL_PATH"
-    fi
-else
-    hf download "$MODEL"
-    export MODEL_PATH="$MODEL"
-fi
+if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 rocm-smi || true
 amd-smi || true
 
+# ---- Resolve traces and install deps ----------------------------------------
+# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
+# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
+# signal at high concurrency.
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
+#060226
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
+
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Start SGLang server ----------------------------------------------------
+# ---- Cache / offload config -------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
+        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
+        # node-total DRAM budget divides by TP and the host-pool count.
+        TOTAL_CPU_DRAM_GB=3000
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
+        # requires page_size=1. Keep the safer direct/layer_first copy path;
+        # kernel/page_first faults on first prefill in this mode on ROCm.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness but SGLang's internal warmup
+        # request can time out on this path; let aiperf own benchmark traffic.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Don't force ROCm graph capture at every high concurrency point; conc=16
+        # is the highest known-good capture size for this model/server path.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
+    --model-path $MODEL \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
@@ -56,10 +122,10 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --max-prefill-tokens 32768 \
-    --scheduler-recv-interval 30 \
+    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
     --mem-fraction-static 0.8 \
-    --context-length $MAX_MODEL_LEN \
+    "${CACHE_ARGS[@]}" \
+    "${WARMUP_ARGS[@]}" \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -69,4 +135,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
deleted file mode 100755
index cdded8860..000000000
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
-#
-# Required env vars:
-#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
-#
-# OFFLOADING values:
-#   none    - SGLang GPU KV only with radix cache disabled.
-#   hicache - SGLang HiCache with local CPU hierarchical cache.
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
-
-SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
-if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
-    MAX_MODEL_LEN=131072
-fi
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-# `hf download` creates the target dir if missing and is itself idempotent.
-# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
-# Either way, MODEL_PATH is what the server is launched with.
-if [[ -n "${MODEL_PATH:-}" ]]; then
-    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
-        hf download "$MODEL" --local-dir "$MODEL_PATH"
-    fi
-else
-    hf download "$MODEL"
-    export MODEL_PATH="$MODEL"
-fi
-rocm-smi || true
-amd-smi || true
-
-# ---- Resolve traces and install deps ----------------------------------------
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-mkdir -p "$RESULT_DIR"
-
-CACHE_ARGS=()
-WARMUP_ARGS=()
-CUDA_GRAPH_MAX_BS="$CONC"
-case "$OFFLOADING" in
-    none)
-        # Leave SGLang's default RadixAttention prefix cache on — agentic
-        # replay needs it; --disable-radix-cache would zero the hit rate.
-        ;;
-    hicache)
-        # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid
-        # GDN/Mamba path allocates two HiCache host pools per TP rank: one for
-        # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB
-        # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per
-        # host pool, not 250 GB. Keep overrides for one-off tuning.
-        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}"
-        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
-        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}"
-        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
-        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on
-        # MI355X, which requires page_size=1. The kernel/page_first HiCache
-        # transfer path faults on first prefill in this mode on ROCm, so keep
-        # the default on the safer direct/layer_first copy path. These remain
-        # env-overridable for future SGLang/ROCm fixes.
-        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
-        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
-        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
-        # SGLang --hicache-size is per rank per host pool, while the workflow
-        # input is a node-total DRAM budget. Divide by TP and the number of
-        # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning.
-        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
-        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
-            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
-        fi
-        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
-            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
-            exit 1
-        fi
-        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
-        CACHE_ARGS=(
-            --page-size "$HICACHE_PAGE_SIZE"
-            --enable-hierarchical-cache
-            --hicache-size "$HICACHE_SIZE_GB"
-            --hicache-io-backend "$HICACHE_IO_BACKEND"
-            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
-            --hicache-write-policy "$HICACHE_WRITE_POLICY"
-        )
-        # HiCache startup reaches API readiness, but SGLang's internal warmup
-        # request has timed out after 600s on this Qwen MI355X path. Let aiperf
-        # own benchmark traffic instead of blocking server readiness on it.
-        WARMUP_ARGS=(--skip-server-warmup)
-        # Keep request concurrency as the swept variable, but do not force
-        # HiCache runs to capture ROCm graphs at every high concurrency point.
-        # The conc=32 HiCache job crashed after startup readiness, before any
-        # aiperf traffic, while conc=16 is the highest known-good capture size
-        # for this model/server path. Requests above the capture size can still
-        # run; they just do not require a larger captured graph at startup.
-        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
-        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
-            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
-        fi
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
-        exit 1
-        ;;
-esac
-
-echo "Starting SGLang server..."
-export PYTHONNOUSERSITE=1
-
-{ set +x; } 2>/dev/null
-SGLANG_CMD=(
-    python3 -m sglang.launch_server
-    --attention-backend triton
-    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
-    --host=0.0.0.0
-    --port "$PORT"
-    --tensor-parallel-size "$TP"
-    --ep-size "$EP_SIZE"
-    --trust-remote-code
-    --tokenizer-worker-num 6
-    --enable-aiter-allreduce-fusion
-    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
-    --max-running-requests "$CONC"
-    --max-prefill-tokens 32768
-    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
-    --mem-fraction-static 0.8
-    --context-length "$MAX_MODEL_LEN"
-    --enable-metrics
-    "${CACHE_ARGS[@]}"
-    "${WARMUP_ARGS[@]}"
-)
-printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
-printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
-"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"

From 574d8914475ba7d5f8cc0ec9d17ea79aca03e95d Mon Sep 17 00:00:00 2001
From: seungrokj <seungrok.jung@amd.com>
Date: Wed, 3 Jun 2026 14:46:29 +0900
Subject: [PATCH 15/15] Revert "[AMD] agentx-v0.4: add MiniMax agentic script,
 refactor Kimi/Qwen scripts" and "[AMD] agentx-v0.4: add MiniMax/Kimi lmcache
 agentic entries, update Qwen hicache config"

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml               |  35 +-
 .../agentic/kimik2.5_fp4_mi355x.sh            | 674 ++++++++++++++++--
 .../agentic/minimaxm2.5_fp4_mi355x.sh         | 256 -------
 .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 +--
 .../agentic/qwen3.5_fp8_mi355x_sglang.sh      | 152 ++++
 5 files changed, 804 insertions(+), 425 deletions(-)
 delete mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
 create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 134af929a..7f1c8192d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -872,21 +872,6 @@ minimaxm2.5-fp4-mi355x-atom:
       - { tp: 4, conc-start: 4, conc-end: 128 }
       - { tp: 8, conc-start: 4, conc-end: 16 }
 
-minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/MiniMax-M2.5-MXFP4
-  model-prefix: minimaxm2.5
-  runner: mi355x
-  precision: fp4
-  framework: vllm
-  multinode: false
-  scenarios:
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] }
-      - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] }
-
 minimaxm2.5-fp4-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: amd/MiniMax-M2.5-MXFP4
@@ -2533,16 +2518,6 @@ kimik2.5-fp4-mi355x-vllm-agentic:
       - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] }
       - { tp: 4, offloading: cpu,  conc-list: [16, 24, 32, 40] }
 
-kimik2.5-fp4-mi355x-vllm-agentic-lmcache:
-  image: vllm/vllm-openai-rocm:v0.22.0
-  model: amd/Kimi-K2.5-MXFP4
-  model-prefix: kimik2.5
-    agentic-coding:
-    - duration: 1800
-      search-space:
-      - { tp: 4, ep: 1, offloading: none, conc-list:    [4, 8, 16, 32, 40, 48, 56, 64, 72] }
-      - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] }
-
 minimaxm2.5-fp8-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
   model: MiniMaxAI/MiniMax-M2.5
@@ -2599,15 +2574,19 @@ minimaxm2.5-fp8-mi325x-vllm-agentic:
       - { tp: 4, offloading: cpu,  conc-list: [16, 20, 24, 28, 32] }
 
 qwen3.5-fp8-mi355x-sglang-agentic-hicache:
-  image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531
+  image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
   runner: mi355x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
     agentic-coding:
     - duration: 1800
       search-space:
-      - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] }
-      - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] }
+      - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] }
+      - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] }
 
 dsv4-fp4-mi355x-vllm-agentic:
   image: vllm/vllm-openai-rocm:v0.22.0
diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
index d05b27253..139b12256 100755
--- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
+++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh
@@ -14,11 +14,15 @@ set -x
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-EP_SIZE=${EP_SIZE:-1}
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0.
+# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this
+# script we need the concrete value so AgentX filters prompt+max_tokens against
+# the same limit vLLM enforces.
+if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then
+    MAX_MODEL_LEN=262144
+fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
@@ -29,22 +33,557 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), download into the default HF hub
+# cache and set MODEL_PATH to the model ID. Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
-# ---- Resolve traces and install deps ----------------------------------------
-# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
-# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
-# signal at high concurrency.
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
-#060226
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
-
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
+# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
+pip install amd-quark
+
+# Disable AITER RMSNorm for TP < 8 due to accuracy issues
+if [ "${TP}" -lt 8 ]; then
+  export VLLM_ROCM_USE_AITER_RMSNORM=0
+fi
+
+write_lmcache_rocm_mp_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/sitecustomize.py" <<'PY'
+"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches."""
+
+import os
+import threading
+
+if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1":
+    import builtins
+    import sys
+
+    _orig_import = builtins.__import__
+
+    def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None:
+        _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator
+
+        if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False):
+            return
+
+        _orig_init = _LazyMemoryAllocator.__init__
+        _orig_allocate = _LazyMemoryAllocator.allocate
+        _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate
+
+        def _expand_to(self, target_size: int) -> None:
+            target_size = min(
+                self._final_size,
+                _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE),
+            )
+            lock = self._agentic_rocm_demand_expand_lock
+            with lock:
+                if target_size <= self._curr_size:
+                    return
+
+                start_size = self._curr_size
+                while self._curr_size < target_size:
+                    commit_start = self._curr_size
+                    commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE)
+                    while self._curr_size < commit_target:
+                        self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE)
+                        self._curr_size += self.PIN_CHUNK_SIZE
+                    self._commit_expansion(self._curr_size - commit_start)
+
+                self._log_expansion_progress(self._curr_size - start_size)
+
+        def _retry_with_demand_expansion(self, allocate_once):
+            obj = allocate_once()
+            step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64"))
+            step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3)))
+
+            while obj is None and self._curr_size < self._final_size:
+                _expand_to(self, self._curr_size + step_bytes)
+                obj = allocate_once()
+
+            return obj
+
+        def _patched_init(self, *args, **kwargs):
+            _orig_init(self, *args, **kwargs)
+            self._agentic_rocm_demand_expand_lock = threading.Lock()
+
+            # LMCache MP's upstream LazyMemoryAllocator currently expands to
+            # the final pinned size in a background thread. On ROCm Kimi TP4,
+            # vLLM reaches KV-cache registration only after that 2.5 TB pool
+            # is fully pinned, and the server-side IPC open path can stall
+            # before acknowledging register_kv_caches. Keep the same final
+            # capacity, but pin/commit extra host memory only when L1
+            # allocations actually need it.
+            self._stop_expand.set()
+            self._expand_thread.join()
+            _lazy_memory_allocator.logger.info(
+                "Agentic ROCm patch: using demand-driven LMCache pinned "
+                "memory expansion; final capacity remains %s MB",
+                self._final_size >> 20,
+            )
+
+        def _patched_allocate(
+            self,
+            shapes,
+            dtypes,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type),
+            )
+
+        def _patched_batched_allocate(
+            self,
+            shapes,
+            dtypes,
+            batch_size,
+            fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED,
+            allocator_type=None,
+        ):
+            return _retry_with_demand_expansion(
+                self,
+                lambda: _orig_batched_allocate(
+                    self, shapes, dtypes, batch_size, fmt, allocator_type
+                ),
+            )
+
+        _LazyMemoryAllocator.__init__ = _patched_init
+        _LazyMemoryAllocator.allocate = _patched_allocate
+        _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate
+        _LazyMemoryAllocator._agentic_rocm_demand_patch = True
+
+    def _patch_l1_memory_manager(_memory_manager) -> None:
+        _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None)
+        _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None)
+        if _L1MemoryManager is None or _LazyMemoryAllocator is None:
+            return
+        if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False):
+            return
+
+        _orig_get_memory_usage = _L1MemoryManager.get_memory_usage
+
+        def _patched_get_memory_usage(self):
+            allocator = getattr(self, "_allocator", None)
+            if isinstance(allocator, _LazyMemoryAllocator):
+                address_manager = allocator.get_address_manager()
+                used_size = (
+                    address_manager.get_heap_size() - address_manager.get_free_size()
+                )
+                return used_size, allocator._final_size
+            return _orig_get_memory_usage(self)
+
+        _L1MemoryManager.get_memory_usage = _patched_get_memory_usage
+        _L1MemoryManager._agentic_rocm_final_capacity_patch = True
+
+    def _maybe_patch_lazy_memory_allocator() -> None:
+        module = sys.modules.get("lmcache.v1.lazy_memory_allocator")
+        if module is not None and hasattr(module, "LazyMemoryAllocator"):
+            _patch_lazy_memory_allocator(module)
+
+    def _maybe_patch_l1_memory_manager() -> None:
+        module = sys.modules.get("lmcache.v1.distributed.memory_manager")
+        if module is not None and hasattr(module, "L1MemoryManager"):
+            _patch_l1_memory_manager(module)
+
+    def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0):
+        module = _orig_import(name, globals, locals, fromlist, level)
+        if name == "lmcache.v1.lazy_memory_allocator" or (
+            name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules
+        ):
+            _maybe_patch_lazy_memory_allocator()
+        if name == "lmcache.v1.distributed.memory_manager" or (
+            name.startswith("lmcache")
+            and "lmcache.v1.distributed.memory_manager" in sys.modules
+        ):
+            _maybe_patch_l1_memory_manager()
+        return module
+
+    builtins.__import__ = _agentic_rocm_import
+    _maybe_patch_lazy_memory_allocator()
+    _maybe_patch_l1_memory_manager()
+
+if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1":
+    import torch
+    import lmcache.non_cuda_equivalents as lmc
+
+    if not hasattr(lmc, "multi_layer_block_kv_transfer"):
+        _DTYPE_BY_NAME = {
+            "bfloat16": torch.bfloat16,
+            "float16": torch.float16,
+            "float32": torch.float32,
+        }
+
+        def _dtype_from_env() -> torch.dtype:
+            name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16")
+            try:
+                return _DTYPE_BY_NAME[name]
+            except KeyError as exc:
+                raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc
+
+        def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+            block_stride = shape_desc.block_stride_elems or (
+                shape_desc.bs * shape_desc.nh * shape_desc.hs
+            )
+            base = lmc._tensor_from_ptr(
+                ptr,
+                (shape_desc.nb * block_stride,),
+                dtype,
+                device,
+            )
+            return torch.as_strided(
+                base,
+                (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs),
+                (block_stride, shape_desc.nh * shape_desc.hs, 1),
+            )
+
+        def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+            return lmc._tensor_from_ptr(
+                ptr,
+                (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs),
+                dtype,
+                device,
+            )
+
+        def multi_layer_block_kv_transfer(
+            group_kv_pointers,
+            tmp_buffer_ptrs,
+            block_ids,
+            paged_memory_device,
+            direction,
+            shape_desc,
+            lmcache_chunk_size,
+            gpu_kv_format,
+            skip_blocks=0,
+        ) -> None:
+            # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with
+            # shape [num_blocks, block_size, hidden_size]. LMCache's Python
+            # fallback has no block-transfer entrypoint yet, so implement the
+            # same gather/scatter contract with torch indexing on ROCm.
+            if shape_desc.kv_size != 1:
+                raise NotImplementedError(
+                    "ROCm LMCache MP block fallback currently supports MLA KV caches only"
+                )
+
+            dtype = _dtype_from_env()
+            device = (
+                paged_memory_device
+                if isinstance(paged_memory_device, torch.device)
+                else torch.device(paged_memory_device)
+            )
+            num_layers = int(group_kv_pointers.numel())
+            blocks_per_chunk = lmcache_chunk_size // shape_desc.bs
+            direction_name = getattr(direction, "name", str(direction))
+
+            for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs):
+                start = chunk_idx * blocks_per_chunk
+                end = start + blocks_per_chunk
+                chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long)
+
+                dest_slot_offset = 0
+                if skip_blocks and chunk_idx == 0:
+                    chunk_blocks = chunk_blocks[int(skip_blocks):]
+                    dest_slot_offset = int(skip_blocks) * shape_desc.bs
+                if chunk_blocks.numel() == 0:
+                    continue
+
+                num_slots = int(chunk_blocks.numel()) * shape_desc.bs
+                tmp = _tmp_view(
+                    int(tmp_ptr),
+                    shape_desc,
+                    num_layers,
+                    lmcache_chunk_size,
+                    dtype,
+                    device,
+                )
+
+                for layer_idx in range(num_layers):
+                    paged = _paged_view(
+                        int(group_kv_pointers[layer_idx].item()),
+                        shape_desc,
+                        dtype,
+                        device,
+                    )
+                    tmp_slice = tmp[
+                        0,
+                        layer_idx,
+                        dest_slot_offset : dest_slot_offset + num_slots,
+                        :,
+                    ]
+                    if direction_name == "D2H":
+                        gathered = paged.index_select(0, chunk_blocks).reshape(
+                            num_slots, shape_desc.nh * shape_desc.hs
+                        )
+                        tmp_slice.copy_(gathered)
+                    elif direction_name == "H2D":
+                        src = tmp_slice.reshape(
+                            int(chunk_blocks.numel()),
+                            shape_desc.bs,
+                            shape_desc.nh * shape_desc.hs,
+                        )
+                        paged.index_copy_(0, chunk_blocks, src)
+                    else:
+                        raise ValueError(f"Unsupported transfer direction: {direction}")
+
+        lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer
+
+# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ----
+if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0":
+    import chunked_connector_patch  # noqa: F401
+
+# ---- vLLM scheduler assertion fix (stale KV transfer notifications) ----
+import scheduler_assertion_patch  # noqa: F401
+PY
+}
+
+write_chunked_connector_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/chunked_connector_patch.py" <<'PY'
+"""
+Monkey-patch for LMCacheMPConnector to add chunked KV loading.
+
+Fixes GPU block exhaustion deadlock at high concurrency by capping
+the number of external tokens reported AND retrieved per scheduling step.
+
+Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD=<tokens> and import this
+module from sitecustomize.py before LMCache is loaded.
+"""
+
+import logging
+import os
+import sys
+import builtins
+
+logger = logging.getLogger("chunked_lmcache_patch")
+
+_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768"))
+
+# Per-request chunk tracking (module-level, survives across calls)
+_chunk_state: dict[str, dict] = {}
+
+
+def _apply_patch():
+    """Patch LMCacheMPConnector in-place."""
+    mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector")
+    if mod is None:
+        return
+    cls = getattr(mod, "LMCacheMPConnector", None)
+    if cls is None or getattr(cls, "_chunked_patch_applied", False):
+        return
+
+    LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None)
+    _orig_get_matched = cls.get_num_new_matched_tokens
+    _orig_get_finished = cls.get_finished
+
+    def _get_blocks_per_chunk(self):
+        block_size = getattr(self, "block_size", 1)
+        return max(1, _MAX_TOKENS // block_size)
+
+    def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens):
+        full_match = _orig_get_matched(self, request, num_computed_tokens)
+        if full_match <= 0 or _MAX_TOKENS <= 0:
+            return full_match
+
+        req_id = request.request_id
+        block_size = getattr(self, "block_size", 1)
+        blocks_per_chunk = _get_blocks_per_chunk(self)
+        full_match_blocks = full_match // block_size
+
+        state = _chunk_state.get(req_id)
+        if state is None or state.get("num_computed_at_start") != num_computed_tokens:
+            state = {
+                "full_match_blocks": full_match_blocks,
+                "chunk_end_blocks": 0,
+                "num_computed_at_start": num_computed_tokens,
+                "lookup_done": False,
+            }
+            _chunk_state[req_id] = state
+
+        if state["lookup_done"]:
+            return 0
+
+        remaining = state["full_match_blocks"] - state["chunk_end_blocks"]
+        if remaining <= 0:
+            state["lookup_done"] = True
+            return 0
+
+        this_chunk = min(remaining, blocks_per_chunk)
+        state["chunk_end_blocks"] += this_chunk
+        if state["chunk_end_blocks"] >= state["full_match_blocks"]:
+            state["lookup_done"] = True
+
+        capped = this_chunk * block_size
+        if capped < full_match:
+            logger.debug(
+                "Chunked LMCache: req %s capped %d -> %d tokens "
+                "(chunk %d/%d blocks)",
+                req_id, full_match, capped, this_chunk, full_match_blocks,
+            )
+
+        # Cap the tracker's hit blocks to match what we report
+        tracker = getattr(request, "kv_transfer_params", None)
+        if tracker is not None:
+            orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0)
+            if orig_hits > this_chunk:
+                tracker.num_lmcache_hit_blocks = this_chunk
+
+        return capped
+
+    def _patched_get_finished(self, scheduler_output):
+        result = _orig_get_finished(self, scheduler_output)
+        # Clean up chunk state for finished requests.
+        # vLLM passes scheduler_output as a set of request-ID strings
+        # (not a SchedulerOutput object), so iterate directly when it
+        # is a set/frozenset; fall back to the attribute path for
+        # forward compatibility.
+        if isinstance(scheduler_output, (set, frozenset)):
+            finished = scheduler_output
+        else:
+            finished = getattr(scheduler_output, "finished_req_ids", [])
+        for req in finished:
+            _chunk_state.pop(req, None)
+        return result
+
+    cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens
+    cls.get_finished = _patched_get_finished
+    cls._chunked_patch_applied = True
+    logger.info(
+        "Chunked LMCache connector patch applied "
+        "(max_tokens_per_load=%d)", _MAX_TOKENS,
+    )
+
+
+_orig_import = builtins.__import__
+
+
+def _patching_import(name, *args, **kwargs):
+    module = _orig_import(name, *args, **kwargs)
+    if (
+        name == "lmcache.integration.vllm.lmcache_mp_connector"
+        or (
+            name.startswith("lmcache")
+            and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules
+        )
+    ):
+        _apply_patch()
+    return module
+
+
+builtins.__import__ = _patching_import
+_apply_patch()
+PY
+}
+
+write_scheduler_assertion_patch() {
+    local patch_dir="$1"
+    mkdir -p "$patch_dir"
+    cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY'
+"""
+Patch vLLM scheduler to handle stale finished_recving gracefully.
+
+The assertion at scheduler.py crashes when a KV transfer reports
+"finished recving" but the request is already in RUNNING state.
+This happens when transfers complete asynchronously and the scheduler
+has already moved the request forward.
+
+Fix: Instead of asserting, log a warning and skip.
+"""
+
+import logging
+import sys
+import builtins
+
+logger = logging.getLogger("scheduler_assertion_patch")
+
+
+def _apply_patch():
+    """Patch vLLM scheduler's _update_from_kv_xfer_finished."""
+    sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler")
+    if sched_mod is None:
+        return
+    req_mod = sys.modules.get("vllm.v1.request")
+    if req_mod is None:
+        return
+    Scheduler = getattr(sched_mod, "Scheduler", None)
+    RequestStatus = getattr(req_mod, "RequestStatus", None)
+    if Scheduler is None or RequestStatus is None:
+        return
+    if getattr(Scheduler, "_kv_xfer_patch_applied", False):
+        return
+
+    _orig_update = Scheduler._update_from_kv_xfer_finished
+
+    def _patched_update(self, kv_connector_output):
+        if self.connector is not None:
+            self.connector.update_connector_output(kv_connector_output)
+        for req_id in kv_connector_output.finished_recving or ():
+            if req_id not in self.requests:
+                continue
+            req = self.requests[req_id]
+            if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
+                self.finished_recving_kv_req_ids.add(req_id)
+            elif RequestStatus.is_finished(req.status):
+                self._free_blocks(self.requests[req_id])
+            else:
+                logger.warning(
+                    "Stale finished_recving for req %s in status %s; skipping.",
+                    req_id, req.status.name,
+                )
+        for req_id in kv_connector_output.finished_sending or ():
+            if req_id not in self.requests:
+                continue
+            self._free_blocks(self.requests[req_id])
+
+    Scheduler._update_from_kv_xfer_finished = _patched_update
+    Scheduler._kv_xfer_patch_applied = True
+    logger.info("Scheduler KV transfer assertion patch applied")
+
+
+_orig_import = builtins.__import__
+
+
+def _patching_import(name, *args, **kwargs):
+    module = _orig_import(name, *args, **kwargs)
+    if (
+        name == "vllm.v1.core.sched.scheduler"
+        or (
+            name.startswith("vllm")
+            and "vllm.v1.core.sched.scheduler" in sys.modules
+        )
+    ):
+        _apply_patch()
+    return module
+
+
+builtins.__import__ = _patching_import
+_apply_patch()
+PY
+}
+
+# Workaround for MEC FW <177 RCCL memory reclaim issue (a failed/empty probe is treated as "old").
+version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}' || true)
+if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
+    export HSA_NO_SCRATCH_RECLAIM=1
+fi
+
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
 # ---- Server config ----------------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
@@ -52,8 +591,6 @@ mkdir -p "$RESULT_DIR"
 
 OFFLOAD_ARGS=()
 PREFIX_CACHE_ARGS=()
-
-# ---- Lmcache config ----------------------------------------------------------
 LMCACHE_PID=""
 
 cleanup_lmcache_server() {
@@ -111,9 +648,7 @@ case "$OFFLOADING" in
         # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
         # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
         # worker RSS / page cache / slurm cgroup).
-        #TODO: fix
-        TOTAL_CPU_DRAM_GB=3000
-        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        TOTAL_CPU_DRAM_GB=2500
         # Use vLLM's regular native KV-offload path (OffloadingConnector),
         # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
         # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
@@ -124,7 +659,7 @@ case "$OFFLOADING" in
         # (vllm/config/vllm.py:662).
         OFFLOAD_ARGS=(
             --kv_offloading_backend native
-            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
+            --kv_offloading_size "$TOTAL_CPU_DRAM_GB"
             --disable-hybrid-kv-cache-manager
         )
         ;;
@@ -132,20 +667,74 @@ case "$OFFLOADING" in
         { set +x; } 2>/dev/null
         unset VLLM_USE_SIMPLE_KV_OFFLOAD
 
-        git clone https://github.com/LMCache/LMCache.git
-        cd LMCache
-        pip install -r requirements/build.txt 
-        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
-        cd ..
-
+        agentic_pip_install --quiet --no-cache-dir lmcache
+        # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and
+        # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and
+        # during Kimi fused-MoE model inspection it imports nixl_ep whenever
+        # that module is importable, even when this run is not using EP/NIXL
+        # kernels. The CUDA extension then fails immediately on AMD nodes with
+        # "ImportError: libcuda.so.1".
+        #
+        # LMCache MP also uses CuPy stream APIs while registering vLLM's KV
+        # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime
+        # with cudaErrorInsufficientDriver when LMCache touches the stream. Use
+        # the ROCm 7 CuPy wheel so the same API dispatches through HIP.
+        python3 -m pip uninstall -y \
+            nixl nixl-cu12 nixl-cu13 nixl_ep \
+            >/dev/null 2>&1 || true
+        python3 -m pip uninstall -y \
+            cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \
+            >/dev/null 2>&1 || true
+        agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0
+        python3 - <<'PY'
+import importlib.util
+import sys
+
+spec = importlib.util.find_spec("nixl_ep")
+if spec is not None:
+    locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"])
+    print(
+        "Error: nixl_ep is still importable after LMCache install; "
+        "this ROCm Kimi run would import a CUDA-only nixl_ep module. "
+        f"location={locations}",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+try:
+    from cupy_backends.cuda.api import runtime as cupy_runtime
+except Exception as exc:
+    print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr)
+    sys.exit(1)
+
+if not getattr(cupy_runtime, "is_hip", False):
+    print(
+        "Error: CuPy is still using the CUDA backend after installing "
+        "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+PY
+        LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch"
+        write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR"
+        write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR"
+        write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR"
+        export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1
+        export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16
+        export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1
+        # Cap external KV tokens loaded per scheduling step to prevent GPU
+        # block exhaustion deadlock at high concurrency (c>=32).  Default
+        # 32768 keeps peak block demand within the GPU KV pool.  Set to 0 to
+        # disable chunking (only safe at low concurrency).
+        export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}"
+        export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}"
         python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
 
         # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
         # pool, but let the external MP server own that pool so vLLM does not
         # split --kv-offloading-size across TP ranks through the integrated
         # LMCache backend.
-        #TODO: fix
-        TOTAL_CPU_DRAM_GB=3000
+        TOTAL_CPU_DRAM_GB=2500
         LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
         LMCACHE_PORT="${LMCACHE_PORT:-5555}"
         LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
@@ -153,7 +742,7 @@ case "$OFFLOADING" in
         # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
         # ZMQ-style host string.
         LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
+        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}"
         LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
         # LMCache read locks are leases on chunks that lookup has promised
         # vLLM can retrieve. The default 300s TTL is too short for this
@@ -161,11 +750,10 @@ case "$OFFLOADING" in
         # lookup and retrieve while GPU KV is saturated, which leaves the
         # object present in L1 but no longer readable. Keep the 2.5 TB pool
         # size unchanged and only extend the lookup-to-retrieve lease.
-        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
+        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
         LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
         LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
         export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
-        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
 
         echo "Starting LMCache MP server..."
         LMCACHE_CMD=(
@@ -198,7 +786,6 @@ case "$OFFLOADING" in
     *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
 esac
 
-# ---- LLM server config ----------------------------------------------------------
 EP_ARGS=()
 if [ "$EP_SIZE" -gt 1 ]; then
     EP_ARGS=(--enable-expert-parallel)
@@ -207,34 +794,17 @@ fi
 echo "Starting vllm server..."
 export PYTHONNOUSERSITE=1
 
-# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
-pip install amd-quark
-
-# Disable AITER RMSNorm for TP < 8 due to accuracy issues
-if [ "${TP}" -lt 8 ]; then
-  export VLLM_ROCM_USE_AITER_RMSNORM=0
-fi
-
-# Workaround for MEC FW <177 RCCL memory reclaim issue
-version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
-if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
-    export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-
 { set +x; } 2>/dev/null
 VLLM_CMD=(
-    vllm serve "$MODEL"
+    vllm serve "$MODEL_PATH" --served-model-name "$MODEL"
     --host 0.0.0.0
     --port "$PORT"
     --tensor-parallel-size="$TP"
     "${EP_ARGS[@]}"
     --gpu-memory-utilization 0.90
-    --kv-cache-dtype fp8 \
     --block-size=1
     --trust-remote-code
+    --max-model-len "$MAX_MODEL_LEN"
     --max-num-seqs "$CONC"
     --mm-encoder-tp-mode data
     "${PREFIX_CACHE_ARGS[@]}"
@@ -251,4 +821,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
deleted file mode 100755
index f36fc59e9..000000000
--- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh
+++ /dev/null
@@ -1,256 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-set -x
-
-# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM.
-#
-# Required env vars:
-#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR
-#
-# OFFLOADING values:
-#   none    - vLLM GPU KV only.
-#   cpu     - vLLM native CPU offload.
-#   lmcache - LMCache MP server + vLLM LMCacheMPConnector.
-
-source "$(dirname "$0")/../../benchmark_lib.sh"
-
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
-
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-EP_SIZE=${EP_SIZE:-1}
-
-if [[ -n "${SLURM_JOB_ID:-}" ]]; then
-    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
-fi
-
-# ROCR/HIP visibility for vLLM 0.14+
-if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then
-    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
-fi
-
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
-rocm-smi || true
-amd-smi || true
-
-# ---- Resolve traces and install deps ----------------------------------------
-# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726
-# corpus has requests up to ~1M proxy tokens that would be rejected.
-# Switch to the 256k-capped variant (470 traces, max in+out <= 256k).
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
-#060226
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
-
-resolve_trace_source
-install_agentic_deps
-
-# ---- Server config ----------------------------------------------------------
-SERVER_LOG="$RESULT_DIR/server.log"
-LMCACHE_LOG="$RESULT_DIR/lmcache_server.log"
-mkdir -p "$RESULT_DIR"
-
-OFFLOAD_ARGS=()
-PREFIX_CACHE_ARGS=()
-
-# ---- Lmcache config ----------------------------------------------------------
-LMCACHE_PID=""
-
-cleanup_lmcache_server() {
-    if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then
-        kill "$LMCACHE_PID" 2>/dev/null || true
-        wait "$LMCACHE_PID" 2>/dev/null || true
-    fi
-}
-
-trap cleanup_lmcache_server EXIT
-
-wait_for_lmcache_ready() {
-    { set +x; } 2>/dev/null
-    local attempts="${LMCACHE_READY_ATTEMPTS:-120}"
-    local tail_pid=""
-
-    while [ ! -f "$LMCACHE_LOG" ]; do
-        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
-            echo "LMCache server died before creating log file. Exiting." >&2
-            exit 1
-        fi
-        sleep 1
-    done
-
-    tail -f -n +1 "$LMCACHE_LOG" &
-    tail_pid=$!
-
-    for ((i = 1; i <= attempts; i++)); do
-        if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then
-            kill "$tail_pid" 2>/dev/null || true
-            wait "$tail_pid" 2>/dev/null || true
-            return 0
-        fi
-        if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then
-            echo "LMCache server died before becoming healthy. Log follows:" >&2
-            kill "$tail_pid" 2>/dev/null || true
-            wait "$tail_pid" 2>/dev/null || true
-            cat "$LMCACHE_LOG" >&2 || true
-            exit 1
-        fi
-        sleep 1
-    done
-
-    echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2
-    kill "$tail_pid" 2>/dev/null || true
-    wait "$tail_pid" 2>/dev/null || true
-    cat "$LMCACHE_LOG" >&2 || true
-    exit 1
-}
-
-case "$OFFLOADING" in
-    none) ;;
-    cpu)
-        unset VLLM_USE_SIMPLE_KV_OFFLOAD
-        # MI355X nodes have ~2.7 TiB of host DRAM available for offload;
-        # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for
-        # worker RSS / page cache / slurm cgroup).
-        TOTAL_CPU_DRAM_GB=3000
-        TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
-        # Use vLLM's regular native KV-offload path (OffloadingConnector),
-        # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to
-        # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1
-        # would switch it to SimpleCPUOffloadConnector. We intentionally leave
-        # that env var UNSET here so the regular OffloadingConnector path is
-        # used. The shortcut --kv_offloading_backend native + --kv_offloading_size
-        # form constructs the KVTransferConfig at engine startup
-        # (vllm/config/vllm.py:662).
-
-        # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default)
-        # This gives extra cache hit than disabling hybrid kv cache manager
-        # srok,
-        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
-        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
-        OFFLOAD_ARGS=(
-            --kv_offloading_backend native
-            --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB"
-            --disable-hybrid-kv-cache-manager
-        )
-        ;;
-    lmcache)
-        { set +x; } 2>/dev/null
-        unset VLLM_USE_SIMPLE_KV_OFFLOAD
-
-        git clone https://github.com/LMCache/LMCache.git
-        cd LMCache
-        pip install -r requirements/build.txt 
-        CXX=hipcc BUILD_WITH_HIP=1 pip install -e .   --no-build-isolation
-        cd ..
-
-        python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null
-
-        # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV
-        # pool, but let the external MP server own that pool so vLLM does not
-        # split --kv-offloading-size across TP ranks through the integrated
-        # LMCache backend.
-        TOTAL_CPU_DRAM_GB=3000
-        LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
-        LMCACHE_PORT="${LMCACHE_PORT:-5555}"
-        LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
-        # LMCacheMPConnector concatenates lmcache.mp.host and port into the
-        # ZMQ endpoint. Bind the server to a raw host, but pass the connector a
-        # ZMQ-style host string.
-        LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}"
-        LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}"
-        LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
-        # LMCache read locks are leases on chunks that lookup has promised
-        # vLLM can retrieve. The default 300s TTL is too short for this
-        # long-context agentic queue: TP8/conc32 can spend >300s between
-        # lookup and retrieve while GPU KV is saturated, which leaves the
-        # object present in L1 but no longer readable. Keep the 2.5 TB pool
-        # size unchanged and only extend the lookup-to-retrieve lease.
-        LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}"
-        LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
-        LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}"
-        export PYTHONHASHSEED="${PYTHONHASHSEED:-0}"
-        export LMCACHE_BLOCKING_TIMEOUT_SECS=120
-
-        set -x
-        echo "Starting LMCache MP server..."
-        LMCACHE_CMD=(
-            lmcache server
-            --host "$LMCACHE_HOST"
-            --port "$LMCACHE_PORT"
-            --http-host "$LMCACHE_HOST"
-            --http-port "$LMCACHE_HTTP_PORT"
-            --l1-size-gb "$LMCACHE_L1_SIZE_GB"
-            --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB"
-            --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS"
-            --chunk-size "$LMCACHE_CHUNK_SIZE"
-            --max-workers "$LMCACHE_MAX_WORKERS"
-            --eviction-policy LRU
-        )
-        printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt"
-        printf '\n' >> "$RESULT_DIR/lmcache_command.txt"
-        "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 &
-        LMCACHE_PID=$!
-        echo "LMCache server PID: $LMCACHE_PID"
-        wait_for_lmcache_ready
-
-        PREFIX_CACHE_ARGS=(--enable-prefix-caching)
-        # srok,
-        # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma
-        # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60
-        OFFLOAD_ARGS=(
-            --kv-transfer-config
-            "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}"
-            --disable-hybrid-kv-cache-manager
-        )
-        ;;
-    *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;;
-esac
-
-# ---- LLM server config ----------------------------------------------------------
-EP_ARGS=()
-if [ "$EP_SIZE" -gt 1 ]; then
-    EP_ARGS=(--enable-expert-parallel)
-fi
-
-echo "Starting vllm server..."
-export PYTHONNOUSERSITE=1
-
-# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug)
-pip install -q amd-quark
-
-# Workaround for MEC FW <177 RCCL memory reclaim issue
-version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}')
-if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then
-    export HSA_NO_SCRATCH_RECLAIM=1
-fi
-
-export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
-
-{ set +x; } 2>/dev/null
-VLLM_CMD=(
-    vllm serve "$MODEL"
-    --host 0.0.0.0
-    --port "$PORT"
-    --tensor-parallel-size="$TP"
-    "${EP_ARGS[@]}"
-    --gpu-memory-utilization 0.95
-    --kv-cache-dtype fp8 \
-    --block-size=32
-    --trust-remote-code
-    --attention-backend "ROCM_AITER_FA" 
-    --max-num-seqs "$CONC"
-    "${PREFIX_CACHE_ARGS[@]}"
-    "${OFFLOAD_ARGS[@]}"
-)
-printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt"
-printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt"
-"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 &
-SERVER_PID=$!
-echo "Server PID: $SERVER_PID"
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-# ---- Run benchmark ----------------------------------------------------------
-build_replay_cmd "$RESULT_DIR"
-
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
index 656e924dc..ff901b674 100755
--- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh
@@ -2,117 +2,51 @@
 set -euo pipefail
 set -x
 
-# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang.
-#
-# Base server recipe follows the upstream MI300X reference
-# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe):
-# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75.
-# The agentic harness (resolve_trace_source / build_replay_cmd /
-# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and
-# --disable-radix-cache is dropped because agentic replay needs prefix reuse.
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
 #
 # Required env vars:
-#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
-#
-# OFFLOADING values:
-#   none    - SGLang GPU KV with the default RadixAttention prefix cache.
-#   hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix.
+#   MODEL, TP, CONC, RESULT_DIR, DURATION, EP_SIZE
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
-check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION
+check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE
 
-PORT=${PORT:-8888}
-DURATION=${DURATION:-1800}
-EP_SIZE=${EP_SIZE:-1}
-
-SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=131072
+fi
 
 if [[ -n "${SLURM_JOB_ID:-}" ]]; then
     echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# download and set MODEL_PATH to $MODEL. The server always launches from it.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
 rocm-smi || true
 amd-smi || true
 
-# ---- Resolve traces and install deps ----------------------------------------
-# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the
-# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf
-# signal at high concurrency.
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k
-#060226
-export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k
-
 # ---- Resolve traces and install deps ----------------------------------------
 resolve_trace_source
 install_agentic_deps
 
-# ---- Cache / offload config -------------------------------------------------
+# ---- Start SGLang server ----------------------------------------------------
 SERVER_LOG="$RESULT_DIR/server.log"
 mkdir -p "$RESULT_DIR"
 
-CACHE_ARGS=()
-WARMUP_ARGS=()
-CUDA_GRAPH_MAX_BS="$CONC"
-case "$OFFLOADING" in
-    none)
-        # Leave SGLang's default RadixAttention prefix cache on — agentic
-        # replay needs it; --disable-radix-cache would zero the hit rate.
-        ;;
-    hicache)
-        # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per
-        # TP rank (one hierarchical KV, one hierarchical Mamba), so the
-        # node-total DRAM budget divides by TP and the host-pool count.
-        TOTAL_CPU_DRAM_GB=3000
-        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
-        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}"
-        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
-        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which
-        # requires page_size=1. Keep the safer direct/layer_first copy path;
-        # kernel/page_first faults on first prefill in this mode on ROCm.
-        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
-        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
-        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
-        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
-        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
-            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
-        fi
-        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
-            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
-            exit 1
-        fi
-        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
-        CACHE_ARGS=(
-            --page-size "$HICACHE_PAGE_SIZE"
-            --enable-hierarchical-cache
-            --hicache-size "$HICACHE_SIZE_GB"
-            --hicache-io-backend "$HICACHE_IO_BACKEND"
-            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
-            --hicache-write-policy "$HICACHE_WRITE_POLICY"
-        )
-        # HiCache startup reaches API readiness but SGLang's internal warmup
-        # request can time out on this path; let aiperf own benchmark traffic.
-        WARMUP_ARGS=(--skip-server-warmup)
-        # Don't force ROCm graph capture at every high concurrency point; conc=16
-        # is the highest known-good capture size for this model/server path.
-        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}"
-        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
-            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
-        fi
-        ;;
-    *)
-        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
-        exit 1
-        ;;
-esac
-
 echo "Starting SGLang server..."
 export PYTHONNOUSERSITE=1
 
 python3 -m sglang.launch_server \
     --attention-backend triton \
-    --model-path $MODEL \
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL" \
     --host=0.0.0.0 \
     --port $PORT \
     --tensor-parallel-size $TP \
@@ -122,10 +56,10 @@ python3 -m sglang.launch_server \
     --enable-aiter-allreduce-fusion \
     --cuda-graph-max-bs $CONC \
     --max-running-requests $CONC \
-    --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+    --max-prefill-tokens 32768 \
+    --scheduler-recv-interval 30 \
     --mem-fraction-static 0.8 \
-    "${CACHE_ARGS[@]}" \
-    "${WARMUP_ARGS[@]}" \
+    --context-length $MAX_MODEL_LEN \
     --enable-metrics > "$SERVER_LOG" 2>&1 &
 SERVER_PID=$!
 echo "Server PID: $SERVER_PID"
@@ -135,4 +69,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S
 # ---- Run benchmark ----------------------------------------------------------
 build_replay_cmd "$RESULT_DIR"
 
-run_agentic_replay_and_write_outputs "$RESULT_DIR"
\ No newline at end of file
+run_agentic_replay_and_write_outputs "$RESULT_DIR"
diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
new file mode 100755
index 000000000..cdded8860
--- /dev/null
+++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh
@@ -0,0 +1,152 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+
+# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang.
+#
+# Required env vars:
+#   MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE
+#
+# OFFLOADING values:
+#   none    - SGLang GPU KV only; default RadixAttention prefix cache stays on.
+#   hicache - SGLang HiCache with local CPU hierarchical cache.
+
+source "$(dirname "$0")/../../benchmark_lib.sh"
+
+check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE
+
+SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30}
+if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then
+    MAX_MODEL_LEN=131072
+fi
+
+if [[ -n "${SLURM_JOB_ID:-}" ]]; then
+    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
+fi
+
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# download and set MODEL_PATH to $MODEL. The server always launches from it.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+rocm-smi || true
+amd-smi || true
+
+# ---- Resolve traces and install deps ----------------------------------------
+resolve_trace_source
+install_agentic_deps
+
+# ---- Server config ----------------------------------------------------------
+SERVER_LOG="$RESULT_DIR/server.log"
+mkdir -p "$RESULT_DIR"
+
+CACHE_ARGS=()
+WARMUP_ARGS=()
+CUDA_GRAPH_MAX_BS="$CONC"
+case "$OFFLOADING" in
+    none)
+        # Leave SGLang's default RadixAttention prefix cache on — agentic
+        # replay needs it; --disable-radix-cache would zero the hit rate.
+        ;;
+    hicache)
+        # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid
+        # GDN/Mamba path allocates two HiCache host pools per TP rank: one for
+        # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB
+        # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per
+        # host pool, not 250 GB. Keep overrides for one-off tuning.
+        TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}"
+        HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}"
+        HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}"
+        HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}"
+        # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on
+        # MI355X, which requires page_size=1. The kernel/page_first HiCache
+        # transfer path faults on first prefill in this mode on ROCm, so keep
+        # the default on the safer direct/layer_first copy path. These remain
+        # env-overridable for future SGLang/ROCm fixes.
+        HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}"
+        HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}"
+        HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}"
+        # SGLang --hicache-size is per rank per host pool, while the workflow
+        # input is a node-total DRAM budget. Divide by TP and the number of
+        # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning.
+        HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}"
+        if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then
+            HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL"
+        fi
+        if [ "$HICACHE_SIZE_GB" -lt 1 ]; then
+            echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2
+            exit 1
+        fi
+        echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}"
+        CACHE_ARGS=(
+            --page-size "$HICACHE_PAGE_SIZE"
+            --enable-hierarchical-cache
+            --hicache-size "$HICACHE_SIZE_GB"
+            --hicache-io-backend "$HICACHE_IO_BACKEND"
+            --hicache-mem-layout "$HICACHE_MEM_LAYOUT"
+            --hicache-write-policy "$HICACHE_WRITE_POLICY"
+        )
+        # HiCache startup reaches API readiness, but SGLang's internal warmup
+        # request has timed out after 600s on this Qwen MI355X path. Let aiperf
+        # own benchmark traffic instead of blocking server readiness on it.
+        WARMUP_ARGS=(--skip-server-warmup)
+        # Keep request concurrency as the swept variable, but do not force
+        # HiCache runs to capture ROCm graphs at every high concurrency point.
+        # The conc=32 HiCache job crashed after startup readiness, before any
+        # aiperf traffic, while conc=16 is the highest known-good capture size
+        # for this model/server path. Requests above the capture size can still
+        # run; they just do not require a larger captured graph at startup.
+        HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}"
+        if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then
+            CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS"
+        fi
+        ;;
+    *)
+        echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2
+        exit 1
+        ;;
+esac
+
+echo "Starting SGLang server..."
+export PYTHONNOUSERSITE=1
+
+{ set +x; } 2>/dev/null
+SGLANG_CMD=(
+    python3 -m sglang.launch_server
+    --attention-backend triton
+    --model-path "$MODEL_PATH" --served-model-name "$MODEL"
+    --host=0.0.0.0
+    --port "$PORT"
+    --tensor-parallel-size "$TP"
+    --ep-size "$EP_SIZE"
+    --trust-remote-code
+    --tokenizer-worker-num 6
+    --enable-aiter-allreduce-fusion
+    --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
+    --max-running-requests "$CONC"
+    --max-prefill-tokens 32768
+    --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL"
+    --mem-fraction-static 0.8
+    --context-length "$MAX_MODEL_LEN"
+    --enable-metrics
+    "${CACHE_ARGS[@]}"
+    "${WARMUP_ARGS[@]}"
+)
+printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt"
+printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt"
+"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 &
+SERVER_PID=$!
+echo "Server PID: $SERVER_PID"
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+# ---- Run benchmark ----------------------------------------------------------
+build_replay_cmd "$RESULT_DIR"
+
+run_agentic_replay_and_write_outputs "$RESULT_DIR"