From f632aa42c2872eecaa0089d119e6f1fea1a5c2ec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 12:21:55 -0500 Subject: [PATCH 01/15] agentic(trace-source): default non-DSv4 to v6 (060226) corpus resolve_trace_source() now picks a model-prefix-aware default: MODEL_PREFIX == dsv4 -> semianalysis_cc_traces_weka_with_subagents (052726, the v5 baseline, unchanged for continuity with prior DSv4 published runs) everything else -> semianalysis_cc_traces_weka_with_subagents_060226 (060226, newer v6 corpus with fresher CC recording windows) WEKA_LOADER_OVERRIDE still wins. Allowed values widened from the two 052726 loaders to all four: semianalysis_cc_traces_weka_with_subagents (052726) semianalysis_cc_traces_weka_with_subagents_256k (052726-256k) semianalysis_cc_traces_weka_with_subagents_060226 (060226) semianalysis_cc_traces_weka_with_subagents_060226_256k (060226-256k) Bumps utils/aiperf submodule to de3ad1c1, which registers the two 060226 plugin entries those new loader names resolve through. The pre-cache log line now also includes MODEL_PREFIX so it's obvious in CI which default fired. Signed-off-by: Cam Quilici --- benchmarks/benchmark_lib.sh | 25 +++++++++++++++++++++---- utils/aiperf | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index e3080b4bf..e062b42f1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -924,8 +924,19 @@ resolve_trace_source() { # public-dataset loader names allowed by the inferencex-agentx-mvp # scenario. Used by recipes whose servers have non-default context # caps (e.g. minimaxm2.5 at max_model_len ~256k can't replay the - # unfiltered 052726 corpus and switches to the 256k-capped variant). 
- local loader="${WEKA_LOADER_OVERRIDE:-semianalysis_cc_traces_weka_with_subagents}" + # unfiltered corpus and switches to the 256k-capped variant), or + # by recipes that want to pin a specific corpus generation rather + # than ride the model-prefix-aware default below. + # + # Default (no override) is model-prefix-aware: + # DSv4 recipes -> 052726 (v5 corpus, the original baseline) + # everything else -> 060226 (v6 corpus, newer CC versions) + # DSv4 stays on 052726 for continuity with prior published baselines. + local default_loader="semianalysis_cc_traces_weka_with_subagents_060226" + if [[ "${MODEL_PREFIX:-}" == "dsv4" ]]; then + default_loader="semianalysis_cc_traces_weka_with_subagents" + fi + local loader="${WEKA_LOADER_OVERRIDE:-$default_loader}" local dataset case "$loader" in semianalysis_cc_traces_weka_with_subagents) @@ -934,13 +945,19 @@ resolve_trace_source() { semianalysis_cc_traces_weka_with_subagents_256k) dataset="semianalysisai/cc-traces-weka-with-subagents-052726-256k" ;; + semianalysis_cc_traces_weka_with_subagents_060226) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226" + ;; + semianalysis_cc_traces_weka_with_subagents_060226_256k) + dataset="semianalysisai/cc-traces-weka-with-subagents-060226-256k" + ;; *) - echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k" >&2 + echo "Error: unknown WEKA_LOADER_OVERRIDE='$loader'. 
Allowed: semianalysis_cc_traces_weka_with_subagents, semianalysis_cc_traces_weka_with_subagents_256k, semianalysis_cc_traces_weka_with_subagents_060226, semianalysis_cc_traces_weka_with_subagents_060226_256k" >&2 exit 1 ;; esac TRACE_SOURCE_FLAG="--public-dataset $loader" - echo "Loading traces via aiperf public-dataset: $loader ($dataset)" + echo "Loading traces via aiperf public-dataset: $loader ($dataset) [MODEL_PREFIX=${MODEL_PREFIX:-unset}]" # Pre-download the dataset into the shared HF_HUB_CACHE (same mount used # for model weights) so subsequent runs read from cache instead of # re-downloading every job. diff --git a/utils/aiperf b/utils/aiperf index 062a5de92..de3ad1c18 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit 062a5de92c8ac8a0a6dd5d2a7fb9a539a147f3d9 +Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1 From 5544a448d594a5ff3b8b83a25d714a8635adc3b7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:16:10 -0500 Subject: [PATCH 02/15] configs(master): consolidate agentic recipes at end + split combined dsr1-trt entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganizes both master YAMLs so all pure-agentic (agentic-coding-only) recipes sit at the bottom of the file behind an "# Agentic configs" divider, separated from fixed-seq-len / synthetic / prefix-share entries above. No functional change to any non-agentic recipe. nvidia-master.yaml: splits dsr1-fp4-b200-dynamo-trt — which previously mixed fixed-seq-len + agentic-coding in one entry — into the original entry (fixed-seq-len only) plus a new sibling dsr1-fp4-b200-dynamo-trt-agentic carrying the agentic-coding scenario. 22 pure-agentic entries moved. amd-master.yaml: no split needed (no combined entries); 9 pure-agentic entries moved to the end. 
Verified via deep YAML parse: nvidia adds 1 key (the split sibling) and modifies the source key's scenarios from [agentic-coding, fixed-seq-len] to [fixed-seq-len]; amd has 0 keys added/removed/modified. All other entries are byte-equal after round-trip. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 358 ++++++------ .github/configs/nvidia-master.yaml | 893 +++++++++++++++-------------- 2 files changed, 637 insertions(+), 614 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index fb3966ce6..0495ebf16 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -304,25 +304,6 @@ qwen3.5-fp8-mi355x-sglang-mtp: - { tp: 2, ep: 2, conc-start: 4, conc-end: 32, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 32, conc-end: 256, spec-decoding: mtp } -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. -qwen3.5-fp8-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -704,26 +685,6 @@ glm5.1-fp4-mi355x-sglang: - { tp: 2, conc-start: 4, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 16 } -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). 
Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. -glm5.1-fp4-mi355x-sglang-agentic: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 - model: amd/GLM-5.1-MXFP4 - model-prefix: glm5.1 - runner: mi355x - precision: fp4 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - glm5.1-fp4-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: amd/GLM-5.1-MXFP4 @@ -821,38 +782,6 @@ kimik2.5-fp4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' -kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } - # CPU offload only above the KV cliff. 
Lower concurrencies fit - # entirely on-GPU, so paying the offload-path overhead there would - # just slow them down without measuring anything new. - - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } - # TP=4 probe: half-node layout doubles per-GPU weight footprint - # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to - # cliff-region concurrencies on both offload modes so we can directly - # compare TP=4 vs TP=8 at the same conc points. - - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } - kimik2.5-fp4-mi355x-atom: image: rocm/atom:rocm7.2.3_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom20260511 model: amd/Kimi-K2.5-MXFP4 @@ -897,33 +826,6 @@ minimaxm2.5-fp8-mi355x-vllm: - { tp: 4, ep: 4, conc-start: 4, conc-end: 512 } - { tp: 8, ep: 8, conc-start: 2, conc-end: 2 } -# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi355x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] - # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), - # which enables SimpleCPUOffloadConnector on ROCm. Required for the - # cpu-offload sweep points to use the same offload path as the NVIDIA - # agentic-coding configs. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). - # Compute saturates first; cpu offload likely won't help, but worth confirming. 
- # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } - minimaxm2.5-fp8-mi355x-atom: image: rocm/atom:rocm7.2.2_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.2.post model: MiniMaxAI/MiniMax-M2.5 @@ -1014,29 +916,6 @@ minimaxm2.5-fp8-mi300x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } -# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi300x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi300x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); - # KV cliff ~52. Compute saturates first. - # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - minimaxm2.5-fp8-mi325x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -1058,30 +937,6 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). 
Reasons below; -# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' -minimaxm2.5-fp8-mi325x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: mi325x - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, - # similar HBM profile). Compute saturates first; cpu-offload window - # exercises the SimpleCPUOffloadConnector path enabled by the rocm - # nightly. Mirror MI300X conc grid for cross-vendor comparability. - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } - gptoss-fp4-mi300x-vllm: image: vllm/vllm-openai-rocm:v0.17.0 model: openai/gpt-oss-120b @@ -2415,37 +2270,6 @@ glm5-fp8-mi325x-sglang-mtp: # brought in here. 
# ============================================================================ -qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } - -dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4] } - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } - dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2679,6 +2503,188 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the # image tag, so bumping sglang is just an image tag bump here. Sweeps # DP-attention on/off and EP=8. +# ============================================================================= +# Agentic configs +# ----------------------------------------------------------------------------- +# All entries below run the agentic-coding scenario (Weka trace replay). +# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. +# ============================================================================= + +# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). 
Metadata is +# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. +qwen3.5-fp8-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. +glm5.1-fp4-mi355x-sglang-agentic: + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 + model: amd/GLM-5.1-MXFP4 + model-prefix: glm5.1 + runner: mi355x + precision: fp4 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + +# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' +kimik2.5-fp4-mi355x-vllm-agentic: + # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin + # (51f22dcf...) 
which was carrying the SimpleCPUOffloadConnector ROCm + # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and + # includes all subsequent ROCm offload work. + image: vllm/vllm-openai-rocm:v0.21.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 32, 40, 48] } + # CPU offload only above the KV cliff. Lower concurrencies fit + # entirely on-GPU, so paying the offload-path overhead there would + # just slow them down without measuring anything new. + - { tp: 8, offloading: cpu, conc-list: [32, 40, 48, 56] } + # TP=4 probe: half-node layout doubles per-GPU weight footprint + # (~62 GB on MI355X's 288 GB HBM, plenty of headroom). Restrict to + # cliff-region concurrencies on both offload modes so we can directly + # compare TP=4 vs TP=8 at the same conc points. + - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } + - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } + +# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi355x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] + # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), + # which enables SimpleCPUOffloadConnector on ROCm. Required for the + # cpu-offload sweep points to use the same offload path as the NVIDIA + # agentic-coding configs. 
+ image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI355X tp=4 ep=4: compute ceiling ~60 (empirical), KV cliff ~91 (analytical). + # Compute saturates first; cpu offload likely won't help, but worth confirming. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } + +# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi300x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. + image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi300x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI300X tp=4: compute ceiling ~25 (estimated, between H100 and H200); + # KV cliff ~52. Compute saturates first. + # AMD uses native OffloadingConnector (NOT SimpleCPUOffloadConnector). + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' +minimaxm2.5-fp8-mi325x-vllm-agentic: + # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. + image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: mi325x + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # MI325X tp=4: cloned from MI300X recipe (slightly faster compute, + # similar HBM profile). Compute saturates first; cpu-offload window + # exercises the SimpleCPUOffloadConnector path enabled by the rocm + # nightly. Mirror MI300X conc grid for cross-vendor comparability. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } + - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } + +qwen3.5-fp8-mi355x-sglang-agentic-hicache: + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +dsv4-fp4-mi355x-vllm-agentic: + image: vllm/vllm-openai-rocm:v0.21.0 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4] } + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } # Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). 
Reasons below; # the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..04764831c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -384,25 +384,6 @@ dsr1-fp4-b200-dynamo-trt: ep: 8 dp-attn: true - agentic-coding: - - duration: 300 - search-space: - - spec-decoding: "none" - conc-list: [ 1, 2, 4, 8, 16, 32 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml - - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: false - dsr1-fp8-b200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: deepseek-ai/DeepSeek-R1-0528 @@ -1778,28 +1759,6 @@ dsv4-fp4-b200-vllm: - { tp: 8, conc-start: 1, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 } -# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200-dsv4' -> 'b200-dgxc' -dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). - # Re-add when investigating regressions in offload=none. 
- - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - dsv4-fp4-b200-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -2143,25 +2102,6 @@ qwen3.5-fp8-b200-sglang: - { tp: 8, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } -# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. -qwen3.5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:nightly-dev-20260422-de962f32 - model: Qwen/Qwen3.5-397B-A17B-FP8 - model-prefix: qwen3.5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2245,26 +2185,6 @@ glm5-fp8-b200-sglang-mtp: # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/GLM/GLM-5.1 # does not have a B300-specific recipe, so this config reuses the existing GLM5 FP8 # B200 SGLang recipe as-is until B300-specific tuning is available. -# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. 
-glm5-fp8-b200-sglang-agentic: - image: lmsysorg/sglang:v0.5.12-cu130 - model: zai-org/GLM-5-FP8 - model-prefix: glm5 - runner: b200 - precision: fp8 - framework: sglang - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } - glm5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -2553,37 +2473,6 @@ kimik2.5-int4-b200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-int4-b200-vllm-agentic: - # Bumped from v0.19.1 — that release tripped a bug in - # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') - # during warmup `profile_run` on the agentic-coding path - # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the - # flashinfer fix. 
- image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-int4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: moonshotai/Kimi-K2.5 @@ -2624,29 +2513,6 @@ kimik2.5-int4-h200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'h200' -> 'h200-dgxc' -kimik2.5-int4-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with - # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's - # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) - # don't have that mount and would re-materialize 65 GB to /tmp every job. 
- runner: h200-dgxc - precision: int4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } - - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } - kimik2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 @@ -2668,38 +2534,6 @@ kimik2.5-fp4-b200-vllm: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' -# - runner: 'b200' -> 'b200-dgxc' -kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. 
- image: vllm/vllm-openai:v0.20.2 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b200-dgxc - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } - -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - kimik2.5-fp4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: nvidia/Kimi-K2.5-NVFP4 @@ -2763,34 +2597,6 @@ dsr1-fp8-b300-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 1, conc-end: 512, spec-decoding: mtp } -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' -kimik2.5-fp4-b300-vllm-agentic: - # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM - # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the - # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted - # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the - # INT4 B300 sister already uses successfully. 
- image: vllm/vllm-openai:v0.20.0-cu130 - model: nvidia/Kimi-K2.5-NVFP4 - model-prefix: kimik2.5 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14 model: deepseek-ai/DeepSeek-R1-0528 @@ -2924,31 +2730,6 @@ dsv4-fp8-h200-vllm-mtp: - { tp: 8, ep: 1, dp-attn: false, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 256, spec-decoding: mtp } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). -# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper -# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. -# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' -dsv4-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:deepseekv4-cu129 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } - -# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image -# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds -# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
- dsv4-fp8-h200-sglang: image: lmsysorg/sglang:deepseek-v4-hopper@sha256:7f19c6dc092e47a10fac2e41f47eab78970280d06648b8e50d312a82f0ae722f model: deepseek-ai/DeepSeek-V4-Pro @@ -3024,30 +2805,6 @@ dsv4-fp4-b300-vllm: - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. -dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - runner: b300 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - # cpu offload only this iteration — none entries already validated in - # earlier runs. Re-add when investigating regressions in offload=none. - - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } - dsv4-fp4-b300-trt: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -4284,31 +4041,10 @@ gptoss-fp4-b200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 4 } -# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' -gptoss-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } - - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - -minimaxm2.5-fp8-b200-vllm: - image: vllm/vllm-openai:v0.22.0 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 +minimaxm2.5-fp8-b200-vllm: + image: vllm/vllm-openai:v0.22.0 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 runner: b200 precision: fp8 framework: vllm @@ -4330,33 +4066,6 @@ minimaxm2.5-fp8-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -# - runner: 'b200' -> 'b200-dgxc' -minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b200-dgxc - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). - # Push none past the KV cliff (96, 128) to make the no-offload throughput - # collapse visible; cpu range overlaps fully for same-conc comparison. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } - - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. - minimaxm2.5-fp8-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: MiniMaxAI/MiniMax-M2.5 @@ -4381,31 +4090,6 @@ minimaxm2.5-fp8-b300-vllm: - { tp: 2, conc-start: 64, conc-end: 256 } - { tp: 4, conc-start: 4, conc-end: 8 } -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: b300 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). - # Push none past the KV cliff (96, 128, 192) so the no-offload throughput - # collapse is visible; cpu range overlaps fully so each high-conc point - # has a same-conc no-offload counterpart for direct comparison. - # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff - # observed in v6 cpu data right past conc=96. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - minimaxm2.5-fp4-b200-vllm: image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -4438,29 +4122,6 @@ minimaxm2.5-fp4-b200-vllm: # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.0-cu130 - model: nvidia/MiniMax-M2.5-NVFP4 - model-prefix: minimaxm2.5 - runner: b200 - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html - # does not have a B300-specific recipe, so this config reuses the existing - # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. - minimaxm2.5-fp4-b300-vllm: image: vllm/vllm-openai:v0.21.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -4530,29 +4191,6 @@ minimaxm2.5-fp8-h100-vllm: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } -# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). 
Metadata is -# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h100 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). - # Best cpu-offload demo SKU — 4-conc-point window between cliffs. - # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. - - duration: 1800 - search-space: - - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } - - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } - dsr1-fp8-h100-dynamo-sglang: image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -4757,28 +4395,6 @@ minimaxm2.5-fp8-h200-vllm: search-space: - { tp: 4, conc-start: 1, conc-end: 256 } -# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. -minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 - model: MiniMaxAI/MiniMax-M2.5 - model-prefix: minimaxm2.5 - runner: h200 - precision: fp8 - framework: vllm - multinode: false - scenarios: - agentic-coding: - # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). - # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. 
- - duration: 1800 - search-space: - - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } - - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } - dsr1-fp4-gb200-dynamo-trt: image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 model: nvidia/DeepSeek-R1-0528-NVFP4-v2 @@ -9203,26 +8819,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: dp-attn: true -kimik2.5-int4-h100-vllm: - image: vllm/vllm-openai:v0.20.2 - model: moonshotai/Kimi-K2.5 - model-prefix: kimik2.5 - runner: h100 - precision: int4 - framework: vllm - multinode: false - scenarios: - # New entry, agentic-coding only: this PR intentionally does NOT add - # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the - # fixed-seq-len test surface identical to origin/main. - # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives - # early. Sweep saturates conc=20 to keep total HBM headroom. - agentic-coding: - - duration: 1800 - search-space: - - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } - - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } - qwen3.5-fp8-h100-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9687,12 +9283,45 @@ glm5-fp8-gb300-dynamo-sglang: # to preserve main behavior; PR-branch modifications to those recipes are NOT # brought in here. # ============================================================================ +# ============================================================================= +# Agentic configs +# ----------------------------------------------------------------------------- +# All entries below run the agentic-coding scenario (Weka trace replay). +# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. 
+# ============================================================================= -qwen3.5-fp8-b300-sglang-agentic-hicache: - image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd +# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - runner: 'b200-dsv4' -> 'b200-dgxc' +dsv4-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.20.0-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # cpu offload only this iteration — none entries already validated in + # earlier runs (B200 25332045030: TP=8 1..32 + DEP=8 16..128 all 100%). + # Re-add when investigating regressions in offload=none. + - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + +# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. 
+qwen3.5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:nightly-dev-20260422-de962f32 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 - runner: b300 + runner: b200 precision: fp8 framework: sglang multinode: false @@ -9700,46 +9329,404 @@ qwen3.5-fp8-b300-sglang-agentic-hicache: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -kimik2.5-fp4-b200-vllm-agentic-lmcache: - image: vllm/vllm-openai:v0.21.0 - model: nvidia/Kimi-K2.5-NVFP4 +# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is +# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. +glm5-fp8-b200-sglang-agentic: + image: lmsysorg/sglang:v0.5.12-cu130 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: b200 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - runner: 'b200' -> 'b200-dgxc' +kimik2.5-int4-b200-vllm-agentic: + # Bumped from v0.19.1 — that release tripped a bug in + # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') + # during warmup `profile_run` on the agentic-coding path + # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the + # flashinfer fix. + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b200-dgxc - precision: fp4 + precision: int4 framework: vllm multinode: false scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } - - { tp: 8, ep: 1, offloading: lmcache, conc-list: [16, 24, 32, 36] } - - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, offloading: cpu, conc-list: [32, 64, 96, 128] } # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html # does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons -# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to -# origin/main so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape -# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. -# - additional-settings.CONFIG_FILE: points at the new agentic recipe under -# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh -# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC -# branch). 
Local-overlay pattern mirrors the existing 8k1k overlay. -dsv4-fp4-gb300-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 - model: deepseek-ai/DeepSeek-V4-Pro - model-prefix: dsv4 - # gb300-nv (not generic gb300) — the generic label is shared by both NV - # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards. - # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml +# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - runner: 'h200' -> 'h200-dgxc' +kimik2.5-int4-h200-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with + # the /home/sa-shared/gharunners/ai-perf-cache mount where aiperf's + # content-addressed dataset mmap cache lives. Other h200 pools (cw, nb) + # don't have that mount and would re-materialize 65 GB to /tmp every job. + runner: h200-dgxc + precision: int4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } + - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. +# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' +# - runner: 'b200' -> 'b200-dgxc' +kimik2.5-fp4-b200-vllm-agentic: + # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that + # cleared the agentic-coding warmup crash on max_model_len=131072 + + # prefix caching. + image: vllm/vllm-openai:v0.20.2 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + - { tp: 4, ep: 1, offloading: cpu, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; +# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' +# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' +# - model-prefix: 'dsr1' -> 'kimik2.5' +# - precision: 'fp8' -> 'fp4' +# - framework: 'sglang' -> 'vllm' +kimik2.5-fp4-b300-vllm-agentic: + # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM + # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the + # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted + # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the + # INT4 B300 sister already uses successfully. 
+ image: vllm/vllm-openai:v0.20.0-cu130 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } + +# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). +# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper +# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. +# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; +# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' +dsv4-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:deepseekv4-cu129 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [1, 2, 4, 8, 16] } + +# MTP variant of dsv4-fp8-h200-vllm. Uses the canonical v0.20.1 image +# (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds +# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. + +# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. 
+dsv4-fp4-b300-vllm-agentic: + image: vllm/vllm-openai:v0.20.0-cu130 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + # cpu offload only this iteration — none entries already validated in + # earlier runs. Re-add when investigating regressions in offload=none. + - { tp: 4, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } + - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + +# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; +# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' +gptoss-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64] } + - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } + +# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. 
+# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' +# - runner: 'b200' -> 'b200-dgxc' +minimaxm2.5-fp8-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: b200-dgxc + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B200 tp=4: compute ceiling ~50 (empirical), KV cliff ~48 (analytical). + # Push none past the KV cliff (96, 128) to make the no-offload throughput + # collapse visible; cpu range overlaps fully for same-conc comparison. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 96, 128] } + - { tp: 4, offloading: cpu, conc-list: [48, 56, 64, 96, 128] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; +# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so +# its fixed-seq-len sweep is unaffected. +# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' +minimaxm2.5-fp8-b300-vllm-agentic: + image: vllm/vllm-openai:v0.19.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: b300 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # B300 tp=4: compute ceiling ~60 (empirical), KV cliff ~85 (analytical). + # Push none past the KV cliff (96, 128, 192) so the no-offload throughput + # collapse is visible; cpu range overlaps fully so each high-conc point + # has a same-conc no-offload counterpart for direct comparison. + # Dense sampling between 96 and 128 (step=4) to resolve the sharp dropoff + # observed in v6 cpu data right past conc=96. 
+ - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } + +# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp4-b200-vllm-agentic: + image: vllm/vllm-openai:v0.19.0-cu130 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b200 + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + + # NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/MiniMax/MiniMax-M2.html + # does not have a B300-specific recipe, so this config reuses the existing + # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp8-h100-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h100 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H100 tp=4 ep=4: compute ceiling ~10 (empirical), KV cliff ~6 (analytical). 
+ # Best cpu-offload demo SKU — 4-conc-point window between cliffs. + # Dense sampling 4-12 covers both cliffs; conc 16 confirms compute plateau. + - duration: 1800 + search-space: + - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } + - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } + +# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). Metadata is +# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this +# PR adds an agentic-coding scenarios block that differs from main +# (either main had none or had a different conc/offload sweep). +# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. +minimaxm2.5-fp8-h200-vllm-agentic: + image: vllm/vllm-openai:v0.20.2 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: h200 + precision: fp8 + framework: vllm + multinode: false + scenarios: + agentic-coding: + # H200 tp=4: compute ceiling ~35 (empirical), KV cliff ~29 (analytical). + # cpu offload window conc 29-35 — dense sampling 24-40 captures both cliffs. + - duration: 1800 + search-space: + - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 24, 28, 32, 36, 48] } + - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } + +kimik2.5-int4-h100-vllm: + image: vllm/vllm-openai:v0.20.2 + model: moonshotai/Kimi-K2.5 + model-prefix: kimik2.5 + runner: h100 + precision: int4 + framework: vllm + multinode: false + scenarios: + # New entry, agentic-coding only: this PR intentionally does NOT add + # fixed-seq-len coverage for kimik2.5-int4 on H100 to keep the + # fixed-seq-len test surface identical to origin/main. + # H100 has 80 GB HBM per GPU (smallest in this set); the KV cliff arrives + # early. Sweep saturates conc=20 to keep total HBM headroom. 
+ agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 16, 20] } + - { tp: 8, offloading: cpu, conc-list: [1, 2, 4, 8, 12, 16, 20] } + +qwen3.5-fp8-b300-sglang-agentic-hicache: + image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd + model: Qwen/Qwen3.5-397B-A17B-FP8 + model-prefix: qwen3.5 + runner: b300 + precision: fp8 + framework: sglang + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + +kimik2.5-fp4-b200-vllm-agentic-lmcache: + image: vllm/vllm-openai:v0.21.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200-dgxc + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 24] } + - { tp: 8, ep: 1, offloading: lmcache, conc-list: [16, 24, 32, 36] } + - { tp: 4, ep: 1, offloading: none, conc-list: [8, 12, 14, 16, 18, 20] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [12, 14, 16, 18, 20, 22, 24, 32] } + +# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html +# does not have a B300-specific recipe, so this config reuses the existing +# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. + +# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons +# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to +# origin/main so its fixed-seq-len sweep is unaffected. +# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape +# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. 
+# - additional-settings.CONFIG_FILE: points at the new agentic recipe under +# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh +# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC +# branch). Local-overlay pattern mirrors the existing 8k1k overlay. +dsv4-fp4-gb300-dynamo-vllm-agentic: + image: vllm/vllm-openai:v0.21.0-ubuntu2404 + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + # gb300-nv (not generic gb300) — the generic label is shared by both NV + # and CW runner pools, so runs-on: gb300 lets CW runners pick up shards. + # The gb300-nv label is on NV runners only (per .github/configs/runners.yaml # + actual runner label listings). Pins agentic to the NVIDIA cluster # for initial validation. Drop -nv suffix to widen later. runner: gb300-nv @@ -9905,3 +9892,33 @@ qwen3.5-fp8-h100-sglang-agentic: search-space: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + +# Split from dsr1-fp4-b200-dynamo-trt: agentic-coding scenario only. 
+dsr1-fp4-b200-dynamo-trt-agentic: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post1 + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: b200-multinode + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + scenarios: + agentic-coding: + - duration: 300 + search-space: + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/cquil11/srt-slurm-nv/blob/cam/sa-submission-q2-2026/recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml + - "CONFIG_FILE=recipes/trtllm/b200-fp4/agentic/ctx1_gen1_tep8_128k_agentic.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false From 76aedd65780ddaabfb2cb0d630081a42e6cb72ac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:20:16 -0500 Subject: [PATCH 03/15] configs(master): bump all vllm images to v0.22.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps every non-comment `image:` line in both master configs to the unsuffixed v0.22.0 tag: - vllm/vllm-openai:* -> vllm/vllm-openai:v0.22.0 - vllm/vllm-openai-rocm:* -> vllm/vllm-openai-rocm:v0.22.0 Covers all prior variants: v0.17–v0.21 numbered releases, the -cu130 / -ubuntu2404 / deepseekv4-cu129 build-variant tags, and the nightly-* ROCm pins (which were holding DSv4 ROCm support that has since landed in the tagged release). Comment-line tag references in the agentic divergence change-log blocks are intentionally untouched so their "X -> Y" history reads correctly.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 24 +++++------ .github/configs/nvidia-master.yaml | 64 +++++++++++++++--------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 0495ebf16..ee4276a26 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -705,7 +705,7 @@ glm5.1-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 256 } kimik2.5-int4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi355x @@ -724,7 +724,7 @@ kimik2.5-int4-mi355x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi325x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi325x @@ -743,7 +743,7 @@ kimik2.5-int4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: mi300x @@ -896,7 +896,7 @@ minimaxm2.5-fp4-mi355x-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } minimaxm2.5-fp8-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -938,7 +938,7 @@ minimaxm2.5-fp8-mi325x-vllm: - { tp: 8, ep: 8, conc-start: 4, conc-end: 256 } gptoss-fp4-mi300x-vllm: - image: vllm/vllm-openai-rocm:v0.17.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -1379,7 +1379,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" kimik2.5-fp4-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-bf610c2f56764e1b30bc6065f4ceace3d6e59036 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 
runner: mi355x-disagg @@ -1433,7 +1433,7 @@ kimik2.5-fp4-mi355x-vllm-disagg: - "DECODE_NODES=2" minimaxm2.5-fp8-mi355x-vllm-disagg: - image: vllm/vllm-openai-rocm:nightly-a6682d1d259cca69a9ae737ea5608fbbe7520031 + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x-disagg @@ -2558,7 +2558,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and # includes all subsequent ROCm offload work. - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 runner: mi355x @@ -2591,7 +2591,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: # which enables SimpleCPUOffloadConnector on ROCm. Required for the # cpu-offload sweep points to use the same offload path as the NVIDIA # agentic-coding configs. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi355x @@ -2614,7 +2614,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: # - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi300x-vllm-agentic: # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. - image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi300x @@ -2637,7 +2637,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: # - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi325x-vllm-agentic: # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. 
- image: vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf + image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: mi325x @@ -2671,7 +2671,7 @@ qwen3.5-fp8-mi355x-sglang-agentic-hicache: - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } dsv4-fp4-mi355x-vllm-agentic: - image: vllm/vllm-openai-rocm:v0.21.0 + image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 04764831c..d7791fa11 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1804,7 +1804,7 @@ dsv4-fp4-b200-trt-mtp: # MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp4-b200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dsv4 @@ -2474,7 +2474,7 @@ kimik2.5-int4-b200-vllm: - { tp: 8, conc-start: 4, conc-end: 64 } kimik2.5-int4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b300 @@ -2535,7 +2535,7 @@ kimik2.5-fp4-b200-vllm: - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } kimik2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -2686,7 +2686,7 @@ dsr1-fp8-h200-sglang-mtp: # Uses the cu129 image. H200 has no FP4 path, so the FP4 indexer cache # flag is omitted. Max-model-len is pinned at 800k per the recipe. 
dsv4-fp8-h200-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2710,7 +2710,7 @@ dsv4-fp8-h200-vllm: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. dsv4-fp8-h200-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -2852,7 +2852,7 @@ dsv4-fp4-b300-trt-mtp: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp } dsv4-fp4-b300-vllm-mtp: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -4067,7 +4067,7 @@ minimaxm2.5-fp8-b200-vllm: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. minimaxm2.5-fp8-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -4123,7 +4123,7 @@ minimaxm2.5-fp4-b200-vllm: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. 
minimaxm2.5-fp4-b300-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b300 @@ -4150,7 +4150,7 @@ minimaxm2.5-fp4-b300-vllm: - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: h100 @@ -7883,7 +7883,7 @@ kimik2.5-fp4-gb200-dynamo-trt: dp-attn: true kimik2.5-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.18.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: gb200 @@ -7985,7 +7985,7 @@ kimik2.5-fp4-gb200-dynamo-vllm: dp-attn: true dsv4-fp4-b200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-multinode @@ -8041,7 +8041,7 @@ dsv4-fp4-b200-dynamo-vllm: dp-attn: true dsv4-fp4-gb200-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8141,7 +8141,7 @@ dsv4-fp4-gb200-dynamo-vllm: # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm. 
dsv4-fp4-gb200-dynamo-vllm-mtp2: - image: vllm/vllm-openai:v0.20.1-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb200 @@ -8221,7 +8221,7 @@ dsv4-fp4-gb200-dynamo-vllm-mtp2: dp-attn: true dsv4-fp4-b300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.1 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -8277,7 +8277,7 @@ dsv4-fp4-b300-dynamo-vllm: dp-attn: true dsv4-fp4-gb300-dynamo-vllm: - image: vllm/vllm-openai:v0.20.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-nv @@ -9295,7 +9295,7 @@ glm5-fp8-gb300-dynamo-sglang: # its fixed-seq-len sweep is unaffected. # - runner: 'b200-dsv4' -> 'b200-dgxc' dsv4-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200-dgxc @@ -9364,7 +9364,7 @@ kimik2.5-int4-b200-vllm-agentic: # during warmup `profile_run` on the agentic-coding path # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the # flashinfer fix. - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: b200-dgxc @@ -9387,7 +9387,7 @@ kimik2.5-int4-b200-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - runner: 'h200' -> 'h200-dgxc' kimik2.5-int4-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 # Pinned to h200-dgxc (NVIDIA DGX Cloud Slurm pool) so we hit a host with @@ -9417,7 +9417,7 @@ kimik2.5-fp4-b200-vllm-agentic: # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that # cleared the agentic-coding warmup crash on max_model_len=131072 + # prefix caching. 
- image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -9451,7 +9451,7 @@ kimik2.5-fp4-b300-vllm-agentic: # trtllm_fp4_block_scale_moe path. v0.20.0-cu130 is the Blackwell-targeted # build that has the full sm_10x/sm_11x/sm_12x kernel set and is what the # INT4 B300 sister already uses successfully. - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b300 @@ -9473,7 +9473,7 @@ kimik2.5-fp4-b300-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' dsv4-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:deepseekv4-cu129 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: h200 @@ -9496,7 +9496,7 @@ dsv4-fp8-h200-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.20.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 @@ -9519,7 +9519,7 @@ dsv4-fp4-b300-vllm-agentic: # its fixed-seq-len sweep is unaffected. 
# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' gptoss-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: b200 @@ -9541,7 +9541,7 @@ gptoss-fp4-b200-vllm-agentic: # - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' # - runner: 'b200' -> 'b200-dgxc' minimaxm2.5-fp8-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b200-dgxc @@ -9567,7 +9567,7 @@ minimaxm2.5-fp8-b200-vllm-agentic: # its fixed-seq-len sweep is unaffected. # - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' minimaxm2.5-fp8-b300-vllm-agentic: - image: vllm/vllm-openai:v0.19.1 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: b300 @@ -9593,7 +9593,7 @@ minimaxm2.5-fp8-b300-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp4-b200-vllm-agentic: - image: vllm/vllm-openai:v0.19.0-cu130 + image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 runner: b200 @@ -9616,7 +9616,7 @@ minimaxm2.5-fp4-b200-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h100-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h100 @@ -9639,7 +9639,7 @@ minimaxm2.5-fp8-h100-vllm-agentic: # (either main had none or had a different conc/offload sweep). # The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. 
minimaxm2.5-fp8-h200-vllm-agentic: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 runner: h200 @@ -9656,7 +9656,7 @@ minimaxm2.5-fp8-h200-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [24, 28, 32, 36, 40, 48] } kimik2.5-int4-h100-vllm: - image: vllm/vllm-openai:v0.20.2 + image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 runner: h100 @@ -9691,7 +9691,7 @@ qwen3.5-fp8-b300-sglang-agentic-hicache: - { tp: 4, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } kimik2.5-fp4-b200-vllm-agentic-lmcache: - image: vllm/vllm-openai:v0.21.0 + image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 runner: b200-dgxc @@ -9721,7 +9721,7 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC # branch). Local-overlay pattern mirrors the existing 8k1k overlay. dsv4-fp4-gb300-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 # gb300-nv (not generic gb300) — the generic label is shared by both NV @@ -9810,7 +9810,7 @@ dsv4-fp4-gb300-dynamo-vllm-agentic: # overlay (recipes/vllm/deepseek-v4/agentic/), so a change to the recipe # applies to both clusters with no duplication. dsv4-fp4-gb300-cw-dynamo-vllm-agentic: - image: vllm/vllm-openai:v0.21.0-ubuntu2404 + image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw From 6dede7b24c94f68a74acd537c552950ef74531af Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 13:46:25 -0500 Subject: [PATCH 04/15] configs(master): strip stale narrative comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes ~240 lines of slop comments that no longer earn their keep: - "Diverged from X (agentic-coding sibling)..." 
rationale blocks (24 occurrences) — the sibling split is now durable and the "preserved on main" framing isn't meaningful on a branch - "Net-new agentic recipes from chore/agentx-v0.3" PR-context headers - "agentic-coding sibling — temporarily disabled" + the entire commented-out qwen3.5-bf16-b200-sglang-agentic placeholder block - Orphan boundary comments ("# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720..." / "# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm nightly...") that were stranded by prior entry moves - Inline image-bump rationale that's now stale ("# Bumped from v0.19.1...", "# Same image as the INT4 sibling: v0.20.x...", "# Nightly carrying vllm-project/vllm@20cac26b...", "# v0.21.0 (released 2026-05-14)...") since everything is on v0.22.0 Verified via YAML deep-equal: 0 keys added/removed/modified in either file — purely comment removal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/amd-master.yaml | 110 ----------------------- .github/configs/nvidia-master.yaml | 140 ----------------------------- 2 files changed, 250 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ee4276a26..7f1c8192d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1826,7 +1826,6 @@ dsr1-fp4-mi355x-sglang-disagg-1k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260529 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -1937,7 +1936,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" - # 1*DEP8 + 1*DEP8 - spec-decoding: "mtp" conc-list: [ 128 ] @@ -1995,11 +1993,6 @@ dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -# DSv4-Pro FP4 on MI355X via SGLang. 
Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. dsv4-fp4-mi355x-sglang: image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2056,25 +2049,6 @@ dsv4-fp4-mi355x-sglang-mtp: - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048, spec-decoding: mtp } - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32, spec-decoding: mtp } -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. -# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. 
dsv4-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -2263,13 +2237,6 @@ glm5-fp8-mi325x-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ - dsr1-fp4-mi355x-sglang-disagg-mtp: image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519 model: amd/DeepSeek-R1-0528-MXFP4-v2 @@ -2498,23 +2465,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" - -# DSv4-Pro FP4 on MI355X via SGLang. Uses a rocm720 mi35x image built off the -# amd/deepseek_v4 branch in sgl-project/sglang; the SHA is encoded in the -# image tag, so bumping sglang is just an image tag bump here. Sweeps -# DP-attention on/off and EP=8. -# ============================================================================= -# Agentic configs -# ----------------------------------------------------------------------------- -# All entries below run the agentic-coding scenario (Weka trace replay). -# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. -# ============================================================================= - -# Diverged from qwen3.5-fp8-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-mi355x-sglang entry stays byte-identical to origin/main. 
qwen3.5-fp8-mi355x-sglang-agentic: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260414 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2529,11 +2479,6 @@ qwen3.5-fp8-mi355x-sglang-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from glm5.1-fp4-mi355x-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5.1-fp4-mi355x-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5.1-fp4-mi355x-sglang entry stays byte-identical to origin/main. glm5.1-fp4-mi355x-sglang-agentic: image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260415 model: amd/GLM-5.1-MXFP4 @@ -2549,15 +2494,7 @@ glm5.1-fp4-mi355x-sglang-agentic: # sglang manages KV eviction; mi355x glm5.1 caps at tp=4 conc=16 in fixed-seq, so cap conservatively - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from kimik2.5-fp4-mi355x-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:v0.21.0' kimik2.5-fp4-mi355x-vllm-agentic: - # v0.21.0 (released 2026-05-14) supersedes the prior nightly pin - # (51f22dcf...) which was carrying the SimpleCPUOffloadConnector ROCm - # cpu_offload_blocks > 0 fix. v0.21.0 is much newer than that fix and - # includes all subsequent ROCm offload work. image: vllm/vllm-openai-rocm:v0.22.0 model: amd/Kimi-K2.5-MXFP4 model-prefix: kimik2.5 @@ -2581,16 +2518,7 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } -# Diverged from minimaxm2.5-fp8-mi355x-vllm (agentic-coding sibling). 
Reasons below; -# the original minimaxm2.5-fp8-mi355x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.19.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi355x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b ("[Bug fix][KV Connector] - # add cpu_offload_blocks > 0 check before maybe_run_layer_kv_offload"), - # which enables SimpleCPUOffloadConnector on ROCm. Required for the - # cpu-offload sweep points to use the same offload path as the NVIDIA - # agentic-coding configs. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2608,12 +2536,7 @@ minimaxm2.5-fp8-mi355x-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 56, 64, 72, 96] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [48, 56, 64, 72, 96] } -# Diverged from minimaxm2.5-fp8-mi300x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi300x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai-rocm:v0.16.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi300x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2631,12 +2554,7 @@ minimaxm2.5-fp8-mi300x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 20, 24, 28, 32, 40, 48] } - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } -# Diverged from minimaxm2.5-fp8-mi325x-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-mi325x-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai-rocm:v0.18.0' -> 'vllm/vllm-openai-rocm:nightly-51f22dcfd068fe8f1e3192da2a1e825b930223cf' minimaxm2.5-fp8-mi325x-vllm-agentic: - # Nightly carrying vllm-project/vllm@20cac26b — see mi355x config above. image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 model-prefix: minimaxm2.5 @@ -2686,14 +2604,6 @@ dsv4-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 10, 12, 16] } - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [16, 24, 32, 40, 48] } -# Diverged from dsv4-fp4-mi355x-sglang (agentic-coding sibling). Reasons below; -# the original dsv4-fp4-mi355x-sglang entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# Image is identical to the base entry (rocm/sgl-dev DSv4 build). -# CONC ranges mirror dsv4-fp4-b200-vllm-agentic for cross-hardware -# comparability. Offload sweep is none-only (SGLang has no equivalent of -# vLLM's SimpleCPUOffloadConnector path that we exercise on b200). dsv4-fp4-mi355x-sglang-agentic: image: rocm/sgl-dev:rocm720-mi35x-0363e6c-20260509-DSv4 model: deepseek-ai/DeepSeek-V4-Pro @@ -2708,23 +2618,3 @@ dsv4-fp4-mi355x-sglang-agentic: search-space: - { tp: 8, offloading: none, conc-list: [16, 32, 64] } - { tp: 8, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } - -# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm -# nightly image. DSv4 base ROCm support (vllm-project/vllm#40871) merged -# on 2026-05-05, so any nightly built after that includes the -# DeepseekV4ForCausalLM model class. -# -# IMPORTANT: pin to a digest-suffixed nightly tag rather than the -# floating `:nightly`. launch_mi355x-amds.sh caches enroot squashfs -# files keyed on the image string and short-circuits re-import if the -# file already exists, so the floating tag silently keeps a stale build -# even after Docker Hub updates `:nightly`. 
-# -# DeepSeek-V4-Pro is FP4+FP8 mixed (FP4 MoE expert weights, FP8 for the -# rest); InferenceX classifies this as fp4 — same as the sister sglang -# and atom DSv4 mi355x entries below. Image and serving flags follow the -# validated recipe from vllm-project/recipes#433: AITER+AITER_LINEAR, mp -# executor, triton_unfused MoE (required for the FP4 expert format), -# async scheduling, max-num-seqs=128, max-num-batched-tokens=8192, -# gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 -# probe to validate the ROCm DP+EP path. diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d7791fa11..77c5d17ce 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2064,23 +2064,6 @@ qwen3.5-bf16-b200-sglang-mtp: search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } -# agentic-coding sibling — temporarily disabled, blocked by e2e-tests.yml -# artifact-name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads -# as `bmk_agentic_*`). Re-enable once that workflow is aligned. 
-# qwen3.5-bf16-b200-sglang-agentic: -# image: lmsysorg/sglang:v0.5.12-cu130 -# model: Qwen/Qwen3.5-397B-A17B -# model-prefix: qwen3.5 -# runner: b200 -# precision: bf16 -# framework: sglang -# multinode: false -# scenarios: -# agentic-coding: -# - duration: 1800 -# search-space: -# - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - qwen3.5-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2331,7 +2314,6 @@ qwen3.5-fp8-b200-sglang-mtp: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - qwen3.5-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -8818,7 +8800,6 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true - qwen3.5-fp8-h100-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9277,23 +9258,6 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false -# ============================================================================ -# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). -# Recipes that ALREADY existed on main were intentionally left at main's version -# to preserve main behavior; PR-branch modifications to those recipes are NOT -# brought in here. -# ============================================================================ -# ============================================================================= -# Agentic configs -# ----------------------------------------------------------------------------- -# All entries below run the agentic-coding scenario (Weka trace replay). -# Above this divider: fixed-seq-len / synthetic / prefix-share scenarios only. -# ============================================================================= - -# Diverged from dsv4-fp4-b200-vllm (agentic-coding sibling). 
Reasons below; -# the original dsv4-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200-dsv4' -> 'b200-dgxc' dsv4-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9312,11 +9276,6 @@ dsv4-fp4-b200-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } -# Diverged from qwen3.5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's qwen3.5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original qwen3.5-fp8-b200-sglang entry stays byte-identical to origin/main. qwen3.5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:nightly-dev-20260422-de962f32 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -9331,11 +9290,6 @@ qwen3.5-fp8-b200-sglang-agentic: search-space: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } -# Diverged from glm5-fp8-b200-sglang (agentic-coding sibling). Metadata is -# identical to origin/main's glm5-fp8-b200-sglang; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original glm5-fp8-b200-sglang entry stays byte-identical to origin/main. 
glm5-fp8-b200-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -9351,19 +9305,7 @@ glm5-fp8-b200-sglang-agentic: # sglang manages its own KV eviction via radix cache, so just sweep concurrency on offloading=none - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 64, 128] } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - runner: 'b200' -> 'b200-dgxc' kimik2.5-int4-b200-vllm-agentic: - # Bumped from v0.19.1 — that release tripped a bug in - # `flashinfer_trtllm_mxint4_moe` ('list' object has no attribute 'to') - # during warmup `profile_run` on the agentic-coding path - # (max_model_len=131072 + prefix caching enabled). v0.20.x carries the - # flashinfer fix. image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 model-prefix: kimik2.5 @@ -9382,10 +9324,6 @@ kimik2.5-int4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-int4-h200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-int4-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - runner: 'h200' -> 'h200-dgxc' kimik2.5-int4-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: moonshotai/Kimi-K2.5 @@ -9405,18 +9343,7 @@ kimik2.5-int4-h200-vllm-agentic: - { tp: 8, offloading: none, conc-list: [1, 2, 3, 4, 5, 6, 7] } - { tp: 8, offloading: cpu, conc-list: [6, 7, 8, 9, 10, 11, 12, 13, 14] } -# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html -# does not have a B300-specific recipe, so this config reuses the existing -# Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.17.0' -> 'vllm/vllm-openai:v0.20.2' -# - runner: 'b200' -> 'b200-dgxc' kimik2.5-fp4-b200-vllm-agentic: - # Same image as the INT4 sibling: v0.20.x carries the flashinfer fix that - # cleared the agentic-coding warmup crash on max_model_len=131072 + - # prefix caching. image: vllm/vllm-openai:v0.22.0 model: nvidia/Kimi-K2.5-NVFP4 model-prefix: kimik2.5 @@ -9437,14 +9364,6 @@ kimik2.5-fp4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' kimik2.5-fp4-b300-vllm-agentic: # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the @@ -9465,13 +9384,6 @@ kimik2.5-fp4-b300-vllm-agentic: - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } - { tp: 8, ep: 1, offloading: cpu, conc-list: [1, 2, 4, 8, 16, 32, 40, 48, 56, 64] } -# DeepSeek-V4-Pro H200 single-node with SGLang (Marlin FP8, TP-only). -# Pinned to the h200-dgxc-slurm runner pool because the deepseek-v4-hopper -# image needs the /ix mount layout that only launch_h200-dgxc-slurm.sh sets up. -# Diverged from dsv4-fp8-h200-vllm (agentic-coding sibling). Reasons below; -# the original dsv4-fp8-h200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.20.1@sha256:9eff9734a30b6713a8566217d36f8277630fd2d31cec7f0a0292835901a23aa4' -> 'vllm/vllm-openai:deepseekv4-cu129' dsv4-fp8-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9490,11 +9402,6 @@ dsv4-fp8-h200-vllm-agentic: # (the non-MTP entry above is still on the deepseekv4-cu129 tag) and adds # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. 
dsv4-fp4-b300-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9514,10 +9421,6 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } -# Diverged from gptoss-fp4-b200-vllm (agentic-coding sibling). Reasons below; -# the original gptoss-fp4-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.15.1' -> 'vllm/vllm-openai:v0.19.1' gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: openai/gpt-oss-120b @@ -9535,11 +9438,6 @@ gptoss-fp4-b200-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } - { tp: 8, offloading: cpu, conc-list: [64, 96, 128, 192, 256] } -# Diverged from minimaxm2.5-fp8-b200-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b200-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' -# - runner: 'b200' -> 'b200-dgxc' minimaxm2.5-fp8-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9562,10 +9460,6 @@ minimaxm2.5-fp8-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP8 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. 
-# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' minimaxm2.5-fp8-b300-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9587,11 +9481,6 @@ minimaxm2.5-fp8-b300-vllm-agentic: - { tp: 4, offloading: none, conc-list: [1, 2, 4, 8, 16, 32, 48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } - { tp: 4, offloading: cpu, conc-list: [48, 64, 96, 100, 104, 108, 112, 116, 120, 124, 128, 192] } -# Diverged from minimaxm2.5-fp4-b200-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp4-b200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp4-b200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: nvidia/MiniMax-M2.5-NVFP4 @@ -9610,11 +9499,6 @@ minimaxm2.5-fp4-b200-vllm-agentic: # does not have a B300-specific recipe, so this config reuses the existing # MiniMax-M2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h100-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h100-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9633,11 +9517,6 @@ minimaxm2.5-fp8-h100-vllm-agentic: - { tp: 4, ep: 4, offloading: none, conc-list: [1, 2, 4, 5, 6, 7, 8, 10, 12, 16] } - { tp: 4, ep: 4, offloading: cpu, conc-list: [5, 6, 7, 8, 10, 12] } -# Diverged from minimaxm2.5-fp8-h200-vllm (agentic-coding sibling). 
Metadata is -# identical to origin/main's minimaxm2.5-fp8-h200-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original minimaxm2.5-fp8-h200-vllm entry stays byte-identical to origin/main. minimaxm2.5-fp8-h200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -9711,15 +9590,6 @@ kimik2.5-fp4-b200-vllm-agentic-lmcache: # does not have a B300-specific recipe, so this config reuses the existing # Kimi-K2.5 FP4 B200 vLLM recipe as-is until B300-specific tuning is available. -# Diverged from dsv4-fp4-gb300-dynamo-vllm (agentic-coding sibling). Reasons -# below; the original dsv4-fp4-gb300-dynamo-vllm entry is left identical to -# origin/main so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding; single 1p6d shape -# mirroring the conc=192 point in the base entry's fixed-seq-len sweep. -# - additional-settings.CONFIG_FILE: points at the new agentic recipe under -# recipes/vllm/deepseek-v4/agentic/, which runners/launch_gb300-nv.sh -# overlays into the cquil11/srt-slurm-nv fork at run time (the IS_AGENTIC -# branch). Local-overlay pattern mirrors the existing 8k1k overlay. dsv4-fp4-gb300-dynamo-vllm-agentic: image: vllm/vllm-openai:v0.22.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -9868,16 +9738,6 @@ dsv4-fp4-gb300-cw-dynamo-vllm-agentic: ep: 8 dp-attn: true -# Diverged from qwen3.5-fp8-h100-sglang (agentic-coding sibling). Reasons below; -# the original qwen3.5-fp8-h100-sglang entry stays byte-identical to origin/main -# so its fixed-seq-len sweep is unaffected. -# - scenarios: replaced fixed-seq-len with agentic-coding. -# - runner: 'h100' -> 'h100-dgxc' (agentic runs need the dgxc-slurm cluster). -# Image is identical to the base entry (lmsysorg/sglang:v0.5.12-cu130). -# CONC range conservative for H100's 80 GB HBM3 under the long-ISL with- -# subagents corpus. 
hicache arm capped at conc 16 since high-conc + hicache -# tends to flake on first runs and conc 16 covers the cliff. The bench script -# sets WEKA_LOADER_OVERRIDE to the 256k-capped corpus variant. qwen3.5-fp8-h100-sglang-agentic: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 From 32572755524d98283c5339350a049fd7c6aad43d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 14:42:39 -0500 Subject: [PATCH 05/15] chore(aiperf): bump submodule for 060226 loader allowlist fix Picks up SemiAnalysisAI/aiperf@47e6e206, which adds the 060226 and 060226_256k loader names to the inferencex-agentx-mvp scenario's require_loader allowlist. Without this bump, dispatching any non-DSv4 agentic run on this branch fails preflight because benchmark_lib.sh now defaults the loader to the 060226 corpus. Co-Authored-By: Claude Opus 4.7 (1M context) --- utils/aiperf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/aiperf b/utils/aiperf index de3ad1c18..47e6e2060 160000 --- a/utils/aiperf +++ b/utils/aiperf @@ -1 +1 @@ -Subproject commit de3ad1c18b704a60c43bcc5f76dfb2ac7e346fd1 +Subproject commit 47e6e206001a85a3cc4c6212a1e0425f045bbcb3 From 321fd445c301c5c52901b8f37e295ee38a10f39f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 14:58:48 -0500 Subject: [PATCH 06/15] (testing) b300 dsv4 simple offloading --- .github/configs/nvidia-master.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 77c5d17ce..5b0792d08 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9403,7 +9403,9 @@ dsv4-fp8-h200-vllm-agentic: # --speculative-config '{"method":"mtp","num_speculative_tokens":2}'. 
dsv4-fp4-b300-vllm-agentic: - image: vllm/vllm-openai:v0.22.0 + # image: vllm/vllm-openai:v0.22.0 + # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f + image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 32839349559d13a51537879b32dd05e8f60e0661 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 15:22:03 -0500 Subject: [PATCH 07/15] runners(b300-nv): remap container UID to root for apt-get install Same root cause as 967c50ca (h200-dgxc-slurm fix): vllm/vllm-openai images ship as non-root, and on b300-nv the pyxis/enroot config does NOT implicitly remap the calling user to UID 0 inside the container. benchmark_lib.sh::install_agentic_deps runs apt-get install -y git, which fails with "dpkg: error: requested operation requires superuser privilege" (see run 26844610474 / dsv4 b300 simple offloading). Adding --container-remap-root to the srun line matches b200-dgxc and h200-dgxc-slurm behavior; benchmark_lib.sh stays untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cc..cb4a634c3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -387,6 +387,7 @@ else --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \ --no-container-mount-home \ + --container-remap-root \ --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash "$BENCH_SCRIPT" From 360bcf089130808b0f2a3a249dfdd38e19772c1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 15:45:01 -0500 Subject: [PATCH 08/15] benchmarks(agentic): skip hf download when MODEL_PATH is pre-staged Replaces the simple unguarded download in every agentic recipe: - if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi with the same MODEL_PATH-aware logic that the fixed-seq-len B300 recipes already use: if [[ -n "${MODEL_PATH:-}" ]]; then if [[ ! -d "$MODEL_PATH" || empty ]]; then hf download "$MODEL" --local-dir "$MODEL_PATH" fi else hf download "$MODEL" export MODEL_PATH="$MODEL" fi Effect: on clusters where launch_*.sh exports MODEL_PATH pointing at a pre-staged on-node copy (e.g. b300-nv sets it to /scratch/models/), the agentic recipe now correctly short-circuits the hf-download instead of re-pulling 700 GB of DSv4-Pro into $HOME/.cache/huggingface every run. Touches 33 scripts; same edit in each. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsr1_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 12 +++++++++++- .../single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 12 +++++++++++- benchmarks/single_node/agentic/dsv4_fp8_h200.sh | 12 +++++++++++- benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/glm5_fp8_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_h200.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 12 +++++++++++- benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh | 12 +++++++++++- .../single_node/agentic/kimik2.5_fp4_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_b200.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/kimik2.5_int4_h200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp4_b200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_b200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_b300.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_h100.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_h200.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 12 +++++++++++- .../single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 12 +++++++++++- 
benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh | 12 +++++++++++- .../single_node/agentic/qwen3.5_fp8_b300_sglang.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh | 12 +++++++++++- benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh | 12 +++++++++++- .../single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 12 +++++++++++- 33 files changed, 363 insertions(+), 33 deletions(-) diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index f9955adc7..23cf71e7d 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -17,7 +17,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index ff76b768d..c67fc7ebf 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi amd-smi || true diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 108347479..7bc18ce22 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -38,7 +38,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index f6748a5f8..7a130673d 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -32,7 +32,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index 99aec25fe..ab2897d88 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -31,7 +31,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index 0a0177983..c1e2f50b3 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 500b456f5..5987a789e 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 259c19586..3d601193f 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index 6e921db58..ec8c4c9f8 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index 557986b0d..443bc8bcc 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index 1592a8d5c..7a93c71c5 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index eb1883ff1..8ca6d805c 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi amd-smi || true diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 99e29c819..6e41756a0 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi # If the machine runs a MEC FW older than 177, RCCL cannot reclaim some memory. diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index ad0b4495a..e5c87b14a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8cebe4f20..8ab9672af 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -21,7 +21,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index fd0ce3677..734f63766 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -33,7 +33,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index 697d3fa45..ab91c99c5 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index 2fd3b381c..fa867d976 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 97929e43e..08549e93a 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -16,7 +16,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 38ef72b56..195b285c6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index 4ce131cba..af7c7a216 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! 
-d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index 9f2d83a0b..d3ea641ef 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index d21690da6..48f2ab388 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. 
+if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index ed59991cb..15e5798c6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index 260bbdc68..add2a8fa0 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index edac27a45..57746eef6 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index 39dd63293..eac820aa0 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -24,7 +24,17 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index 4ba87976b..ee40e1855 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 3432af5c9..4d39f2c81 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -20,7 +20,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 9d9c1d7d5..d926288ae 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 95f0397a0..9db72e569 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -27,7 +27,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index aef9650ca..a78ee87b9 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -19,7 +19,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index 5427d0d31..f5e2d2e6f 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -24,7 +24,17 @@ if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. 
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true From 57d4adb4fb5fbebc478f628c522a0a49cec9e072 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 16:23:34 -0500 Subject: [PATCH 09/15] benchmarks(agentic): launch server from MODEL_PATH, not the HF id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Companion to 360bcf08. That commit made the agentic recipes skip hf-download when MODEL_PATH was already pre-staged — but the recipes still invoked the server with the HF id ("vllm serve \$MODEL" / "--model-path \$MODEL"), so the engine looked up the HF cache (now empty, because we just skipped the download) and tried to download from scratch itself. With the model not in cache, vllm/sglang would deadlock in the auto-download path rather than fall through to a clean error. This commit aligns every agentic recipe with the fixed-seq-len B300 pattern verbatim: vllm serve "$MODEL_PATH" --served-model-name "$MODEL" python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" Net effect: server loads weights directly from /scratch/models/<model>/ (or wherever the launch script staged the model) and reports the HF id as the served-model-name for downstream tooling. Touches all 33 agentic scripts.
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsr1_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh | 2 +- benchmarks/single_node/agentic/dsv4_fp8_h200.sh | 2 +- benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/glm5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_h100.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_h200.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh | 2 +- benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_b200.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_h100.sh | 2 +- benchmarks/single_node/agentic/kimik2.5_int4_h200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh | 2 +- benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh | 2 +- benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh | 2 +- 
benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh | 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) diff --git a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh index 23cf71e7d..16dc3bfd5 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_b200.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_b200.sh @@ -43,7 +43,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path $MODEL \ +--model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh index c67fc7ebf..3b2561fe2 100755 --- a/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/dsr1_fp4_mi355x.sh @@ -44,7 +44,7 @@ export ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh index 7bc18ce22..e80008f71 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b200_vllm.sh @@ -231,7 +231,7 @@ export VLLM_FLOAT32_MATMUL_PRECISION=high { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --trust-remote-code diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 7a130673d..88f4b38f5 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -123,7 +123,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export 
VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve "$MODEL" \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port "$PORT" \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh index ab2897d88..029c8ea7f 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_mi355x_sglang.sh @@ -145,7 +145,7 @@ fi echo "Starting sglang server..." python3 -m sglang.launch_server \ - --model-path "$MODEL" \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ diff --git a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh index c1e2f50b3..799c2bf26 100755 --- a/benchmarks/single_node/agentic/dsv4_fp8_h200.sh +++ b/benchmarks/single_node/agentic/dsv4_fp8_h200.sh @@ -50,7 +50,7 @@ export PYTHONNOUSERSITE=1 # Per recipe: EP + DP=8 (no --tensor-parallel-size). TP from search space is # used for GPU allocation by the runner and as the DP size. -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh index 5987a789e..3b85a31cd 100755 --- a/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/glm5.1_fp4_mi355x.sh @@ -52,7 +52,7 @@ echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/glm5_fp8_b200.sh b/benchmarks/single_node/agentic/glm5_fp8_b200.sh index 3d601193f..b3597cf52 100755 --- a/benchmarks/single_node/agentic/glm5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/glm5_fp8_b200.sh @@ -49,7 +49,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh index ec8c4c9f8..80d70e724 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_b200.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_b200.sh @@ -63,7 +63,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh index 443bc8bcc..13e32d315 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_h100.sh @@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh index 7a93c71c5..e0d967246 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_h200.sh +++ 
b/benchmarks/single_node/agentic/gptoss_fp4_h200.sh @@ -67,7 +67,7 @@ export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 export VLLM_MXFP4_USE_MARLIN=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --config "$RESULT_DIR/config.yaml" \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh index 8ca6d805c..ff597c9a4 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi300x.sh @@ -75,7 +75,7 @@ esac echo "Starting vllm server..." -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --attention-backend ROCM_AITER_UNIFIED_ATTN \ diff --git a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh index 6e41756a0..1f8c29351 100755 --- a/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh +++ b/benchmarks/single_node/agentic/gptoss_fp4_mi325x.sh @@ -74,7 +74,7 @@ esac echo "Starting vllm server..." 
-vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --attention-backend ROCM_AITER_UNIFIED_ATTN \ diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh index e5c87b14a..34b45c9ec 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b200.sh @@ -188,7 +188,7 @@ export VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh index 8ab9672af..9667003e1 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_b300.sh @@ -95,7 +95,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 734f63766..139b12256 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -796,7 +796,7 @@ export PYTHONNOUSERSITE=1 { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh index ab91c99c5..5685f098c 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_b200.sh @@ -55,7 +55,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 
export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh index fa867d976..cb6c67f4b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h100.sh @@ -55,7 +55,7 @@ echo "Starting vllm server..." export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh index 08549e93a..1bfa0c33b 100755 --- a/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh +++ b/benchmarks/single_node/agentic/kimik2.5_int4_h200.sh @@ -66,7 +66,7 @@ echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 export VLLM_USE_FLASHINFER_MOE_INT4=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --gpu-memory-utilization 0.95 \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh index 195b285c6..b4a63eff3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_b200.sh @@ -68,7 +68,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ $PARALLEL_ARGS \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh index af7c7a216..0724aba5b 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b200.sh @@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh index d3ea641ef..c291a2ceb 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_b300.sh @@ -72,7 +72,7 @@ export TORCH_CUDA_ARCH_LIST="10.0" export PYTHONNOUSERSITE=1 export VLLM_FLOAT32_MATMUL_PRECISION=high -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh index 48f2ab388..516bc4696 100755 --- 
a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h100.sh @@ -68,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh index 15e5798c6..e6343b8ba 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_h200.sh @@ -68,7 +68,7 @@ echo "Starting vllm server..." export TORCH_CUDA_ARCH_LIST="9.0" export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh index add2a8fa0..8988316d3 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi300x.sh @@ -74,7 +74,7 @@ echo "Starting vllm server..." export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh index 57746eef6..caa70de63 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi325x.sh @@ -71,7 +71,7 @@ echo "Starting vllm server..." 
export VLLM_ROCM_USE_AITER=1 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh index eac820aa0..cd114fe96 100755 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp8_mi355x.sh @@ -75,7 +75,7 @@ export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 export PYTHONNOUSERSITE=1 -vllm serve $MODEL \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" \ --host 0.0.0.0 \ --port $PORT \ --tensor-parallel-size=$TP \ diff --git a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh index ee40e1855..d06d82ec8 100755 --- a/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_bf16_b200.sh @@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh index 4d39f2c81..ad49b2b67 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b200.sh @@ -49,7 +49,7 @@ export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server \ ---model-path=$MODEL \ +--model-path=$MODEL_PATH --served-model-name=$MODEL \ --host=0.0.0.0 \ --port=$PORT \ --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh index 
d926288ae..4f9b12659 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_b300_sglang.sh @@ -95,7 +95,7 @@ export SGLANG_ENABLE_FLASHINFER_GEMM=true { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh index 9db72e569..b280fff8b 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_h100.sh @@ -108,7 +108,7 @@ fi { set +x; } 2>/dev/null SGLANG_CMD=( python3 -m sglang.launch_server - --model-path="$MODEL" + --model-path="$MODEL_PATH" --served-model-name="$MODEL" --host=0.0.0.0 --port="$PORT" --served-model-name "Qwen/Qwen3.5-397B-A17B-FP8" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index a78ee87b9..ff901b674 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -46,7 +46,7 @@ export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh index f5e2d2e6f..cdded8860 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -120,7 +120,7 @@ export PYTHONNOUSERSITE=1 SGLANG_CMD=( python3 -m sglang.launch_server --attention-backend triton - --model-path "$MODEL" + --model-path "$MODEL_PATH" --served-model-name "$MODEL" 
--host=0.0.0.0 --port "$PORT" --tensor-parallel-size "$TP" From 1bccc5cacf281a1221dac8f0558248f220786311 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 17:40:48 -0500 Subject: [PATCH 10/15] benchmarks(dsv4-b300): enable VLLM_PREFIX_CACHE_RETENTION_INTERVAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The custom cquil/vllm-openai image integrates vllm-project/vllm#43447, which fixes the DSv4 sliding-window prefix-cache eviction issue. But the fix is opt-in via VLLM_PREFIX_CACHE_RETENTION_INTERVAL — without setting it, vllm falls back to the legacy cache-every-segment path that this PR was written to repair, so the trace-replay cache hit rate stays near 0% even though the patched code is loaded. Sets the env var to 32768 (32k tokens), matching the value the PR author validated to take cache hit rate from 0% -> 74% on a comparable agentic trace-replay benchmark. On stock vllm images that don't carry the patch, the env var is simply ignored — safe to land. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++++ benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5b0792d08..4d7785c2a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9422,6 +9422,10 @@ dsv4-fp4-b300-vllm-agentic: - { tp: 8, offloading: cpu, conc-list: [16, 32, 64] } - { tp: 4, ep: 4, dp-attn: true, offloading: cpu, conc-list: [64, 128, 256] } - { tp: 8, ep: 8, dp-attn: true, offloading: cpu, conc-list: [128, 256, 512] } + - { tp: 4, offloading: none, conc-list: [16, 32, 64] } + - { tp: 8, offloading: none, conc-list: [16, 32, 64] } + - { tp: 4, ep: 4, dp-attn: true, offloading: none, conc-list: [64, 128, 256] } + - { tp: 8, ep: 8, dp-attn: true, offloading: none, conc-list: [128, 256, 512] } gptoss-fp4-b200-vllm-agentic: image: vllm/vllm-openai:v0.22.0 diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 88f4b38f5..837345423 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -52,6 +52,13 @@ install_agentic_deps # DeepSeek-V4-Pro weights are large; engine startup can exceed default 600s. export VLLM_ENGINE_READY_TIMEOUT_S=3600 +# vllm-project/vllm#43447: keep SWA prefix-cache tails sparsely so transient +# sliding-window allocations don't evict useful prefix entries. 32k matches +# the trace-replay tuning the PR author validated (0% -> 74% hit rate). +# Requires the custom image (cquil/vllm-openai:*-6c529f30...) that carries +# the patch; on stock images the env var is ignored. 
+export VLLM_PREFIX_CACHE_RETENTION_INTERVAL=32768 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" From 094610734aa2e88a4ccbfad503002d7810bc8f8f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 23:04:36 -0500 Subject: [PATCH 11/15] configs(dsv4-b300-vllm-agentic): bump cquil image to 6c529f30 for retention-interval env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 7ead0a0 only carried the "Prepend uncached blocks in SWA free()" hunk of PR vllm-project/vllm#43447 — it did NOT modify vllm/envs.py to register the VLLM_PREFIX_CACHE_RETENTION_INTERVAL env var. That registration didn't land until commit 7c909f8 in the PR, and 6c529f30 is the latest merge of main into the PR branch. Effect: the export in dsv4_fp4_b300_vllm.sh (1bccc5ca) finally takes effect — vllm stops logging "Unknown vLLM environment variable detected" and actually activates the SWA prefix-cache retention path. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4d7785c2a..380c799e1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9404,8 +9404,8 @@ dsv4-fp8-h200-vllm-agentic: dsv4-fp4-b300-vllm-agentic: # image: vllm/vllm-openai:v0.22.0 - # includes https://github.com/vllm-project/vllm/pull/43447 up to 7ead0a0f27fc2b34efdcc8a557d542c5a372306f - image: cquil/vllm-openai:v0.22.0-7ead0a0f27fc2b34efdcc8a557d542c5a372306f + # includes https://github.com/vllm-project/vllm/pull/43447 up to 6c529f3001ab8bf44b1657e779dc54b622397045 + image: cquil/vllm-openai:v0.22.0-6c529f3001ab8bf44b1657e779dc54b622397045 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 From 38c365c77bf0cd1214ea6d1b81f7f7ed2c56b750 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Jun 2026 23:09:58 -0500 Subject: [PATCH 12/15] benchmarks(dsv4-b300-vllm): override trace loader to 060226 (v6) DSv4 recipes inherit the benchmark_lib carveout that defaults to the 052726 corpus for backward-compat with prior published baselines. This recipe is opting out to ride the v6 060226 corpus that all non-DSv4 recipes already use, exercising the newer CC versions / longer-tail trace mix. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh index 837345423..fdb7a49b6 100755 --- a/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/agentic/dsv4_fp4_b300_vllm.sh @@ -46,6 +46,8 @@ fi nvidia-smi # ---- Resolve traces and install deps ---------------------------------------- +# Opt this recipe out of the DSv4 052726 default; use the v6 corpus. 
+export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226 resolve_trace_source install_agentic_deps From ee8d74391ba7674ba77f67c1e764fc200be1956d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:33:00 +0900 Subject: [PATCH 13/15] [AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Qwen hicache config Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 35 ++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7f1c8192d..134af929a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,6 +872,21 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } +minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/MiniMax-M2.5-MXFP4 + model-prefix: minimaxm2.5 + runner: mi355x + precision: fp4 + framework: vllm + multinode: false + scenarios: + agentic-coding: + - duration: 1800 + search-space: + - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } + - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } + minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2518,6 +2533,16 @@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } +kimik2.5-fp4-mi355x-vllm-agentic-lmcache: + image: vllm/vllm-openai-rocm:v0.22.0 + model: amd/Kimi-K2.5-MXFP4 + model-prefix: kimik2.5 + agentic-coding: + - duration: 1800 + search-space: + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } + minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: 
MiniMaxAI/MiniMax-M2.5 @@ -2574,19 +2599,15 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 + image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x - precision: fp8 - framework: sglang - multinode: false - scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } - - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } + - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 From 616f4db634e13e57b3244853dd317bd3f8a5bd1c Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:40:55 +0900 Subject: [PATCH 14/15] [AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scripts Co-Authored-By: Claude Sonnet 4.6 --- .../agentic/kimik2.5_fp4_mi355x.sh | 674 ++---------------- .../agentic/minimaxm2.5_fp4_mi355x.sh | 256 +++++++ .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 ++- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 152 ---- 4 files changed, 397 insertions(+), 797 deletions(-) create mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh delete mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index 139b12256..d05b27253 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,15 +14,11 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR 
DURATION EP_SIZE - -# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. -# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this -# script we need the concrete value so AgentX filters prompt+max_tokens against -# the same limit vLLM enforces. -if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then - MAX_MODEL_LEN=262144 -fi +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -33,557 +29,22 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 060226 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -write_lmcache_rocm_mp_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/sitecustomize.py" <<'PY' -"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" - -import os -import threading - -if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": - import builtins - import sys - - _orig_import = builtins.__import__ - - def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: - _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator - - if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): - return - - _orig_init = _LazyMemoryAllocator.__init__ - _orig_allocate = _LazyMemoryAllocator.allocate - _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate - - def _expand_to(self, target_size: int) -> None: - target_size = min( - self._final_size, - _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), - ) - lock = self._agentic_rocm_demand_expand_lock - with lock: - if target_size <= self._curr_size: - return - - start_size = self._curr_size - while self._curr_size < target_size: - commit_start = self._curr_size - commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) - while self._curr_size < commit_target: - self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) - self._curr_size += self.PIN_CHUNK_SIZE - self._commit_expansion(self._curr_size - commit_start) - - self._log_expansion_progress(self._curr_size - start_size) - - def 
_retry_with_demand_expansion(self, allocate_once): - obj = allocate_once() - step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) - step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) - - while obj is None and self._curr_size < self._final_size: - _expand_to(self, self._curr_size + step_bytes) - obj = allocate_once() - - return obj - - def _patched_init(self, *args, **kwargs): - _orig_init(self, *args, **kwargs) - self._agentic_rocm_demand_expand_lock = threading.Lock() - - # LMCache MP's upstream LazyMemoryAllocator currently expands to - # the final pinned size in a background thread. On ROCm Kimi TP4, - # vLLM reaches KV-cache registration only after that 2.5 TB pool - # is fully pinned, and the server-side IPC open path can stall - # before acknowledging register_kv_caches. Keep the same final - # capacity, but pin/commit extra host memory only when L1 - # allocations actually need it. - self._stop_expand.set() - self._expand_thread.join() - _lazy_memory_allocator.logger.info( - "Agentic ROCm patch: using demand-driven LMCache pinned " - "memory expansion; final capacity remains %s MB", - self._final_size >> 20, - ) - - def _patched_allocate( - self, - shapes, - dtypes, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), - ) - - def _patched_batched_allocate( - self, - shapes, - dtypes, - batch_size, - fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, - allocator_type=None, - ): - return _retry_with_demand_expansion( - self, - lambda: _orig_batched_allocate( - self, shapes, dtypes, batch_size, fmt, allocator_type - ), - ) - - _LazyMemoryAllocator.__init__ = _patched_init - _LazyMemoryAllocator.allocate = _patched_allocate - _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate - _LazyMemoryAllocator._agentic_rocm_demand_patch = True - - def 
_patch_l1_memory_manager(_memory_manager) -> None: - _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) - _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) - if _L1MemoryManager is None or _LazyMemoryAllocator is None: - return - if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): - return - - _orig_get_memory_usage = _L1MemoryManager.get_memory_usage - - def _patched_get_memory_usage(self): - allocator = getattr(self, "_allocator", None) - if isinstance(allocator, _LazyMemoryAllocator): - address_manager = allocator.get_address_manager() - used_size = ( - address_manager.get_heap_size() - address_manager.get_free_size() - ) - return used_size, allocator._final_size - return _orig_get_memory_usage(self) - - _L1MemoryManager.get_memory_usage = _patched_get_memory_usage - _L1MemoryManager._agentic_rocm_final_capacity_patch = True - - def _maybe_patch_lazy_memory_allocator() -> None: - module = sys.modules.get("lmcache.v1.lazy_memory_allocator") - if module is not None and hasattr(module, "LazyMemoryAllocator"): - _patch_lazy_memory_allocator(module) - - def _maybe_patch_l1_memory_manager() -> None: - module = sys.modules.get("lmcache.v1.distributed.memory_manager") - if module is not None and hasattr(module, "L1MemoryManager"): - _patch_l1_memory_manager(module) - - def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): - module = _orig_import(name, globals, locals, fromlist, level) - if name == "lmcache.v1.lazy_memory_allocator" or ( - name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules - ): - _maybe_patch_lazy_memory_allocator() - if name == "lmcache.v1.distributed.memory_manager" or ( - name.startswith("lmcache") - and "lmcache.v1.distributed.memory_manager" in sys.modules - ): - _maybe_patch_l1_memory_manager() - return module - - builtins.__import__ = _agentic_rocm_import - _maybe_patch_lazy_memory_allocator() - 
_maybe_patch_l1_memory_manager() - -if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": - import torch - import lmcache.non_cuda_equivalents as lmc - - if not hasattr(lmc, "multi_layer_block_kv_transfer"): - _DTYPE_BY_NAME = { - "bfloat16": torch.bfloat16, - "float16": torch.float16, - "float32": torch.float32, - } - - def _dtype_from_env() -> torch.dtype: - name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") - try: - return _DTYPE_BY_NAME[name] - except KeyError as exc: - raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc - - def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - block_stride = shape_desc.block_stride_elems or ( - shape_desc.bs * shape_desc.nh * shape_desc.hs - ) - base = lmc._tensor_from_ptr( - ptr, - (shape_desc.nb * block_stride,), - dtype, - device, - ) - return torch.as_strided( - base, - (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), - (block_stride, shape_desc.nh * shape_desc.hs, 1), - ) - - def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: - return lmc._tensor_from_ptr( - ptr, - (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), - dtype, - device, - ) - - def multi_layer_block_kv_transfer( - group_kv_pointers, - tmp_buffer_ptrs, - block_ids, - paged_memory_device, - direction, - shape_desc, - lmcache_chunk_size, - gpu_kv_format, - skip_blocks=0, - ) -> None: - # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with - # shape [num_blocks, block_size, hidden_size]. LMCache's Python - # fallback has no block-transfer entrypoint yet, so implement the - # same gather/scatter contract with torch indexing on ROCm. 
- if shape_desc.kv_size != 1: - raise NotImplementedError( - "ROCm LMCache MP block fallback currently supports MLA KV caches only" - ) - - dtype = _dtype_from_env() - device = ( - paged_memory_device - if isinstance(paged_memory_device, torch.device) - else torch.device(paged_memory_device) - ) - num_layers = int(group_kv_pointers.numel()) - blocks_per_chunk = lmcache_chunk_size // shape_desc.bs - direction_name = getattr(direction, "name", str(direction)) - - for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): - start = chunk_idx * blocks_per_chunk - end = start + blocks_per_chunk - chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) - - dest_slot_offset = 0 - if skip_blocks and chunk_idx == 0: - chunk_blocks = chunk_blocks[int(skip_blocks):] - dest_slot_offset = int(skip_blocks) * shape_desc.bs - if chunk_blocks.numel() == 0: - continue - - num_slots = int(chunk_blocks.numel()) * shape_desc.bs - tmp = _tmp_view( - int(tmp_ptr), - shape_desc, - num_layers, - lmcache_chunk_size, - dtype, - device, - ) - - for layer_idx in range(num_layers): - paged = _paged_view( - int(group_kv_pointers[layer_idx].item()), - shape_desc, - dtype, - device, - ) - tmp_slice = tmp[ - 0, - layer_idx, - dest_slot_offset : dest_slot_offset + num_slots, - :, - ] - if direction_name == "D2H": - gathered = paged.index_select(0, chunk_blocks).reshape( - num_slots, shape_desc.nh * shape_desc.hs - ) - tmp_slice.copy_(gathered) - elif direction_name == "H2D": - src = tmp_slice.reshape( - int(chunk_blocks.numel()), - shape_desc.bs, - shape_desc.nh * shape_desc.hs, - ) - paged.index_copy_(0, chunk_blocks, src) - else: - raise ValueError(f"Unsupported transfer direction: {direction}") - - lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer - -# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- -if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": - import chunked_connector_patch # noqa: F401 - -# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- -import scheduler_assertion_patch # noqa: F401 -PY -} - -write_chunked_connector_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/chunked_connector_patch.py" <<'PY' -""" -Monkey-patch for LMCacheMPConnector to add chunked KV loading. - -Fixes GPU block exhaustion deadlock at high concurrency by capping -the number of external tokens reported AND retrieved per scheduling step. - -Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this -module from sitecustomize.py before LMCache is loaded. -""" - -import logging -import os -import sys -import builtins - -logger = logging.getLogger("chunked_lmcache_patch") - -_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) - -# Per-request chunk tracking (module-level, survives across calls) -_chunk_state: dict[str, dict] = {} - - -def _apply_patch(): - """Patch LMCacheMPConnector in-place.""" - mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") - if mod is None: - return - cls = getattr(mod, "LMCacheMPConnector", None) - if cls is None or getattr(cls, "_chunked_patch_applied", False): - return - - LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) - _orig_get_matched = cls.get_num_new_matched_tokens - _orig_get_finished = cls.get_finished - - def _get_blocks_per_chunk(self): - block_size = getattr(self, "block_size", 1) - return max(1, _MAX_TOKENS // block_size) - - def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): - full_match = _orig_get_matched(self, request, num_computed_tokens) - if full_match <= 0 or _MAX_TOKENS <= 0: - return full_match - - req_id = request.request_id - block_size = getattr(self, "block_size", 1) - blocks_per_chunk = _get_blocks_per_chunk(self) - full_match_blocks = full_match // block_size - - state = _chunk_state.get(req_id) - if state is None or state.get("num_computed_at_start") != num_computed_tokens: - state = 
{ - "full_match_blocks": full_match_blocks, - "chunk_end_blocks": 0, - "num_computed_at_start": num_computed_tokens, - "lookup_done": False, - } - _chunk_state[req_id] = state - - if state["lookup_done"]: - return 0 - - remaining = state["full_match_blocks"] - state["chunk_end_blocks"] - if remaining <= 0: - state["lookup_done"] = True - return 0 - - this_chunk = min(remaining, blocks_per_chunk) - state["chunk_end_blocks"] += this_chunk - if state["chunk_end_blocks"] >= state["full_match_blocks"]: - state["lookup_done"] = True - - capped = this_chunk * block_size - if capped < full_match: - logger.debug( - "Chunked LMCache: req %s capped %d -> %d tokens " - "(chunk %d/%d blocks)", - req_id, full_match, capped, this_chunk, full_match_blocks, - ) - - # Cap the tracker's hit blocks to match what we report - tracker = getattr(request, "kv_transfer_params", None) - if tracker is not None: - orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) - if orig_hits > this_chunk: - tracker.num_lmcache_hit_blocks = this_chunk - - return capped - - def _patched_get_finished(self, scheduler_output): - result = _orig_get_finished(self, scheduler_output) - # Clean up chunk state for finished requests. - # vLLM passes scheduler_output as a set of request-ID strings - # (not a SchedulerOutput object), so iterate directly when it - # is a set/frozenset; fall back to the attribute path for - # forward compatibility. 
- if isinstance(scheduler_output, (set, frozenset)): - finished = scheduler_output - else: - finished = getattr(scheduler_output, "finished_req_ids", []) - for req in finished: - _chunk_state.pop(req, None) - return result - - cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens - cls.get_finished = _patched_get_finished - cls._chunked_patch_applied = True - logger.info( - "Chunked LMCache connector patch applied " - "(max_tokens_per_load=%d)", _MAX_TOKENS, - ) - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "lmcache.integration.vllm.lmcache_mp_connector" - or ( - name.startswith("lmcache") - and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -write_scheduler_assertion_patch() { - local patch_dir="$1" - mkdir -p "$patch_dir" - cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' -""" -Patch vLLM scheduler to handle stale finished_recving gracefully. - -The assertion at scheduler.py crashes when a KV transfer reports -"finished recving" but the request is already in RUNNING state. -This happens when transfers complete asynchronously and the scheduler -has already moved the request forward. - -Fix: Instead of asserting, log a warning and skip. 
-""" - -import logging -import sys -import builtins - -logger = logging.getLogger("scheduler_assertion_patch") - - -def _apply_patch(): - """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" - sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") - if sched_mod is None: - return - req_mod = sys.modules.get("vllm.v1.request") - if req_mod is None: - return - Scheduler = getattr(sched_mod, "Scheduler", None) - RequestStatus = getattr(req_mod, "RequestStatus", None) - if Scheduler is None or RequestStatus is None: - return - if getattr(Scheduler, "_kv_xfer_patch_applied", False): - return - - _orig_update = Scheduler._update_from_kv_xfer_finished - - def _patched_update(self, kv_connector_output): - if self.connector is not None: - self.connector.update_connector_output(kv_connector_output) - for req_id in kv_connector_output.finished_recving or (): - if req_id not in self.requests: - continue - req = self.requests[req_id] - if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: - self.finished_recving_kv_req_ids.add(req_id) - elif RequestStatus.is_finished(req.status): - self._free_blocks(self.requests[req_id]) - else: - logger.warning( - "Stale finished_recving for req %s in status %s; skipping.", - req_id, req.status.name, - ) - for req_id in kv_connector_output.finished_sending or (): - if req_id not in self.requests: - continue - self._free_blocks(self.requests[req_id]) - - Scheduler._update_from_kv_xfer_finished = _patched_update - Scheduler._kv_xfer_patch_applied = True - logger.info("Scheduler KV transfer assertion patch applied") - - -_orig_import = builtins.__import__ - - -def _patching_import(name, *args, **kwargs): - module = _orig_import(name, *args, **kwargs) - if ( - name == "vllm.v1.core.sched.scheduler" - or ( - name.startswith("vllm") - and "vllm.v1.core.sched.scheduler" in sys.modules - ) - ): - _apply_patch() - return module - - -builtins.__import__ = _patching_import -_apply_patch() -PY -} - -# Workaround for MEC FW <177 RCCL memory 
reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -591,6 +52,8 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -648,7 +111,9 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -659,7 +124,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -667,74 +132,20 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - agentic_pip_install --quiet --no-cache-dir lmcache - # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and - # CuPy packages on ROCm. vLLM 0.21.0 treats ROCm as "cuda-like", and - # during Kimi fused-MoE model inspection it imports nixl_ep whenever - # that module is importable, even when this run is not using EP/NIXL - # kernels. 
The CUDA extension then fails immediately on AMD nodes with - # "ImportError: libcuda.so.1". - # - # LMCache MP also uses CuPy stream APIs while registering vLLM's KV - # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime - # with cudaErrorInsufficientDriver when LMCache touches the stream. Use - # the ROCm 7 CuPy wheel so the same API dispatches through HIP. - python3 -m pip uninstall -y \ - nixl nixl-cu12 nixl-cu13 nixl_ep \ - >/dev/null 2>&1 || true - python3 -m pip uninstall -y \ - cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ - >/dev/null 2>&1 || true - agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 - python3 - <<'PY' -import importlib.util -import sys - -spec = importlib.util.find_spec("nixl_ep") -if spec is not None: - locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) - print( - "Error: nixl_ep is still importable after LMCache install; " - "this ROCm Kimi run would import a CUDA-only nixl_ep module. " - f"location={locations}", - file=sys.stderr, - ) - sys.exit(1) - -try: - from cupy_backends.cuda.api import runtime as cupy_runtime -except Exception as exc: - print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) - sys.exit(1) - -if not getattr(cupy_runtime, "is_hip", False): - print( - "Error: CuPy is still using the CUDA backend after installing " - "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", - file=sys.stderr, - ) - sys.exit(1) -PY - LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" - write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" - write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" - write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" - export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 - export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 - export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 - # Cap external KV tokens loaded per scheduling step to prevent GPU - # block exhaustion deadlock at high concurrency (c>=32). 
Default - # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to - # disable chunking (only safe at low concurrency). - export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" - export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. - TOTAL_CPU_DRAM_GB=2500 + #TODO: fix + TOTAL_CPU_DRAM_GB=3000 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -742,7 +153,7 @@ PY # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -750,10 +161,11 @@ PY # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. 
- LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -786,6 +198,7 @@ PY *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac +# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -794,17 +207,34 @@ fi echo "Starting vllm server..." export PYTHONNOUSERSITE=1 +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL_PATH" --served-model-name "$MODEL" + vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 + --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code - --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -821,4 +251,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file 
diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh new file mode 100755 index 000000000..f36fc59e9 --- /dev/null +++ b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh @@ -0,0 +1,256 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR +# +# OFFLOADING values: +# none - vLLM GPU KV only. +# cpu - vLLM native CPU offload. +# lmcache - LMCache MP server + vLLM LMCacheMPConnector. + +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION + +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# ROCR/HIP visibility for vLLM 0.14+ +if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 +# corpus has requests up to ~1M proxy tokens that would be rejected. +# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" +mkdir -p "$RESULT_DIR" + +OFFLOAD_ARGS=() +PREFIX_CACHE_ARGS=() + +# ---- Lmcache config ---------------------------------------------------------- +LMCACHE_PID="" + +cleanup_lmcache_server() { + if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then + kill "$LMCACHE_PID" 2>/dev/null || true + wait "$LMCACHE_PID" 2>/dev/null || true + fi +} + +trap cleanup_lmcache_server EXIT + +wait_for_lmcache_ready() { + { set +x; } 2>/dev/null + local attempts="${LMCACHE_READY_ATTEMPTS:-120}" + local tail_pid="" + + while [ ! -f "$LMCACHE_LOG" ]; do + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before creating log file. Exiting." >&2 + exit 1 + fi + sleep 1 + done + + tail -f -n +1 "$LMCACHE_LOG" & + tail_pid=$! + + for ((i = 1; i <= attempts; i++)); do + if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + return 0 + fi + if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then + echo "LMCache server died before becoming healthy. Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 + fi + sleep 1 + done + + echo "Timed out waiting for LMCache server healthcheck. 
Log follows:" >&2 + kill "$tail_pid" 2>/dev/null || true + wait "$tail_pid" 2>/dev/null || true + cat "$LMCACHE_LOG" >&2 || true + exit 1 +} + +case "$OFFLOADING" in + none) ;; + cpu) + unset VLLM_USE_SIMPLE_KV_OFFLOAD + # MI355X nodes have ~2.7 TiB of host DRAM available for offload; + # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for + # worker RSS / page cache / slurm cgroup). + TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + # Use vLLM's regular native KV-offload path (OffloadingConnector), + # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to + # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 + # would switch it to SimpleCPUOffloadConnector. We intentionally leave + # that env var UNSET here so the regular OffloadingConnector path is + # used. The shortcut --kv_offloading_backend native + --kv_offloading_size + # form constructs the KVTransferConfig at engine startup + # (vllm/config/vllm.py:662). + + # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) + # This gives extra cache hit than disabling hybrid kv cache manager + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv_offloading_backend native + --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --disable-hybrid-kv-cache-manager + ) + ;; + lmcache) + { set +x; } 2>/dev/null + unset VLLM_USE_SIMPLE_KV_OFFLOAD + + git clone https://github.com/LMCache/LMCache.git + cd LMCache + pip install -r requirements/build.txt + CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation + cd .. 
+ + python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null + + # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV + # pool, but let the external MP server own that pool so vLLM does not + # split --kv-offloading-size across TP ranks through the integrated + # LMCache backend. + TOTAL_CPU_DRAM_GB=3000 + LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" + LMCACHE_PORT="${LMCACHE_PORT:-5555}" + LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" + # LMCacheMPConnector concatenates lmcache.mp.host and port into the + # ZMQ endpoint. Bind the server to a raw host, but pass the connector a + # ZMQ-style host string. + LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" + # LMCache read locks are leases on chunks that lookup has promised + # vLLM can retrieve. The default 300s TTL is too short for this + # long-context agentic queue: TP8/conc32 can spend >300s between + # lookup and retrieve while GPU KV is saturated, which leaves the + # object present in L1 but no longer readable. Keep the 2.5 TB pool + # size unchanged and only extend the lookup-to-retrieve lease. + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" + LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" + export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" + export LMCACHE_BLOCKING_TIMEOUT_SECS=120 + + set -x + echo "Starting LMCache MP server..." 
+ LMCACHE_CMD=( + lmcache server + --host "$LMCACHE_HOST" + --port "$LMCACHE_PORT" + --http-host "$LMCACHE_HOST" + --http-port "$LMCACHE_HTTP_PORT" + --l1-size-gb "$LMCACHE_L1_SIZE_GB" + --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" + --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" + --chunk-size "$LMCACHE_CHUNK_SIZE" + --max-workers "$LMCACHE_MAX_WORKERS" + --eviction-policy LRU + ) + printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" + printf '\n' >> "$RESULT_DIR/lmcache_command.txt" + "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & + LMCACHE_PID=$! + echo "LMCache server PID: $LMCACHE_PID" + wait_for_lmcache_ready + + PREFIX_CACHE_ARGS=(--enable-prefix-caching) + # srok, + # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma + # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 + OFFLOAD_ARGS=( + --kv-transfer-config + "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" + --disable-hybrid-kv-cache-manager + ) + ;; + *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; +esac + +# ---- LLM server config ---------------------------------------------------------- +EP_ARGS=() +if [ "$EP_SIZE" -gt 1 ]; then + EP_ARGS=(--enable-expert-parallel) +fi + +echo "Starting vllm server..." 
+export PYTHONNOUSERSITE=1 + +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install -q amd-quark + +# Workaround for MEC FW <177 RCCL memory reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +{ set +x; } 2>/dev/null +VLLM_CMD=( + vllm serve "$MODEL" + --host 0.0.0.0 + --port "$PORT" + --tensor-parallel-size="$TP" + "${EP_ARGS[@]}" + --gpu-memory-utilization 0.95 + --kv-cache-dtype fp8 \ + --block-size=32 + --trust-remote-code + --attention-backend "ROCM_AITER_FA" + --max-num-seqs "$CONC" + "${PREFIX_CACHE_ARGS[@]}" + "${OFFLOAD_ARGS[@]}" +) +printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" +printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" +"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! +echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index ff901b674..656e924dc 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -2,51 +2,117 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. +# +# Base server recipe follows the upstream MI300X reference +# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): +# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. 
+# The agentic harness (resolve_trace_source / build_replay_cmd / +# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and +# --disable-radix-cache is dropped because agentic replay needs prefix reuse. # # Required env vars: -# MODEL, TP, CONC, RESULT_DIR +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV with the default RadixAttention prefix cache. +# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi +PORT=${PORT:-8888} +DURATION=${DURATION:-1800} +EP_SIZE=${EP_SIZE:-1} + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi +if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi rocm-smi || true amd-smi || true +# ---- Resolve traces and install deps ---------------------------------------- +# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the +# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf +# signal at high concurrency. 
+#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k +#060226 +export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k + # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# ---- Start SGLang server ---------------------------------------------------- +# ---- Cache / offload config ------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per + # TP rank (one hierarchical KV, one hierarchical Mamba), so the + # node-total DRAM budget divides by TP and the host-pool count. + TOTAL_CPU_DRAM_GB=3000 + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which + # requires page_size=1. Keep the safer direct/layer_first copy path; + # kernel/page_first faults on first prefill in this mode on ROCm. 
+ HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness but SGLang's internal warmup + # request can time out on this path; let aiperf own benchmark traffic. + WARMUP_ARGS=(--skip-server-warmup) + # Don't force ROCm graph capture at every high concurrency point; conc=16 + # is the highest known-good capture size for this model/server path. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ + --model-path $MODEL \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ @@ -56,10 +122,10 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --max-prefill-tokens 32768 \ - --scheduler-recv-interval 30 \ + --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --mem-fraction-static 0.8 \ - --context-length $MAX_MODEL_LEN \ + "${CACHE_ARGS[@]}" \ + "${WARMUP_ARGS[@]}" \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -69,4 +135,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" +run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh deleted file mode 100755 index cdded8860..000000000 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. -# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# OFFLOADING values: -# none - SGLang GPU KV only with radix cache disabled. -# hicache - SGLang HiCache with local CPU hierarchical cache. 
- -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE - -SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} -if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then - MAX_MODEL_LEN=131072 -fi - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# `hf download` creates the target dir if missing and is itself idempotent. -# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE -# Either way, MODEL_PATH is what the server is launched with. -if [[ -n "${MODEL_PATH:-}" ]]; then - if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then - hf download "$MODEL" --local-dir "$MODEL_PATH" - fi -else - hf download "$MODEL" - export MODEL_PATH="$MODEL" -fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -mkdir -p "$RESULT_DIR" - -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid - # GDN/Mamba path allocates two HiCache host pools per TP rank: one for - # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB - # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per - # host pool, not 250 GB. Keep overrides for one-off tuning. 
- TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" - HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" - HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on - # MI355X, which requires page_size=1. The kernel/page_first HiCache - # transfer path faults on first prefill in this mode on ROCm, so keep - # the default on the safer direct/layer_first copy path. These remain - # env-overridable for future SGLang/ROCm fixes. - HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - # SGLang --hicache-size is per rank per host pool, while the workflow - # input is a node-total DRAM budget. Divide by TP and the number of - # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. - HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" - if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then - HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" - fi - if [ "$HICACHE_SIZE_GB" -lt 1 ]; then - echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 - exit 1 - fi - echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" - CACHE_ARGS=( - --page-size "$HICACHE_PAGE_SIZE" - --enable-hierarchical-cache - --hicache-size "$HICACHE_SIZE_GB" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - ) - # HiCache startup reaches API readiness, but SGLang's internal warmup - # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf - # own benchmark traffic instead of blocking server readiness on it. - WARMUP_ARGS=(--skip-server-warmup) - # Keep request concurrency as the swept variable, but do not force - # HiCache runs to capture ROCm graphs at every high concurrency point. - # The conc=32 HiCache job crashed after startup readiness, before any - # aiperf traffic, while conc=16 is the highest known-good capture size - # for this model/server path. Requests above the capture size can still - # run; they just do not require a larger captured graph at startup. - HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" - if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then - CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" - fi - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - -echo "Starting SGLang server..." -export PYTHONNOUSERSITE=1 - -{ set +x; } 2>/dev/null -SGLANG_CMD=( - python3 -m sglang.launch_server - --attention-backend triton - --model-path "$MODEL_PATH" --served-model-name "$MODEL" - --host=0.0.0.0 - --port "$PORT" - --tensor-parallel-size "$TP" - --ep-size "$EP_SIZE" - --trust-remote-code - --tokenizer-worker-num 6 - --enable-aiter-allreduce-fusion - --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" - --max-running-requests "$CONC" - --max-prefill-tokens 32768 - --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" - --mem-fraction-static 0.8 - --context-length "$MAX_MODEL_LEN" - --enable-metrics - "${CACHE_ARGS[@]}" - "${WARMUP_ARGS[@]}" -) -printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" -printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" -"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! 
-echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" From 574d8914475ba7d5f8cc0ec9d17ea79aca03e95d Mon Sep 17 00:00:00 2001 From: seungrokj Date: Wed, 3 Jun 2026 14:46:29 +0900 Subject: [PATCH 15/15] Revert "[AMD] agentx-v0.4: add MiniMax agentic script, refactor Kimi/Qwen scripts" and "[AMD] agentx-v0.4: add MiniMax/Kimi lmcache agentic entries, update Qwen hicache config" Co-Authored-By: Claude Sonnet 4.6 --- .github/configs/amd-master.yaml | 35 +- .../agentic/kimik2.5_fp4_mi355x.sh | 674 ++++++++++++++++-- .../agentic/minimaxm2.5_fp4_mi355x.sh | 256 ------- .../single_node/agentic/qwen3.5_fp8_mi355x.sh | 112 +-- .../agentic/qwen3.5_fp8_mi355x_sglang.sh | 152 ++++ 5 files changed, 804 insertions(+), 425 deletions(-) delete mode 100755 benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh create mode 100755 benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 134af929a..7f1c8192d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -872,21 +872,6 @@ minimaxm2.5-fp4-mi355x-atom: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } -minimaxm2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/MiniMax-M2.5-MXFP4 - model-prefix: minimaxm2.5 - runner: mi355x - precision: fp4 - framework: vllm - multinode: false - scenarios: - agentic-coding: - - duration: 1800 - search-space: - - { tp: 1, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48] } - - { tp: 1, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48] } - minimaxm2.5-fp4-mi355x-vllm: image: vllm/vllm-openai-rocm:v0.22.0 model: amd/MiniMax-M2.5-MXFP4 @@ -2533,16 +2518,6 
@@ kimik2.5-fp4-mi355x-vllm-agentic: - { tp: 4, offloading: none, conc-list: [16, 24, 32, 40] } - { tp: 4, offloading: cpu, conc-list: [16, 24, 32, 40] } -kimik2.5-fp4-mi355x-vllm-agentic-lmcache: - image: vllm/vllm-openai-rocm:v0.22.0 - model: amd/Kimi-K2.5-MXFP4 - model-prefix: kimik2.5 - agentic-coding: - - duration: 1800 - search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } - - { tp: 4, ep: 1, offloading: lmcache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 72] } - minimaxm2.5-fp8-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 model: MiniMaxAI/MiniMax-M2.5 @@ -2599,15 +2574,19 @@ minimaxm2.5-fp8-mi325x-vllm-agentic: - { tp: 4, offloading: cpu, conc-list: [16, 20, 24, 28, 32] } qwen3.5-fp8-mi355x-sglang-agentic-hicache: - image: lmsysorg/sglang-rocm:v0.5.12.post1-rocm720-mi35x-20260531 + image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260521 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x + precision: fp8 + framework: sglang + multinode: false + scenarios: agentic-coding: - duration: 1800 search-space: - - { tp: 4, ep: 1, offloading: none, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } - - { tp: 4, ep: 1, offloading: hicache, conc-list: [4, 8, 16, 32, 40, 48, 56, 64, 128] } + - { tp: 8, ep: 1, offloading: none, conc-list: [1, 2, 4, 8, 16, 32] } + - { tp: 8, ep: 1, offloading: hicache, conc-list: [16, 32, 48, 64] } dsv4-fp4-mi355x-vllm-agentic: image: vllm/vllm-openai-rocm:v0.22.0 diff --git a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh index d05b27253..139b12256 100755 --- a/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh +++ b/benchmarks/single_node/agentic/kimik2.5_fp4_mi355x.sh @@ -14,11 +14,15 @@ set -x source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} 
-EP_SIZE=${EP_SIZE:-1} +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +# Kimi-K2.5 advertises a 262144-token context window in vLLM 0.21.0. +# Matrix defaults may export MAX_MODEL_LEN=0 to mean "server default"; for this +# script we need the concrete value so AgentX filters prompt+max_tokens against +# the same limit vLLM enforces. +if [[ -z "${MAX_MODEL_LEN:-}" || "$MAX_MODEL_LEN" == "0" ]]; then + MAX_MODEL_LEN=262144 +fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" @@ -29,22 +33,557 @@ if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true -# ---- Resolve traces and install deps ---------------------------------------- -# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the -# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf -# signal at high concurrency. 
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps +# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) +pip install amd-quark + +# Disable AITER RMSNorm for TP < 8 due to accuracy issues +if [ "${TP}" -lt 8 ]; then + export VLLM_ROCM_USE_AITER_RMSNORM=0 +fi + +write_lmcache_rocm_mp_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/sitecustomize.py" <<'PY' +"""Runtime compatibility for LMCache MP on ROCm Kimi MLA KV caches.""" + +import os +import threading + +if os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR") == "1": + import builtins + import sys + + _orig_import = builtins.__import__ + + def _patch_lazy_memory_allocator(_lazy_memory_allocator) -> None: + _LazyMemoryAllocator = _lazy_memory_allocator.LazyMemoryAllocator + + if getattr(_LazyMemoryAllocator, "_agentic_rocm_demand_patch", False): + return + + _orig_init = _LazyMemoryAllocator.__init__ + _orig_allocate = _LazyMemoryAllocator.allocate + _orig_batched_allocate = _LazyMemoryAllocator.batched_allocate + + def _expand_to(self, target_size: int) -> None: + target_size = min( + self._final_size, + _lazy_memory_allocator.align_to(target_size, self.PIN_CHUNK_SIZE), + ) + lock = self._agentic_rocm_demand_expand_lock + with lock: + if target_size <= self._curr_size: + return + + start_size = self._curr_size + while self._curr_size < target_size: + commit_start = self._curr_size + commit_target = min(target_size, self._curr_size + self.COMMIT_SIZE) + while self._curr_size < commit_target: + self._pin_memory_chunk(self._curr_size, self.PIN_CHUNK_SIZE) + self._curr_size += self.PIN_CHUNK_SIZE + self._commit_expansion(self._curr_size - commit_start) + + self._log_expansion_progress(self._curr_size - start_size) + + def 
_retry_with_demand_expansion(self, allocate_once): + obj = allocate_once() + step_gb = float(os.environ.get("LMCACHE_ROCM_DEMAND_PINNED_STEP_GB", "64")) + step_bytes = max(self.COMMIT_SIZE, int(step_gb * (1024**3))) + + while obj is None and self._curr_size < self._final_size: + _expand_to(self, self._curr_size + step_bytes) + obj = allocate_once() + + return obj + + def _patched_init(self, *args, **kwargs): + _orig_init(self, *args, **kwargs) + self._agentic_rocm_demand_expand_lock = threading.Lock() + + # LMCache MP's upstream LazyMemoryAllocator currently expands to + # the final pinned size in a background thread. On ROCm Kimi TP4, + # vLLM reaches KV-cache registration only after that 2.5 TB pool + # is fully pinned, and the server-side IPC open path can stall + # before acknowledging register_kv_caches. Keep the same final + # capacity, but pin/commit extra host memory only when L1 + # allocations actually need it. + self._stop_expand.set() + self._expand_thread.join() + _lazy_memory_allocator.logger.info( + "Agentic ROCm patch: using demand-driven LMCache pinned " + "memory expansion; final capacity remains %s MB", + self._final_size >> 20, + ) + + def _patched_allocate( + self, + shapes, + dtypes, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_allocate(self, shapes, dtypes, fmt, allocator_type), + ) + + def _patched_batched_allocate( + self, + shapes, + dtypes, + batch_size, + fmt=_lazy_memory_allocator.MemoryFormat.UNDEFINED, + allocator_type=None, + ): + return _retry_with_demand_expansion( + self, + lambda: _orig_batched_allocate( + self, shapes, dtypes, batch_size, fmt, allocator_type + ), + ) + + _LazyMemoryAllocator.__init__ = _patched_init + _LazyMemoryAllocator.allocate = _patched_allocate + _LazyMemoryAllocator.batched_allocate = _patched_batched_allocate + _LazyMemoryAllocator._agentic_rocm_demand_patch = True + + def 
_patch_l1_memory_manager(_memory_manager) -> None: + _L1MemoryManager = getattr(_memory_manager, "L1MemoryManager", None) + _LazyMemoryAllocator = getattr(_memory_manager, "LazyMemoryAllocator", None) + if _L1MemoryManager is None or _LazyMemoryAllocator is None: + return + if getattr(_L1MemoryManager, "_agentic_rocm_final_capacity_patch", False): + return + + _orig_get_memory_usage = _L1MemoryManager.get_memory_usage + + def _patched_get_memory_usage(self): + allocator = getattr(self, "_allocator", None) + if isinstance(allocator, _LazyMemoryAllocator): + address_manager = allocator.get_address_manager() + used_size = ( + address_manager.get_heap_size() - address_manager.get_free_size() + ) + return used_size, allocator._final_size + return _orig_get_memory_usage(self) + + _L1MemoryManager.get_memory_usage = _patched_get_memory_usage + _L1MemoryManager._agentic_rocm_final_capacity_patch = True + + def _maybe_patch_lazy_memory_allocator() -> None: + module = sys.modules.get("lmcache.v1.lazy_memory_allocator") + if module is not None and hasattr(module, "LazyMemoryAllocator"): + _patch_lazy_memory_allocator(module) + + def _maybe_patch_l1_memory_manager() -> None: + module = sys.modules.get("lmcache.v1.distributed.memory_manager") + if module is not None and hasattr(module, "L1MemoryManager"): + _patch_l1_memory_manager(module) + + def _agentic_rocm_import(name, globals=None, locals=None, fromlist=(), level=0): + module = _orig_import(name, globals, locals, fromlist, level) + if name == "lmcache.v1.lazy_memory_allocator" or ( + name.startswith("lmcache") and "lmcache.v1.lazy_memory_allocator" in sys.modules + ): + _maybe_patch_lazy_memory_allocator() + if name == "lmcache.v1.distributed.memory_manager" or ( + name.startswith("lmcache") + and "lmcache.v1.distributed.memory_manager" in sys.modules + ): + _maybe_patch_l1_memory_manager() + return module + + builtins.__import__ = _agentic_rocm_import + _maybe_patch_lazy_memory_allocator() + 
_maybe_patch_l1_memory_manager() + +if os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK") == "1": + import torch + import lmcache.non_cuda_equivalents as lmc + + if not hasattr(lmc, "multi_layer_block_kv_transfer"): + _DTYPE_BY_NAME = { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "float32": torch.float32, + } + + def _dtype_from_env() -> torch.dtype: + name = os.environ.get("LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE", "bfloat16") + try: + return _DTYPE_BY_NAME[name] + except KeyError as exc: + raise ValueError(f"Unsupported LMCache ROCm fallback dtype: {name}") from exc + + def _paged_view(ptr: int, shape_desc, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + block_stride = shape_desc.block_stride_elems or ( + shape_desc.bs * shape_desc.nh * shape_desc.hs + ) + base = lmc._tensor_from_ptr( + ptr, + (shape_desc.nb * block_stride,), + dtype, + device, + ) + return torch.as_strided( + base, + (shape_desc.nb, shape_desc.bs, shape_desc.nh * shape_desc.hs), + (block_stride, shape_desc.nh * shape_desc.hs, 1), + ) + + def _tmp_view(ptr: int, shape_desc, num_layers: int, chunk_slots: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + return lmc._tensor_from_ptr( + ptr, + (shape_desc.kv_size, num_layers, chunk_slots, shape_desc.nh * shape_desc.hs), + dtype, + device, + ) + + def multi_layer_block_kv_transfer( + group_kv_pointers, + tmp_buffer_ptrs, + block_ids, + paged_memory_device, + direction, + shape_desc, + lmcache_chunk_size, + gpu_kv_format, + skip_blocks=0, + ) -> None: + # Kimi K2.5 uses vLLM MLA: one KV tensor per layer with + # shape [num_blocks, block_size, hidden_size]. LMCache's Python + # fallback has no block-transfer entrypoint yet, so implement the + # same gather/scatter contract with torch indexing on ROCm. 
+ if shape_desc.kv_size != 1: + raise NotImplementedError( + "ROCm LMCache MP block fallback currently supports MLA KV caches only" + ) + + dtype = _dtype_from_env() + device = ( + paged_memory_device + if isinstance(paged_memory_device, torch.device) + else torch.device(paged_memory_device) + ) + num_layers = int(group_kv_pointers.numel()) + blocks_per_chunk = lmcache_chunk_size // shape_desc.bs + direction_name = getattr(direction, "name", str(direction)) + + for chunk_idx, tmp_ptr in enumerate(tmp_buffer_ptrs): + start = chunk_idx * blocks_per_chunk + end = start + blocks_per_chunk + chunk_blocks = block_ids[start:end].to(device=device, dtype=torch.long) + + dest_slot_offset = 0 + if skip_blocks and chunk_idx == 0: + chunk_blocks = chunk_blocks[int(skip_blocks):] + dest_slot_offset = int(skip_blocks) * shape_desc.bs + if chunk_blocks.numel() == 0: + continue + + num_slots = int(chunk_blocks.numel()) * shape_desc.bs + tmp = _tmp_view( + int(tmp_ptr), + shape_desc, + num_layers, + lmcache_chunk_size, + dtype, + device, + ) + + for layer_idx in range(num_layers): + paged = _paged_view( + int(group_kv_pointers[layer_idx].item()), + shape_desc, + dtype, + device, + ) + tmp_slice = tmp[ + 0, + layer_idx, + dest_slot_offset : dest_slot_offset + num_slots, + :, + ] + if direction_name == "D2H": + gathered = paged.index_select(0, chunk_blocks).reshape( + num_slots, shape_desc.nh * shape_desc.hs + ) + tmp_slice.copy_(gathered) + elif direction_name == "H2D": + src = tmp_slice.reshape( + int(chunk_blocks.numel()), + shape_desc.bs, + shape_desc.nh * shape_desc.hs, + ) + paged.index_copy_(0, chunk_blocks, src) + else: + raise ValueError(f"Unsupported transfer direction: {direction}") + + lmc.multi_layer_block_kv_transfer = multi_layer_block_kv_transfer + +# ---- Chunked KV loading (prevents GPU block exhaustion at high concurrency) ---- +if os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "0") != "0": + import chunked_connector_patch # noqa: F401 + +# ---- vLLM 
scheduler assertion fix (stale KV transfer notifications) ---- +import scheduler_assertion_patch # noqa: F401 +PY +} + +write_chunked_connector_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/chunked_connector_patch.py" <<'PY' +""" +Monkey-patch for LMCacheMPConnector to add chunked KV loading. + +Fixes GPU block exhaustion deadlock at high concurrency by capping +the number of external tokens reported AND retrieved per scheduling step. + +Usage: set CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD= and import this +module from sitecustomize.py before LMCache is loaded. +""" + +import logging +import os +import sys +import builtins + +logger = logging.getLogger("chunked_lmcache_patch") + +_MAX_TOKENS = int(os.environ.get("CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD", "32768")) + +# Per-request chunk tracking (module-level, survives across calls) +_chunk_state: dict[str, dict] = {} + + +def _apply_patch(): + """Patch LMCacheMPConnector in-place.""" + mod = sys.modules.get("lmcache.integration.vllm.lmcache_mp_connector") + if mod is None: + return + cls = getattr(mod, "LMCacheMPConnector", None) + if cls is None or getattr(cls, "_chunked_patch_applied", False): + return + + LMCacheMPRequestState = getattr(mod, "LMCacheMPRequestState", None) + _orig_get_matched = cls.get_num_new_matched_tokens + _orig_get_finished = cls.get_finished + + def _get_blocks_per_chunk(self): + block_size = getattr(self, "block_size", 1) + return max(1, _MAX_TOKENS // block_size) + + def _patched_get_num_new_matched_tokens(self, request, num_computed_tokens): + full_match = _orig_get_matched(self, request, num_computed_tokens) + if full_match <= 0 or _MAX_TOKENS <= 0: + return full_match + + req_id = request.request_id + block_size = getattr(self, "block_size", 1) + blocks_per_chunk = _get_blocks_per_chunk(self) + full_match_blocks = full_match // block_size + + state = _chunk_state.get(req_id) + if state is None or state.get("num_computed_at_start") != num_computed_tokens: + state = 
{ + "full_match_blocks": full_match_blocks, + "chunk_end_blocks": 0, + "num_computed_at_start": num_computed_tokens, + "lookup_done": False, + } + _chunk_state[req_id] = state + + if state["lookup_done"]: + return 0 + + remaining = state["full_match_blocks"] - state["chunk_end_blocks"] + if remaining <= 0: + state["lookup_done"] = True + return 0 + + this_chunk = min(remaining, blocks_per_chunk) + state["chunk_end_blocks"] += this_chunk + if state["chunk_end_blocks"] >= state["full_match_blocks"]: + state["lookup_done"] = True + + capped = this_chunk * block_size + if capped < full_match: + logger.debug( + "Chunked LMCache: req %s capped %d -> %d tokens " + "(chunk %d/%d blocks)", + req_id, full_match, capped, this_chunk, full_match_blocks, + ) + + # Cap the tracker's hit blocks to match what we report + tracker = getattr(request, "kv_transfer_params", None) + if tracker is not None: + orig_hits = getattr(tracker, "num_lmcache_hit_blocks", 0) + if orig_hits > this_chunk: + tracker.num_lmcache_hit_blocks = this_chunk + + return capped + + def _patched_get_finished(self, scheduler_output): + result = _orig_get_finished(self, scheduler_output) + # Clean up chunk state for finished requests. + # vLLM passes scheduler_output as a set of request-ID strings + # (not a SchedulerOutput object), so iterate directly when it + # is a set/frozenset; fall back to the attribute path for + # forward compatibility. 
+ if isinstance(scheduler_output, (set, frozenset)): + finished = scheduler_output + else: + finished = getattr(scheduler_output, "finished_req_ids", []) + for req in finished: + _chunk_state.pop(req, None) + return result + + cls.get_num_new_matched_tokens = _patched_get_num_new_matched_tokens + cls.get_finished = _patched_get_finished + cls._chunked_patch_applied = True + logger.info( + "Chunked LMCache connector patch applied " + "(max_tokens_per_load=%d)", _MAX_TOKENS, + ) + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "lmcache.integration.vllm.lmcache_mp_connector" + or ( + name.startswith("lmcache") + and "lmcache.integration.vllm.lmcache_mp_connector" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + +write_scheduler_assertion_patch() { + local patch_dir="$1" + mkdir -p "$patch_dir" + cat > "$patch_dir/scheduler_assertion_patch.py" <<'PY' +""" +Patch vLLM scheduler to handle stale finished_recving gracefully. + +The assertion at scheduler.py crashes when a KV transfer reports +"finished recving" but the request is already in RUNNING state. +This happens when transfers complete asynchronously and the scheduler +has already moved the request forward. + +Fix: Instead of asserting, log a warning and skip. 
+""" + +import logging +import sys +import builtins + +logger = logging.getLogger("scheduler_assertion_patch") + + +def _apply_patch(): + """Patch vLLM scheduler's _update_from_kv_xfer_finished.""" + sched_mod = sys.modules.get("vllm.v1.core.sched.scheduler") + if sched_mod is None: + return + req_mod = sys.modules.get("vllm.v1.request") + if req_mod is None: + return + Scheduler = getattr(sched_mod, "Scheduler", None) + RequestStatus = getattr(req_mod, "RequestStatus", None) + if Scheduler is None or RequestStatus is None: + return + if getattr(Scheduler, "_kv_xfer_patch_applied", False): + return + + _orig_update = Scheduler._update_from_kv_xfer_finished + + def _patched_update(self, kv_connector_output): + if self.connector is not None: + self.connector.update_connector_output(kv_connector_output) + for req_id in kv_connector_output.finished_recving or (): + if req_id not in self.requests: + continue + req = self.requests[req_id] + if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + self.finished_recving_kv_req_ids.add(req_id) + elif RequestStatus.is_finished(req.status): + self._free_blocks(self.requests[req_id]) + else: + logger.warning( + "Stale finished_recving for req %s in status %s; skipping.", + req_id, req.status.name, + ) + for req_id in kv_connector_output.finished_sending or (): + if req_id not in self.requests: + continue + self._free_blocks(self.requests[req_id]) + + Scheduler._update_from_kv_xfer_finished = _patched_update + Scheduler._kv_xfer_patch_applied = True + logger.info("Scheduler KV transfer assertion patch applied") + + +_orig_import = builtins.__import__ + + +def _patching_import(name, *args, **kwargs): + module = _orig_import(name, *args, **kwargs) + if ( + name == "vllm.v1.core.sched.scheduler" + or ( + name.startswith("vllm") + and "vllm.v1.core.sched.scheduler" in sys.modules + ) + ): + _apply_patch() + return module + + +builtins.__import__ = _patching_import +_apply_patch() +PY +} + +# Workaround for MEC FW <177 RCCL memory 
reclaim issue +version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') +if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + # ---- Server config ---------------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" @@ -52,8 +591,6 @@ mkdir -p "$RESULT_DIR" OFFLOAD_ARGS=() PREFIX_CACHE_ARGS=() - -# ---- Lmcache config ---------------------------------------------------------- LMCACHE_PID="" cleanup_lmcache_server() { @@ -111,9 +648,7 @@ case "$OFFLOADING" in # MI355X nodes have ~2.7 TiB of host DRAM available for offload; # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for # worker RSS / page cache / slurm cgroup). - #TODO: fix - TOTAL_CPU_DRAM_GB=3000 - TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + TOTAL_CPU_DRAM_GB=2500 # Use vLLM's regular native KV-offload path (OffloadingConnector), # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 @@ -124,7 +659,7 @@ case "$OFFLOADING" in # (vllm/config/vllm.py:662). OFFLOAD_ARGS=( --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" + --kv_offloading_size "$TOTAL_CPU_DRAM_GB" --disable-hybrid-kv-cache-manager ) ;; @@ -132,20 +667,74 @@ case "$OFFLOADING" in { set +x; } 2>/dev/null unset VLLM_USE_SIMPLE_KV_OFFLOAD - git clone https://github.com/LMCache/LMCache.git - cd LMCache - pip install -r requirements/build.txt - CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation - cd .. - + agentic_pip_install --quiet --no-cache-dir lmcache + # LMCache's current dependency chain can install NVIDIA/CUDA NIXL and + # CuPy packages on ROCm. 
vLLM 0.21.0 treats ROCm as "cuda-like", and + # during Kimi fused-MoE model inspection it imports nixl_ep whenever + # that module is importable, even when this run is not using EP/NIXL + # kernels. The CUDA extension then fails immediately on AMD nodes with + # "ImportError: libcuda.so.1". + # + # LMCache MP also uses CuPy stream APIs while registering vLLM's KV + # caches. The CUDA CuPy wheel imports on ROCm, but it fails at runtime + # with cudaErrorInsufficientDriver when LMCache touches the stream. Use + # the ROCm 7 CuPy wheel so the same API dispatches through HIP. + python3 -m pip uninstall -y \ + nixl nixl-cu12 nixl-cu13 nixl_ep \ + >/dev/null 2>&1 || true + python3 -m pip uninstall -y \ + cupy cupy-cuda11x cupy-cuda12x cupy-cuda13x \ + >/dev/null 2>&1 || true + agentic_pip_install --quiet --no-cache-dir cupy-rocm-7-0 + python3 - <<'PY' +import importlib.util +import sys + +spec = importlib.util.find_spec("nixl_ep") +if spec is not None: + locations = ", ".join(spec.submodule_search_locations or [spec.origin or "unknown"]) + print( + "Error: nixl_ep is still importable after LMCache install; " + "this ROCm Kimi run would import a CUDA-only nixl_ep module. 
" + f"location={locations}", + file=sys.stderr, + ) + sys.exit(1) + +try: + from cupy_backends.cuda.api import runtime as cupy_runtime +except Exception as exc: + print(f"Error: failed to import CuPy runtime after ROCm CuPy install: {exc}", file=sys.stderr) + sys.exit(1) + +if not getattr(cupy_runtime, "is_hip", False): + print( + "Error: CuPy is still using the CUDA backend after installing " + "cupy-rocm-7-0; LMCache MP would fail during KV-cache registration.", + file=sys.stderr, + ) + sys.exit(1) +PY + LMCACHE_ROCM_PATCH_DIR="$RESULT_DIR/lmcache_rocm_patch" + write_lmcache_rocm_mp_patch "$LMCACHE_ROCM_PATCH_DIR" + write_chunked_connector_patch "$LMCACHE_ROCM_PATCH_DIR" + write_scheduler_assertion_patch "$LMCACHE_ROCM_PATCH_DIR" + export LMCACHE_ROCM_MP_BLOCK_FALLBACK=1 + export LMCACHE_ROCM_MP_BLOCK_FALLBACK_DTYPE=bfloat16 + export LMCACHE_ROCM_DEMAND_PINNED_ALLOCATOR=1 + # Cap external KV tokens loaded per scheduling step to prevent GPU + # block exhaustion deadlock at high concurrency (c>=32). Default + # 32768 keeps peak block demand within the GPU KV pool. Set to 0 to + # disable chunking (only safe at low concurrency). + export CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD="${CHUNKED_LMCACHE_MAX_TOKENS_PER_LOAD:-32768}" + export PYTHONPATH="$LMCACHE_ROCM_PATCH_DIR${PYTHONPATH:+:$PYTHONPATH}" python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV # pool, but let the external MP server own that pool so vLLM does not # split --kv-offloading-size across TP ranks through the integrated # LMCache backend. - #TODO: fix - TOTAL_CPU_DRAM_GB=3000 + TOTAL_CPU_DRAM_GB=2500 LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" LMCACHE_PORT="${LMCACHE_PORT:-5555}" LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" @@ -153,7 +742,7 @@ case "$OFFLOADING" in # ZMQ endpoint. Bind the server to a raw host, but pass the connector a # ZMQ-style host string. 
LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" + LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$TOTAL_CPU_DRAM_GB}" LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" # LMCache read locks are leases on chunks that lookup has promised # vLLM can retrieve. The default 300s TTL is too short for this @@ -161,11 +750,10 @@ case "$OFFLOADING" in # lookup and retrieve while GPU KV is saturated, which leaves the # object present in L1 but no longer readable. Keep the 2.5 TB pool # size unchanged and only extend the lookup-to-retrieve lease. - LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" + LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}" LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" - export LMCACHE_BLOCKING_TIMEOUT_SECS=120 echo "Starting LMCache MP server..." LMCACHE_CMD=( @@ -198,7 +786,6 @@ case "$OFFLOADING" in *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; esac -# ---- LLM server config ---------------------------------------------------------- EP_ARGS=() if [ "$EP_SIZE" -gt 1 ]; then EP_ARGS=(--enable-expert-parallel) @@ -207,34 +794,17 @@ fi echo "Starting vllm server..." 
export PYTHONNOUSERSITE=1 -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install amd-quark - -# Disable AITER RMSNorm for TP < 8 due to accuracy issues -if [ "${TP}" -lt 8 ]; then - export VLLM_ROCM_USE_AITER_RMSNORM=0 -fi - -# Workaround for MEC FW <177 RCCL memory reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - { set +x; } 2>/dev/null VLLM_CMD=( - vllm serve "$MODEL" + vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" --tensor-parallel-size="$TP" "${EP_ARGS[@]}" --gpu-memory-utilization 0.90 - --kv-cache-dtype fp8 \ --block-size=1 --trust-remote-code + --max-model-len "$MAX_MODEL_LEN" --max-num-seqs "$CONC" --mm-encoder-tp-mode data "${PREFIX_CACHE_ARGS[@]}" @@ -251,4 +821,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh b/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh deleted file mode 100755 index f36fc59e9..000000000 --- a/benchmarks/single_node/agentic/minimaxm2.5_fp4_mi355x.sh +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -set -x - -# Agentic trace replay benchmark for Kimi-K2.5 FP4 on MI355X using vLLM. -# -# Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR -# -# OFFLOADING values: -# none - vLLM GPU KV only. -# cpu - vLLM native CPU offload. -# lmcache - LMCache MP server + vLLM LMCacheMPConnector. 
- -source "$(dirname "$0")/../../benchmark_lib.sh" - -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION - -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -EP_SIZE=${EP_SIZE:-1} - -if [[ -n "${SLURM_JOB_ID:-}" ]]; then - echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" -fi - -# ROCR/HIP visibility for vLLM 0.14+ -if [ -n "${ROCR_VISIBLE_DEVICES:-}" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi -rocm-smi || true -amd-smi || true - -# ---- Resolve traces and install deps ---------------------------------------- -# MiniMax-M2.5 servers run at max_model_len ~256k; the unfiltered 052726 -# corpus has requests up to ~1M proxy tokens that would be rejected. -# Switch to the 256k-capped variant (470 traces, max in+out <= 256k). -#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - -resolve_trace_source -install_agentic_deps - -# ---- Server config ---------------------------------------------------------- -SERVER_LOG="$RESULT_DIR/server.log" -LMCACHE_LOG="$RESULT_DIR/lmcache_server.log" -mkdir -p "$RESULT_DIR" - -OFFLOAD_ARGS=() -PREFIX_CACHE_ARGS=() - -# ---- Lmcache config ---------------------------------------------------------- -LMCACHE_PID="" - -cleanup_lmcache_server() { - if [[ -n "$LMCACHE_PID" ]] && kill -0 "$LMCACHE_PID" 2>/dev/null; then - kill "$LMCACHE_PID" 2>/dev/null || true - wait "$LMCACHE_PID" 2>/dev/null || true - fi -} - -trap cleanup_lmcache_server EXIT - -wait_for_lmcache_ready() { - { set +x; } 2>/dev/null - local attempts="${LMCACHE_READY_ATTEMPTS:-120}" - local tail_pid="" - - while [ ! -f "$LMCACHE_LOG" ]; do - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before creating log file. Exiting." 
>&2 - exit 1 - fi - sleep 1 - done - - tail -f -n +1 "$LMCACHE_LOG" & - tail_pid=$! - - for ((i = 1; i <= attempts; i++)); do - if curl --output /dev/null --silent --fail "http://127.0.0.1:${LMCACHE_HTTP_PORT}/healthcheck"; then - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - return 0 - fi - if [[ -n "$LMCACHE_PID" ]] && ! kill -0 "$LMCACHE_PID" 2>/dev/null; then - echo "LMCache server died before becoming healthy. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 - fi - sleep 1 - done - - echo "Timed out waiting for LMCache server healthcheck. Log follows:" >&2 - kill "$tail_pid" 2>/dev/null || true - wait "$tail_pid" 2>/dev/null || true - cat "$LMCACHE_LOG" >&2 || true - exit 1 -} - -case "$OFFLOADING" in - none) ;; - cpu) - unset VLLM_USE_SIMPLE_KV_OFFLOAD - # MI355X nodes have ~2.7 TiB of host DRAM available for offload; - # reserve 2.5 TB for the offload pool (leaves ~200 GB headroom for - # worker RSS / page cache / slurm cgroup). - TOTAL_CPU_DRAM_GB=3000 - TOTAL_CPU_DRAM_PARTITION_GB="${TOTAL_CPU_DRAM_PARTITION_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" - # Use vLLM's regular native KV-offload path (OffloadingConnector), - # NOT the SimpleCPUOffloadConnector. The "native" backend resolves to - # OffloadingConnector by default; setting VLLM_USE_SIMPLE_KV_OFFLOAD=1 - # would switch it to SimpleCPUOffloadConnector. We intentionally leave - # that env var UNSET here so the regular OffloadingConnector path is - # used. The shortcut --kv_offloading_backend native + --kv_offloading_size - # form constructs the KVTransferConfig at engine startup - # (vllm/config/vllm.py:662). 
- - # Remove --disable-hybrid-kv-cache-manager and enable hybrid kv cache manager (default) - # This gives extra cache hit than disabling hybrid kv cache manager - # srok, - # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma - # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 - OFFLOAD_ARGS=( - --kv_offloading_backend native - --kv_offloading_size "$TOTAL_CPU_DRAM_PARTITION_GB" - --disable-hybrid-kv-cache-manager - ) - ;; - lmcache) - { set +x; } 2>/dev/null - unset VLLM_USE_SIMPLE_KV_OFFLOAD - - git clone https://github.com/LMCache/LMCache.git - cd LMCache - pip install -r requirements/build.txt - CXX=hipcc BUILD_WITH_HIP=1 pip install -e . --no-build-isolation - cd .. - - python3 -c "import lmcache.integration.vllm.lmcache_mp_connector" >/dev/null - - # Match the B200 Kimi LMCache setup: keep a 2.5 TB semantic CPU KV - # pool, but let the external MP server own that pool so vLLM does not - # split --kv-offloading-size across TP ranks through the integrated - # LMCache backend. - TOTAL_CPU_DRAM_GB=3000 - LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}" - LMCACHE_PORT="${LMCACHE_PORT:-5555}" - LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}" - # LMCacheMPConnector concatenates lmcache.mp.host and port into the - # ZMQ endpoint. Bind the server to a raw host, but pass the connector a - # ZMQ-style host string. - LMCACHE_CONNECT_HOST="${LMCACHE_CONNECT_HOST:-tcp://$LMCACHE_HOST}" - LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / (8 / TP)))}" - LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}" - # LMCache read locks are leases on chunks that lookup has promised - # vLLM can retrieve. The default 300s TTL is too short for this - # long-context agentic queue: TP8/conc32 can spend >300s between - # lookup and retrieve while GPU KV is saturated, which leaves the - # object present in L1 but no longer readable. 
Keep the 2.5 TB pool - # size unchanged and only extend the lookup-to-retrieve lease. - LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-7200}" - LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}" - LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-$TP}" - export PYTHONHASHSEED="${PYTHONHASHSEED:-0}" - export LMCACHE_BLOCKING_TIMEOUT_SECS=120 - - set -x - echo "Starting LMCache MP server..." - LMCACHE_CMD=( - lmcache server - --host "$LMCACHE_HOST" - --port "$LMCACHE_PORT" - --http-host "$LMCACHE_HOST" - --http-port "$LMCACHE_HTTP_PORT" - --l1-size-gb "$LMCACHE_L1_SIZE_GB" - --l1-init-size-gb "$LMCACHE_L1_INIT_SIZE_GB" - --l1-read-ttl-seconds "$LMCACHE_L1_READ_TTL_SECONDS" - --chunk-size "$LMCACHE_CHUNK_SIZE" - --max-workers "$LMCACHE_MAX_WORKERS" - --eviction-policy LRU - ) - printf '%q ' "${LMCACHE_CMD[@]}" > "$RESULT_DIR/lmcache_command.txt" - printf '\n' >> "$RESULT_DIR/lmcache_command.txt" - "${LMCACHE_CMD[@]}" > "$LMCACHE_LOG" 2>&1 & - LMCACHE_PID=$! - echo "LMCache server PID: $LMCACHE_PID" - wait_for_lmcache_ready - - PREFIX_CACHE_ARGS=(--enable-prefix-caching) - # srok, - # --no-disable-hybrid-kv-cache-manager is not compatible with lmcache, even for non-hma - # https://github.com/vllm-project/vllm/blob/0585b5ba2eaa7860d6976bc7ba376bdbca5119fc/vllm/distributed/kv_transfer/kv_connector/factory.py#L56-L60 - OFFLOAD_ARGS=( - --kv-transfer-config - "{\"kv_connector\":\"LMCacheMPConnector\",\"kv_connector_module_path\":\"lmcache.integration.vllm.lmcache_mp_connector\",\"kv_role\":\"kv_both\",\"kv_connector_extra_config\":{\"lmcache.mp.host\":\"$LMCACHE_CONNECT_HOST\",\"lmcache.mp.port\":$LMCACHE_PORT}}" - --disable-hybrid-kv-cache-manager - ) - ;; - *) echo "Error: unsupported OFFLOADING value '$OFFLOADING'" >&2; exit 1 ;; -esac - -# ---- LLM server config ---------------------------------------------------------- -EP_ARGS=() -if [ "$EP_SIZE" -gt 1 ]; then - EP_ARGS=(--enable-expert-parallel) -fi - -echo "Starting vllm server..." 
-export PYTHONNOUSERSITE=1 - -# Install amd-quark for MXFP4 (manual install due to ROCm vLLM bug) -pip install -q amd-quark - -# Workaround for MEC FW <177 RCCL memory reclaim issue -version=$(rocm-smi --showfw 2>/dev/null | grep MEC | head -n 1 | awk '{print $NF}') -if [[ "$version" == "" || ${version:-0} -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_USE_AITER=1 -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -{ set +x; } 2>/dev/null -VLLM_CMD=( - vllm serve "$MODEL" - --host 0.0.0.0 - --port "$PORT" - --tensor-parallel-size="$TP" - "${EP_ARGS[@]}" - --gpu-memory-utilization 0.95 - --kv-cache-dtype fp8 \ - --block-size=32 - --trust-remote-code - --attention-backend "ROCM_AITER_FA" - --max-num-seqs "$CONC" - "${PREFIX_CACHE_ARGS[@]}" - "${OFFLOAD_ARGS[@]}" -) -printf '%q ' "${VLLM_CMD[@]}" | tee "$RESULT_DIR/vllm_command.txt" -printf '\n' | tee -a "$RESULT_DIR/vllm_command.txt" -"${VLLM_CMD[@]}" > "$SERVER_LOG" 2>&1 & -SERVER_PID=$! -echo "Server PID: $SERVER_PID" - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -# ---- Run benchmark ---------------------------------------------------------- -build_replay_cmd "$RESULT_DIR" - -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh index 656e924dc..ff901b674 100755 --- a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x.sh @@ -2,117 +2,51 @@ set -euo pipefail set -x -# Agentic trace replay benchmark for Qwen3.5 FP8 on MI300X using SGLang. -# -# Base server recipe follows the upstream MI300X reference -# (benchmarks/single_node/qwen3.5_fp8_mi300x.sh, the "AMD Andy" recipe): -# aiter attention backend, aiter allreduce fusion, mem-fraction 0.75. 
-# The agentic harness (resolve_trace_source / build_replay_cmd / -# run_agentic_replay_and_write_outputs) replaces run_benchmark_serving, and -# --disable-radix-cache is dropped because agentic replay needs prefix reuse. +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. # # Required env vars: -# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE -# -# OFFLOADING values: -# none - SGLang GPU KV with the default RadixAttention prefix cache. -# hicache - SGLang HiCache with a local CPU hierarchical cache on top of radix. +# MODEL, TP, CONC, RESULT_DIR, DURATION, EP_SIZE source "$(dirname "$0")/../../benchmark_lib.sh" -check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR EP_SIZE DP_ATTENTION +check_env_vars MODEL TP CONC RESULT_DIR DURATION EP_SIZE -PORT=${PORT:-8888} -DURATION=${DURATION:-1800} -EP_SIZE=${EP_SIZE:-1} - -SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi if [[ -n "${SLURM_JOB_ID:-}" ]]; then echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), download to the HF hub cache and use $MODEL as the path. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi rocm-smi || true amd-smi || true -# ---- Resolve traces and install deps ---------------------------------------- -# Cap the replay corpus at 256k (470 traces, max in+out <= 256k) instead of the -# unfiltered 052726 corpus whose ~1M-token traces get rejected and add no perf -# signal at high concurrency.
-#export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k -#060226 -export WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_060226_256k - # ---- Resolve traces and install deps ---------------------------------------- resolve_trace_source install_agentic_deps -# ---- Cache / offload config ------------------------------------------------- +# ---- Start SGLang server ---------------------------------------------------- SERVER_LOG="$RESULT_DIR/server.log" mkdir -p "$RESULT_DIR" -CACHE_ARGS=() -WARMUP_ARGS=() -CUDA_GRAPH_MAX_BS="$CONC" -case "$OFFLOADING" in - none) - # Leave SGLang's default RadixAttention prefix cache on — agentic - # replay needs it; --disable-radix-cache would zero the hit rate. - ;; - hicache) - # Qwen3.5's hybrid GDN/Mamba path allocates two HiCache host pools per - # TP rank (one hierarchical KV, one hierarchical Mamba), so the - # node-total DRAM budget divides by TP and the host-pool count. - TOTAL_CPU_DRAM_GB=3000 - HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" - HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-300}}" - HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" - # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler, which - # requires page_size=1. Keep the safer direct/layer_first copy path; - # kernel/page_first faults on first prefill in this mode on ROCm. 
- HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" - HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" - HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" - HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" - if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then - HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" - fi - if [ "$HICACHE_SIZE_GB" -lt 1 ]; then - echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 - exit 1 - fi - echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" - CACHE_ARGS=( - --page-size "$HICACHE_PAGE_SIZE" - --enable-hierarchical-cache - --hicache-size "$HICACHE_SIZE_GB" - --hicache-io-backend "$HICACHE_IO_BACKEND" - --hicache-mem-layout "$HICACHE_MEM_LAYOUT" - --hicache-write-policy "$HICACHE_WRITE_POLICY" - ) - # HiCache startup reaches API readiness but SGLang's internal warmup - # request can time out on this path; let aiperf own benchmark traffic. - WARMUP_ARGS=(--skip-server-warmup) - # Don't force ROCm graph capture at every high concurrency point; conc=16 - # is the highest known-good capture size for this model/server path. - HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-256}" - if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then - CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" - fi - ;; - *) - echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 - exit 1 - ;; -esac - echo "Starting SGLang server..." 
export PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --attention-backend triton \ - --model-path $MODEL \ + --model-path "$MODEL_PATH" --served-model-name "$MODEL" \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ @@ -122,10 +56,10 @@ python3 -m sglang.launch_server \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ --max-running-requests $CONC \ - --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ + --max-prefill-tokens 32768 \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 \ - "${CACHE_ARGS[@]}" \ - "${WARMUP_ARGS[@]}" \ + --context-length $MAX_MODEL_LEN \ --enable-metrics > "$SERVER_LOG" 2>&1 & SERVER_PID=$! echo "Server PID: $SERVER_PID" @@ -135,4 +69,4 @@ wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$S # ---- Run benchmark ---------------------------------------------------------- build_replay_cmd "$RESULT_DIR" -run_agentic_replay_and_write_outputs "$RESULT_DIR" \ No newline at end of file +run_agentic_replay_and_write_outputs "$RESULT_DIR" diff --git a/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh new file mode 100755 index 000000000..cdded8860 --- /dev/null +++ b/benchmarks/single_node/agentic/qwen3.5_fp8_mi355x_sglang.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +# Agentic trace replay benchmark for Qwen3.5 FP8 on MI355X using SGLang. +# +# Required env vars: +# MODEL, TP, CONC, OFFLOADING, TOTAL_CPU_DRAM_GB, RESULT_DIR, DURATION, EP_SIZE +# +# OFFLOADING values: +# none - SGLang GPU KV only, with the default radix prefix cache left on. +# hicache - SGLang HiCache with local CPU hierarchical cache.
+ +source "$(dirname "$0")/../../benchmark_lib.sh" + +check_env_vars MODEL TP CONC OFFLOADING TOTAL_CPU_DRAM_GB RESULT_DIR DURATION EP_SIZE + +SCHEDULER_RECV_INTERVAL=${SCHEDULER_RECV_INTERVAL:-30} +if [ -z "${MAX_MODEL_LEN:-}" ] || [ "$MAX_MODEL_LEN" = "0" ]; then + MAX_MODEL_LEN=131072 +fi + +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}" +fi + +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), download to the HF hub cache and use $MODEL as the path. +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi +rocm-smi || true +amd-smi || true + +# ---- Resolve traces and install deps ---------------------------------------- +resolve_trace_source +install_agentic_deps + +# ---- Server config ---------------------------------------------------------- +SERVER_LOG="$RESULT_DIR/server.log" +mkdir -p "$RESULT_DIR" + +CACHE_ARGS=() +WARMUP_ARGS=() +CUDA_GRAPH_MAX_BS="$CONC" +case "$OFFLOADING" in + none) + # Leave SGLang's default RadixAttention prefix cache on — agentic + # replay needs it; --disable-radix-cache would zero the hit rate. + ;; + hicache) + # MI355X nodes have about 3 TB of host DRAM, but Qwen3.5's hybrid + # GDN/Mamba path allocates two HiCache host pools per TP rank: one for + # hierarchical KV cache and one for hierarchical Mamba cache. A 2 TB + # node-total target at TP=8 is therefore 2000 / (8 * 2) = 125 GB per + # host pool, not 250 GB. Keep overrides for one-off tuning.
+ TOTAL_CPU_DRAM_GB="${HICACHE_TOTAL_CPU_DRAM_GB:-2000}" + HICACHE_HOST_POOL_COUNT="${HICACHE_HOST_POOL_COUNT:-2}" + HICACHE_MAX_SIZE_GB_PER_RANK_POOL="${HICACHE_MAX_SIZE_GB_PER_RANK_POOL:-${HICACHE_MAX_SIZE_GB_PER_RANK:-180}}" + HICACHE_WRITE_POLICY="${HICACHE_WRITE_POLICY:-write_through_selective}" + # Qwen3.5's hybrid Mamba path runs SGLang's no_buffer scheduler on + # MI355X, which requires page_size=1. The kernel/page_first HiCache + # transfer path faults on first prefill in this mode on ROCm, so keep + # the default on the safer direct/layer_first copy path. These remain + # env-overridable for future SGLang/ROCm fixes. + HICACHE_PAGE_SIZE="${HICACHE_PAGE_SIZE:-1}" + HICACHE_IO_BACKEND="${HICACHE_IO_BACKEND:-direct}" + HICACHE_MEM_LAYOUT="${HICACHE_MEM_LAYOUT:-layer_first}" + # SGLang --hicache-size is per rank per host pool, while the workflow + # input is a node-total DRAM budget. Divide by TP and the number of + # host pools unless HICACHE_SIZE_GB is set directly for one-off tuning. + HICACHE_SIZE_GB="${HICACHE_SIZE_GB:-$((TOTAL_CPU_DRAM_GB / TP / HICACHE_HOST_POOL_COUNT))}" + if [ "$HICACHE_SIZE_GB" -gt "$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" ]; then + HICACHE_SIZE_GB="$HICACHE_MAX_SIZE_GB_PER_RANK_POOL" + fi + if [ "$HICACHE_SIZE_GB" -lt 1 ]; then + echo "Error: computed HICACHE_SIZE_GB=$HICACHE_SIZE_GB from TOTAL_CPU_DRAM_GB=$TOTAL_CPU_DRAM_GB, TP=$TP, HICACHE_HOST_POOL_COUNT=$HICACHE_HOST_POOL_COUNT" >&2 + exit 1 + fi + echo "HiCache CPU pool: ${HICACHE_SIZE_GB} GB per rank per host pool across TP=${TP}, host_pool_count=${HICACHE_HOST_POOL_COUNT}" + CACHE_ARGS=( + --page-size "$HICACHE_PAGE_SIZE" + --enable-hierarchical-cache + --hicache-size "$HICACHE_SIZE_GB" + --hicache-io-backend "$HICACHE_IO_BACKEND" + --hicache-mem-layout "$HICACHE_MEM_LAYOUT" + --hicache-write-policy "$HICACHE_WRITE_POLICY" + ) + # HiCache startup reaches API readiness, but SGLang's internal warmup + # request has timed out after 600s on this Qwen MI355X path. 
Let aiperf + # own benchmark traffic instead of blocking server readiness on it. + WARMUP_ARGS=(--skip-server-warmup) + # Keep request concurrency as the swept variable, but do not force + # HiCache runs to capture ROCm graphs at every high concurrency point. + # The conc=32 HiCache job crashed after startup readiness, before any + # aiperf traffic, while conc=16 is the highest known-good capture size + # for this model/server path. Requests above the capture size can still + # run; they just do not require a larger captured graph at startup. + HICACHE_CUDA_GRAPH_MAX_BS="${HICACHE_CUDA_GRAPH_MAX_BS:-16}" + if [ "$HICACHE_CUDA_GRAPH_MAX_BS" -lt "$CUDA_GRAPH_MAX_BS" ]; then + CUDA_GRAPH_MAX_BS="$HICACHE_CUDA_GRAPH_MAX_BS" + fi + ;; + *) + echo "Error: unsupported OFFLOADING value '$OFFLOADING' (expected one of: none, hicache)" >&2 + exit 1 + ;; +esac + +echo "Starting SGLang server..." +export PYTHONNOUSERSITE=1 + +{ set +x; } 2>/dev/null +SGLANG_CMD=( + python3 -m sglang.launch_server + --attention-backend triton + --model-path "$MODEL_PATH" --served-model-name "$MODEL" + --host=0.0.0.0 + --port "$PORT" + --tensor-parallel-size "$TP" + --ep-size "$EP_SIZE" + --trust-remote-code + --tokenizer-worker-num 6 + --enable-aiter-allreduce-fusion + --cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS" + --max-running-requests "$CONC" + --max-prefill-tokens 32768 + --scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL" + --mem-fraction-static 0.8 + --context-length "$MAX_MODEL_LEN" + --enable-metrics + "${CACHE_ARGS[@]}" + "${WARMUP_ARGS[@]}" +) +printf '%q ' "${SGLANG_CMD[@]}" | tee "$RESULT_DIR/sglang_command.txt" +printf '\n' | tee -a "$RESULT_DIR/sglang_command.txt" +"${SGLANG_CMD[@]}" > "$SERVER_LOG" 2>&1 & +SERVER_PID=$! 
+echo "Server PID: $SERVER_PID" + +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +# ---- Run benchmark ---------------------------------------------------------- +build_replay_cmd "$RESULT_DIR" + +run_agentic_replay_and_write_outputs "$RESULT_DIR"