From 041f84c071d1505b21c2b5879203feebc62f96b3 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Tue, 2 Jun 2026 17:22:30 -0700 Subject: [PATCH 1/3] [B300][vLLM] Add MiniMax-M2.5 FP4 disagg Dynamo configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split of SemiAnalysisAI/InferenceX#1560 — B300 half. - Add minimaxm2.5-fp4-b300-dynamo-vllm to nvidia-master.yaml (1k1k + 8k1k search spaces; image vllm/vllm-openai:v0.20.1, model nvidia/MiniMax-M2.5-NVFP4). - Add srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/. - Wire minimax + dynamo-vllm routing into runners/launch_b300-nv.sh. - Append perf-changelog entry. --- .github/configs/nvidia-master.yaml | 213 ++++++++++++++++++ .../minimax-m2.5-b300/1k1k/dep2-1p2d.yaml | 72 ++++++ .../1k1k/dep2-2p3d-c6144.yaml | 72 ++++++ .../minimax-m2.5-b300/1k1k/dep2-2p3d.yaml | 72 ++++++ .../minimax-m2.5-b300/1k1k/dep8-2p1d.yaml | 71 ++++++ .../vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml | 73 ++++++ .../vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml | 69 ++++++ .../minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml | 71 ++++++ .../minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml | 69 ++++++ .../vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml | 78 +++++++ .../minimax-m2.5-b300/8k1k/dep4-4p1d.yaml | 71 ++++++ .../minimax-m2.5-b300/8k1k/dep8-4p1d.yaml | 71 ++++++ .../vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml | 69 ++++++ .../minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml | 69 ++++++ .../minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml | 69 ++++++ .../vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml | 73 ++++++ perf-changelog.yaml | 8 + runners/launch_b300-nv.sh | 11 +- 18 files changed, 1300 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b8db07e0d..e73a7a654 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -10144,3 +10144,216 @@ minimaxm2.5-fp4-gb200-dynamo-vllm: tp: 4 ep: 4 dp-attn: true + +minimaxm2.5-fp4-b300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp8-1p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-1p2d.yaml" + decode: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [6144, 8192] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [1024, 1536, 2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - 
isl: 8192 + osl: 1024 + search-space: + - conc-list: [2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp8-1p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + - conc-list: [32, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [384, 512] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [384] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml new file mode 100644 index 000000000..d6e6dc53c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-1p2d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-2xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: 
install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 2 +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml new file mode 100644 index 000000000..3fd24aa25 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d-c6144.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-2p3xdep2-c6144" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: 
install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144x8192" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml new file mode 100644 index 000000000..bc68f6d59 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep2-2p3d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-2p3xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + 
+resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048x4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml new file mode 100644 index 000000000..516e51f11 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/dep8-2p1d.yaml @@ -0,0 +1,71 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-2p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + 
gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x1536x2048x4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..726b5a63b --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p1d.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + 
decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml new file mode 100644 index 000000000..77329ffcc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4-1p2d.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + 
decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..4f25aee38 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p1d.yaml @@ -0,0 +1,71 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4ep-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + 
type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml new file mode 100644 index 000000000..8da4cb7ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp4ep-1p3d.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp4ep-1p3d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + 
type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml new file mode 100644 index 000000000..757eeed97 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/1k1k/tp8-1p1d.yaml @@ -0,0 +1,78 @@ +name: "minimax-m2.5-vllm-disagg-b300-decode-focus-tp8-1p1d" + +# B300-only: full-node TP=8 decode (the 8 GPUs of a single B300 node). +# Cousin of tp4-1p1d.yaml but exercises the wider TP that B300's per-node +# GPU count makes available. Only a single small concurrency (4) — +# this topology is decode-latency focused, not throughput. 
+ +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml new file mode 100644 index 000000000..258e9ba4f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep4-4p1d.yaml @@ -0,0 +1,71 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-4p1xdep4" + +model: 
+ path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "384x512" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml new file mode 100644 index 000000000..1f41e52e2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/dep8-4p1d.yaml @@ -0,0 +1,71 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-4p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + 
precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "384" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..91761b75f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4-1p1d.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + 
+setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..76b000e8c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-1p1d.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + 
prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml new file mode 100644 index 000000000..b34025ee2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp4ep-2p1d.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-2p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + 
enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256x512" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml new file mode 100644 index 000000000..ea276c25a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300/8k1k/tp8-1p1d.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-b300-8k1k-1p1xtp8" + +# B300-only: full-node TP=8 decode at 8k input. Cousin of tp4-1p1d.yaml +# but exercises the wider TP that B300's per-node GPU count makes +# available. Runs a single low concurrency only (4). 
+ +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 28523da86..7fba477b0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3409,3 +3409,11 @@ description: - "Add MiniMax-M2.5 NVFP4 GB200 disaggregated multinode vLLM benchmarks via Dynamo" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1642 + +- config-keys: + - minimaxm2.5-fp4-b300-dynamo-vllm + description: + - "Add MiniMax-M2.5 NVFP4 B300 disaggregated multinode vLLM benchmarks via Dynamo" + - "Image: vllm/vllm-openai:v0.20.1" + - "Same 1k/1k 
and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" + pr-link: PLACEHOLDER_PR_LINK diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 67e8b48cc..e4a253ba3 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -39,8 +39,11 @@ elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo- fi export MODEL_PATH="${SELECTED_MODEL_PATH:-/data/models/dsv4-pro}" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/data/models/MiniMax-M2.5-NVFP4" + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm" exit 1 fi @@ -61,6 +64,12 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/vllm/minimax-m2.5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300" recipes/vllm/minimax-m2.5 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 From 57fb086b7305ea99ccc1d3846df16023f21362a8 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Tue, 2 Jun 2026 17:23:00 -0700 Subject: [PATCH 2/3] perf-changelog: link minimaxm2.5-fp4-b300 entry to PR #83 --- perf-changelog.yaml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7fba477b0..58cf58e6d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3416,4 +3416,4 @@ - "Add MiniMax-M2.5 NVFP4 B300 disaggregated multinode vLLM benchmarks via Dynamo" - "Image: vllm/vllm-openai:v0.20.1" - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" - pr-link: PLACEHOLDER_PR_LINK + pr-link: https://github.com/NVIDIA/InferenceMAX/pull/83 From 6a5a06901b41cf772641619931ef8f2517470e03 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Tue, 2 Jun 2026 17:37:59 -0700 Subject: [PATCH 3/3] perf-changelog: link minimaxm2.5-fp4-b300 entry to PR #1652 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 58cf58e6d..83bdf91cd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3416,4 +3416,4 @@ - "Add MiniMax-M2.5 NVFP4 B300 disaggregated multinode vLLM benchmarks via Dynamo" - "Image: vllm/vllm-openai:v0.20.1" - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" - pr-link: https://github.com/NVIDIA/InferenceMAX/pull/83 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652