From 90e5193d133db3ce8378597a9d3f10d047d4120b Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 13:55:27 -0700 Subject: [PATCH 1/7] Add MiniMax-M2.5 FP8 GB200 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 135 ++++++++++++++++++ .../1k1k/disagg-gb200-1p1d-tp4.yaml | 67 +++++++++ .../1k1k/disagg-gb200-1p2d-tp4.yaml | 67 +++++++++ .../1k1k/disagg-gb200-1p3d-tp4ep.yaml | 72 ++++++++++ .../1k1k/disagg-gb200-1p4d-dep2.yaml | 74 ++++++++++ .../1k1k/disagg-gb200-2p1d-dep8.yaml | 86 +++++++++++ .../1k1k/disagg-gb200-2p3d-dep4.yaml | 74 ++++++++++ .../8k1k/disagg-gb200-1p1d-tp4.yaml | 68 +++++++++ .../8k1k/disagg-gb200-1p1d-tp4ep.yaml | 69 +++++++++ .../8k1k/disagg-gb200-3p2d-dep4.yaml | 76 ++++++++++ perf-changelog.yaml | 7 + runners/launch_gb200-nv.sh | 92 +++++++++++- 12 files changed, 881 insertions(+), 6 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..46dc4cfd8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9905,3 +9905,138 @@ qwen3.5-fp8-h100-sglang-agentic: search-space: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + +minimaxm2.5-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [2, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [1024] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml" + decode: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [4096, 8192] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [1, 4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml new file mode 100644 index 000000000..120a35e45 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml @@ -0,0 +1,67 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml new file mode 100644 index 000000000..6b5e76e42 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml @@ -0,0 +1,67 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p2d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2x32x64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml new file mode 100644 index 000000000..765562d0c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p3d-tp4ep" + +# Rate-matched tp4ep for FP8 GB200 1k/1k. +# X_tp4ep_fp8_gb200 = 17.9k tok/s; P_per_worker = 48k; ideal X/P = 0.37; 1P:3D = 0.33 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml new file mode 100644 index 000000000..aeeb8a012 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml @@ -0,0 +1,74 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p4d-dep2" + +# Rate-matched dep2 for FP8 GB200 1k/1k. +# X_dep2_fp8_gb200 = 12.7k tok/s; P_per_worker = 48k; ideal X/P = 0.27; 1P:4D = 0.25 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml new file mode 100644 index 000000000..83bc7aeb2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml @@ -0,0 +1,86 @@ +name: "minimax-m2.5-vllm-disagg-gb200-2p1d-dep8" + +# model: +# path: "minimax-m2.5-fp8" +# container: "v0.18.1" +# precision: "fp8" + +# dynamo: +# version: 1.0.1 +# install: true + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + + + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024" + # warmup_prompts: 1 + # use_chat_template: false + # req_rate: "inf" + # random_range_ratio: 1.0 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml new file mode 100644 index 000000000..534019222 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml @@ -0,0 +1,74 @@ +name: "minimax-m2.5-vllm-disagg-gb200-2p3d-dep4" + +# Rate-matched dep4 for FP8 GB200 1k/1k. +# X_dep4_fp8_gb200 = 30.9k tok/s; P_per_worker = 48k; ideal X/P = 0.64; 2P:3D = 0.67 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml new file mode 100644 index 000000000..847c4b138 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + max-num-batched-tokens: 16384 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml new file mode 100644 index 000000000..15f50e108 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml @@ -0,0 +1,69 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4ep" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml new file mode 100644 index 000000000..c92757146 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml @@ -0,0 +1,76 @@ +name: "minimax-m2.5-vllm-disagg-gb200-8k1k-3p2d-dep4" + +# Rate-matched dep4 for FP8 GB200 8k/1k. +# X_dep4_fp8_gb200_8k ≈ 9.8k tok/s (from 5p2d-dep4 saturation); +# P_per_worker_8k = 57k; ratio = X*8/P = 78.4/57 = 1.38; 3P:2D = 1.5 ✓ (closest int fit) + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048x4096" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..996c36585 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3395,3 +3395,10 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627 + +- config-keys: + - minimaxm2.5-fp8-gb200-dynamo-vllm + description: + - "Add MiniMax-M2.5 FP8 GB200 disaggregated multinode vLLM benchmarks via Dynamo" + - "Add 1k1k/8k1k FP8 recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dada98bd6..25640bef9 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -54,8 +54,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then # model.path alias in our DSV4 recipes. export MODEL_PATH="/mnt/numa1/models/deepseek-v4-pro/" export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" + elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then + export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp8" exit 1 fi else @@ -141,6 +144,37 @@ fi echo "Cloning srt-slurm repository..." SRT_REPO_DIR="srt-slurm" +# On the watchtower (Oracle) gb200 cluster, /home/slurm-shared is not +# cross-mounted to compute nodes. Put the srt-slurm workspace and staged +# InferenceX checkout on a writable shared-FS path that compute can see. +# Per-run-unique paths avoid races between parallel sweep jobs. +if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then + SHARED_BASE="" + for cand in \ + /mnt/lustre01/users-public/sa-shared/gha-runs \ + /mnt/lustre01/users/slurm-shared/gha-runs \ + /mnt/lustre01/users-public/slurm-shared/gha-runs \ + /mnt/lustre01/groups/slurm-shared/gha-runs \ + /nfs/slurm-shared/gha-runs \ + /home/slurm-shared/gharunners/gha-runs + do + if mkdir -p "$cand" 2>/dev/null && touch "$cand/.write-probe.$$" 2>/dev/null; then + rm -f "$cand/.write-probe.$$" 2>/dev/null + SHARED_BASE="$cand" + echo "Selected SHARED_BASE=$SHARED_BASE (first writable candidate)" + break + else + echo " not writable: $cand" + fi + done + if [ -z "$SHARED_BASE" ]; then + echo "Error: no writable shared run directory candidate found on this cluster" >&2 + exit 1 + fi + RUN_KEY="${GITHUB_RUN_ID:-manual}-${GITHUB_RUN_ATTEMPT:-0}-${RUNNER_NAME:-gb200-nv}-$$" + SRT_REPO_DIR="${SHARED_BASE}/srt-slurm-${RUN_KEY}" + echo "Using shared-FS SRT_REPO_DIR=$SRT_REPO_DIR (compute-visible)" +fi if [ -d "$SRT_REPO_DIR" ]; then echo "Removing existing $SRT_REPO_DIR..." rm -rf "$SRT_REPO_DIR" @@ -170,6 +204,12 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsv4" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" recipes/sglang/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 + cd "$SRT_REPO_DIR" || exit 1 + git checkout main || exit 1 + mkdir -p recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8" recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -187,7 +227,17 @@ echo "Installing srtctl..." curl -LsSf https://astral.sh/uv/install.sh | sh source $HOME/.local/bin/env -uv venv +# Watchtower: the launcher runs on the head node but compute nodes +# inherit the activated .venv (via VIRTUAL_ENV) through SRT_REPO_DIR +# which is now on shared FS. If uv's default python install lives +# under a head-node-only path, .venv/bin/python3 becomes a broken +# symlink on compute. Pin the venv to /usr/bin/python3 — a system +# path that exists at the same location on both head and compute. +if [[ $MODEL_PREFIX == "minimaxm2.5" && -x /usr/bin/python3 ]]; then + uv venv --seed --python /usr/bin/python3 +else + uv venv --seed +fi source .venv/bin/activate uv pip install -e . @@ -200,6 +250,12 @@ echo "Configs available at: $SRT_REPO_DIR/" # Create srtslurm.yaml for srtctl (used by both frameworks) SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +# Minimax on watchtower: SRT_REPO_DIR was moved to a shared-FS path +# above so srtctl's outputs/ directory (which lives under +# SRTCTL_ROOT) is visible to compute nodes. +if [[ $MODEL_PREFIX == "minimaxm2.5" ]]; then + SRTCTL_ROOT="$SRT_REPO_DIR" +fi echo "Creating srtslurm.yaml configuration..." cat > srtslurm.yaml <&2 + echo "Current directory: $(pwd)" >&2 + exit 1 +fi +sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) else - SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) fi echo "$SRTCTL_OUTPUT" From 7d861f6bc7cce41143ee4571aaa2d6f7721d9b50 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 14:12:08 -0700 Subject: [PATCH 2/7] Update MiniMax-M2.5 FP8 GB200 PR link --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 996c36585..42cec9655 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3401,4 +3401,4 @@ description: - "Add MiniMax-M2.5 FP8 GB200 disaggregated multinode vLLM benchmarks via Dynamo" - "Add 1k1k/8k1k FP8 recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1648 From f601cfed3ceff86e9650db3436199de02199d007 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 14:17:45 -0700 Subject: [PATCH 3/7] Add GB200 FP8 8k prefill batch limit --- .../minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml index 15f50e108..61497185a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml @@ -44,6 +44,7 @@ backend: tensor-parallel-size: 1 pipeline-parallel-size: 1 data-parallel-size: 2 + max-num-batched-tokens: 16384 data-parallel-rpc-port: 13346 enable-expert-parallel: true safetensors-load-strategy: "prefetch" From ac2327418b72fa90d097fbaa41a9c1586292d562 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:14:48 -0700 Subject: [PATCH 4/7] Add MiniMax-M2.5 FP4 GB300 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 174 ++++++++++++++++++ .../minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml | 73 ++++++++ .../vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml | 73 ++++++++ .../vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml | 72 ++++++++ .../vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml | 68 +++++++ .../vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml | 68 +++++++ perf-changelog.yaml | 7 + runners/launch_gb300-nv.sh | 11 +- 15 files changed, 1029 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b8db07e0d..69f56fa08 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9261,6 +9261,180 @@ qwen3.5-fp8-h100-sglang-mtp: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } +minimaxm2.5-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: gb300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [2, 4, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4, 8, 16, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [6144, 8192] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + glm5-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: nvidia/GLM-5-NVFP4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml new file mode 100644 index 000000000..c7f7e28af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2-c6144" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + spread_workers: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144x8192" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml new file mode 100644 index 000000000..adaf6f271 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + spread_workers: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml new file mode 100644 index 000000000..28427e002 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..eee93c9f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2x4x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml new file mode 100644 index 000000000..10ba980ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x64" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..ebff26fb0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml new file mode 100644 index 000000000..5353e4dd0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p3d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml new file mode 100644 index 000000000..d3c777618 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml new file mode 100644 index 000000000..a56c095af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..a92975c57 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..53daeafbd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128x256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml new file mode 100644 index 000000000..163d412f5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-2p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 28523da86..c4f5232b9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3409,3 +3409,10 @@ description: - "Add MiniMax-M2.5 NVFP4 GB200 disaggregated multinode vLLM benchmarks via Dynamo" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1642 + +- config-keys: + - minimaxm2.5-fp4-gb300-dynamo-vllm + description: + - "Add MiniMax-M2.5 NVFP4 GB300 disaggregated multinode vLLM benchmarks via Dynamo" + - "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index b47e103fd..0a24120f1 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -45,8 +45,11 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then export MODEL_PATH=/scratch/models/GLM-5-FP8 export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then + export MODEL_PATH=/data/models/MiniMax-M2.5-NVFP4 + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8, minimaxm2.5-fp4" exit 1 fi @@ -137,6 +140,12 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/glm5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout main + mkdir -p recipes/vllm/minimax-m2.5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5" recipes/vllm/minimax-m2.5 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 97fbd88df1da6b8577d8b4db553e727738b0d733 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:31:05 -0700 Subject: [PATCH 5/7] fix: pin minimax gb300 sweep to nv runners --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 69f56fa08..3a4f1a4f1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9265,7 +9265,7 @@ minimaxm2.5-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 - runner: gb300 + runner: gb300-nv precision: fp4 framework: dynamo-vllm multinode: true From 45759a70815cb4ce7a43223e094bb13e4d99e4e5 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 18:24:42 -0700 Subject: [PATCH 6/7] Fix GB300 eval artifact copy --- runners/launch_gb300-nv.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 0a24120f1..4677e5004 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -379,7 +379,9 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then shopt -s nullglob for eval_file in "$EVAL_DIR"/*; do [ -f "$eval_file" ] || continue - cp "$eval_file" "$GITHUB_WORKSPACE/" + eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")" + rm -f "$eval_dest" + cp "$eval_file" "$eval_dest" echo "Copied eval artifact: $(basename "$eval_file")" done shopt -u nullglob From fd35ba3eb568102fa5c7b67967dc92b0b77a8e7c Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Wed, 3 Jun 2026 07:30:27 -0700 Subject: [PATCH 7/7] Limit PR 1648 sweep to GB200 FP8 --- perf-changelog.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 756810ac9..b0eb01451 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3431,13 +3431,6 @@ - "Same 1k/1k and 8k/1k search space as gb300, plus a new tp8-1p1d at low concurrencies for both ISLs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1652 -- config-keys: - - minimaxm2.5-fp4-gb300-dynamo-vllm - description: - - "Add MiniMax-M2.5 NVFP4 GB300 disaggregated multinode vLLM benchmarks via Dynamo" - - "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641 - - config-keys: - minimaxm2.5-fp8-gb200-dynamo-vllm description: