From 300adb4a7466292f0e72fc031a15ad1549ff4d3c Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:51:04 -0700 Subject: [PATCH 1/3] Add MiniMax-M2.5 FP4 B200 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 239 ++++++++++++++++++ .../minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml | 72 ++++++ .../1k1k/dep2-2p3d-c6144.yaml | 72 ++++++ .../minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml | 72 ++++++ .../minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml | 74 ++++++ .../minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml | 70 +++++ .../minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml | 72 ++++++ .../minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml | 68 +++++ .../1k1k/tp4ep-1p1d-hi-conc.yaml | 68 +++++ .../1k1k/tp4ep-1p1d.yaml | 70 +++++ .../1k1k/tp4ep-1p2d.yaml | 68 +++++ .../1k1k/tp4ep-1p3d-hi-conc.yaml | 68 +++++ .../1k1k/tp4ep-1p3d.yaml | 68 +++++ .../1k1k/tp4ep-2p3d.yaml | 72 ++++++ .../minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml | 75 ++++++ .../minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml | 68 +++++ .../8k1k/tp4ep-1p1d-hi-conc.yaml | 68 +++++ .../8k1k/tp4ep-1p1d.yaml | 68 +++++ perf-changelog.yaml | 6 + runners/launch_b200-dgxc-slurm.sh | 1 + runners/launch_b200-dgxc.sh | 64 ++++- 21 files changed, 1497 insertions(+), 6 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml create mode 120000 runners/launch_b200-dgxc-slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d3b1b6729..77f896b63 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9905,3 +9905,242 @@ qwen3.5-fp8-h100-sglang-agentic: search-space: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } + +minimaxm2.5-fp4-b200-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b200-multinode + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [128, 256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [1024, 2048] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 1024] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [2048, 4096, 8192] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml" + decode: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [6144] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [1024, 2048] + prefill: + num-worker: 3 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - 
"CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml new file mode 100644 index 000000000..badf45403 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + 
isl: 1024 + osl: 1024 + concurrencies: "2048x4096x8192" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml new file mode 100644 index 000000000..c3c994bca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2-c6144" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 
1024 + concurrencies: "6144" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml new file mode 100644 index 000000000..5b352e35f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + random_range_ratio: 
0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml new file mode 100644 index 000000000..b7809a9e2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml @@ -0,0 +1,74 @@ +name: "minimax-m2.5-vllm-disagg-b200-1k1k-3p2xdep4" + +# Rate-matched dep4 at 1k/1k. +# Measured X_dep4/P = 56.8k / 38k = 1.49; 3P:2D ratio = 1.5 ✓ + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 
diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml new file mode 100644 index 000000000..683f4c72d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + random_range_ratio: 0.8 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..bc6a6a1ac --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + random_range_ratio: 0.8 diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml new file mode 100644 index 000000000..5d7072ea5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml new file mode 100644 index 000000000..23ec9444c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml new file mode 100644 index 
000000000..4a56ab27e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml new file mode 100644 index 000000000..87c928c63 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml new file mode 100644 index 000000000..e82838715 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: 
"minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml new file mode 100644 index 000000000..268a58535 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d" + +model: + path: "minimax-m2.5-nvfp4" + container: 
"vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml new file mode 100644 index 000000000..0d83e2e63 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-1k1k-2p3xtp4ep" + +# Better-matched tp4ep at 1k/1k. 
+# Measured X_tp4ep/P = 24.1k / 38k = 0.63; 2P:3D ratio = 0.67 ✓ + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml new file mode 100644 index 000000000..0a867e508 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml @@ -0,0 +1,75 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-2p1xdep4" + +# Rate-matched dep4 at 8k/1k. 
+# Measured X_dep4_8k = 13.6k tok/s; rate-match ratio = X*8/P_8k = 13.6*8/58 = 1.88 +# 2P:1D = 2.0, much closer to optimum than 4P:1D (2× over-prefilled). + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..75c7b9d73 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml @@ -0,0 
+1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml new file mode 100644 index 000000000..c43abe595 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: 
"vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..3d295e290 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + 
+resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index d201e9f3b..6458d00e4 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3395,3 +3395,9 @@ description: - "Add DeepSeek-V4-Pro FP4 MI355X ATOM MTP3 benchmark; image rocm/atom:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.10.0_atom0.1.3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1627 + +- config-keys: + - minimaxm2.5-fp4-b200-dynamo-vllm + description: + - "Add MiniMax-M2.5 NVFP4 B200 disaggregated multinode vLLM benchmarks via Dynamo" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh new file mode 120000 index 000000000..f45b7edcb --- /dev/null +++ 
b/runners/launch_b200-dgxc-slurm.sh @@ -0,0 +1 @@ +launch_b200-dgxc.sh \ No newline at end of file diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index bb3bf9ed1..f57df7953 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -59,7 +59,7 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export SRT_SLURM_MODEL_PREFIX="minimaxm2.5" elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/lustre/fsw/models/MiniMax-M2.5-NVFP4" - export SRT_SLURM_MODEL_PREFIX="minimaxm2.5-fp4" + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/lustre/fsw/models/gpt-oss-120b" export SRT_SLURM_MODEL_PREFIX="gptoss" @@ -73,6 +73,10 @@ fi export AIPERF_MMAP_CACHE_HOST_PATH="/lustre/fsw/gharunners/aiperf-cache" if [[ "$IS_MULTINODE" == "true" ]]; then + if [[ $MODEL_PREFIX == "minimaxm2.5" && $FRAMEWORK == "dynamo-vllm" ]]; then + SLURM_PARTITION="${B200_MULTINODE_SLURM_PARTITION:-gpu-1}" + SLURM_ACCOUNT="${B200_MULTINODE_SLURM_ACCOUNT:-restricted}" + fi # Validate framework if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" && $FRAMEWORK != "dynamo-vllm" ]]; then @@ -105,6 +109,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/vllm/minimax-m2.5-b200-fp4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4" recipes/vllm/minimax-m2.5-b200-fp4 elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then git clone 
https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 @@ -122,7 +132,11 @@ if [[ "$IS_MULTINODE" == "true" ]]; then curl -LsSf https://astral.sh/uv/install.sh | sh export PATH="$UV_INSTALL_DIR:$PATH" - uv venv "$GITHUB_WORKSPACE/.venv" + if [[ $MODEL_PREFIX == "minimaxm2.5" && $FRAMEWORK == "dynamo-vllm" ]]; then + uv venv --seed "$GITHUB_WORKSPACE/.venv" + else + uv venv "$GITHUB_WORKSPACE/.venv" + fi source "$GITHUB_WORKSPACE/.venv/bin/activate" uv pip install -e . @@ -133,12 +147,48 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # Map container images to local squash files NGINX_IMAGE="nginx:1.27.4" - SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - NGINX_SQUASH_FILE="/home/sa-shared/containers/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_DIR="${B200_SQUASH_DIR:-/home/sa-shared/containers}" + if [[ $MODEL_PREFIX == "minimaxm2.5" && $FRAMEWORK == "dynamo-vllm" ]]; then + SQUASH_DIR="${B200_SQUASH_DIR:-/home/slurm-shared/gharunners/squash}" + fi + if ! mkdir -p "$SQUASH_DIR" 2>/dev/null || [[ ! 
#######################################
# Import a container image into a squashfs file via enroot, reusing a cached
# copy when one already exists and passes validation.
#
# Concurrent callers importing the same image are serialized with flock on a
# per-image lock file. The image is imported to a temporary sibling path and
# renamed into place only after it validates, so an interrupted or failed
# import never leaves a partial file at the final cache path.
#
# Globals:
#   SQUASH_DIR (read) - cache directory that holds the ".locks" directory;
#                       when unset/empty, the directory of $1 is used instead.
# Arguments:
#   $1 - destination squash file path
#   $2 - image reference without the "docker://" scheme
# Outputs:
#   A "skipping import" notice on stdout when the cache is reused;
#   diagnostics on stderr on failure.
# Returns:
#   0 on success (imported or already valid), 2 on missing arguments,
#   non-zero when locking, importing or validation fails.
#######################################
import_squash() {
    local squash_file="${1:-}"
    local image_ref="${2:-}"
    if [[ -z "$squash_file" || -z "$image_ref" ]]; then
        echo "Usage: import_squash <squash_file> <image_ref>" >&2
        return 2
    fi

    # Same sanitisation as sed 's/[\/:@#]/_/g', without spawning a process.
    local unsafe_chars='[/:@#]'
    local image_key="${image_ref//$unsafe_chars/_}"

    # Never fall through to "/.locks" when SQUASH_DIR is not set.
    local cache_dir="${SQUASH_DIR:-}"
    if [[ -z "$cache_dir" ]]; then
        cache_dir=$(dirname -- "$squash_file")
    fi

    local lock_dir="${cache_dir}/.locks"
    if ! mkdir -p "$lock_dir"; then
        echo "Error: cannot create lock directory: $lock_dir" >&2
        return 1
    fi
    local lock_file="${lock_dir}/${image_key}.lock"
    local tmp_file="${squash_file}.partial.${HOSTNAME:-host}.$$"

    (
        flock -w 600 9 || { echo "Failed to acquire lock for $squash_file" >&2; exit 1; }

        if unsquashfs -l "$squash_file" > /dev/null 2>&1; then
            echo "Squash file already exists and is valid, skipping import: $squash_file"
            exit 0
        fi

        # Remove the temporary file on every exit path of this subshell.
        trap 'rm -f -- "$tmp_file"' EXIT

        # Drop a missing/corrupt cache entry and any stale temporary file;
        # the import target must not already exist.
        rm -f -- "$squash_file" "$tmp_file"

        if ! enroot import -o "$tmp_file" "docker://$image_ref"; then
            echo "Error: enroot import failed for image: $image_ref" >&2
            exit 1
        fi
        if ! unsquashfs -l "$tmp_file" > /dev/null 2>&1; then
            echo "Error: enroot import did not produce a valid squash file: $squash_file" >&2
            exit 1
        fi

        # Best effort: let other users of the shared cache read the image.
        chmod a+r "$tmp_file" || true

        # Rename within the same directory so readers never see a partial file.
        if ! mv -f -- "$tmp_file" "$squash_file"; then
            echo "Error: could not move imported image into place: $squash_file" >&2
            exit 1
        fi
    ) 9>"$lock_file"
}
>&2 From 9dfc6c889eab333193174a1ee9c2c44376c43a55 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:59:17 -0700 Subject: [PATCH 2/3] Update B200 MiniMax changelog PR link --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6458d00e4..76556cb21 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3400,4 +3400,4 @@ - minimaxm2.5-fp4-b200-dynamo-vllm description: - "Add MiniMax-M2.5 NVFP4 B200 disaggregated multinode vLLM benchmarks via Dynamo" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1643 From 00ef06af0527b1ca28bf042ce4a42a3528e1c231 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 12:17:23 -0700 Subject: [PATCH 3/3] Fix B200 MiniMax Slurm account defaults --- runners/launch_b200-dgxc.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index f57df7953..18e0a7883 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -73,11 +73,6 @@ fi export AIPERF_MMAP_CACHE_HOST_PATH="/lustre/fsw/gharunners/aiperf-cache" if [[ "$IS_MULTINODE" == "true" ]]; then - if [[ $MODEL_PREFIX == "minimaxm2.5" && $FRAMEWORK == "dynamo-vllm" ]]; then - SLURM_PARTITION="${B200_MULTINODE_SLURM_PARTITION:-gpu-1}" - SLURM_ACCOUNT="${B200_MULTINODE_SLURM_ACCOUNT:-restricted}" - fi - # Validate framework if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" && $FRAMEWORK != "dynamo-vllm" ]]; then echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm"