From ac2327418b72fa90d097fbaa41a9c1586292d562 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:14:48 -0700 Subject: [PATCH 1/3] Add MiniMax-M2.5 FP4 GB300 Dynamo vLLM recipes --- .github/configs/nvidia-master.yaml | 174 ++++++++++++++++++ .../minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml | 73 ++++++++ .../vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml | 73 ++++++++ .../vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml | 72 ++++++++ .../vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml | 68 +++++++ .../vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml | 70 +++++++ .../vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml | 68 +++++++ .../vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml | 68 +++++++ perf-changelog.yaml | 7 + runners/launch_gb300-nv.sh | 11 +- 15 files changed, 1029 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b8db07e0d..69f56fa08 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9261,6 +9261,180 @@ qwen3.5-fp8-h100-sglang-mtp: search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } +minimaxm2.5-fp4-gb300-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: gb300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [2, 4, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4, 8, 16, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [6144, 8192] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [2, 4, 8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [1024, 2048] + prefill: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + glm5-fp4-gb300-dynamo-sglang: image: lmsysorg/sglang:v0.5.11-cu130 model: nvidia/GLM-5-NVFP4 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml new file mode 100644 index 000000000..c7f7e28af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2-c6144" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + spread_workers: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144x8192" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml new file mode 100644 index 000000000..adaf6f271 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p3xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + spread_workers: true + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml new file mode 100644 index 000000000..28427e002 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-2p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..eee93c9f8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2x4x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml new file mode 100644 index 000000000..10ba980ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x64" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..ebff26fb0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml new file mode 100644 index 000000000..5353e4dd0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-decode-focus-tp4ep-1p3d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml new file mode 100644 index 000000000..d3c777618 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml new file mode 100644 index 000000000..a56c095af --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-4p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..a92975c57 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2x4x8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..53daeafbd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-1p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64x128x256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml new file mode 100644 index 000000000..163d412f5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb300-8k1k-2p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 28523da86..c4f5232b9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3409,3 +3409,10 @@ description: - "Add MiniMax-M2.5 NVFP4 GB200 disaggregated multinode vLLM benchmarks via Dynamo" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1642 + +- config-keys: + - minimaxm2.5-fp4-gb300-dynamo-vllm + description: + - "Add MiniMax-M2.5 NVFP4 GB300 disaggregated multinode vLLM benchmarks via Dynamo" + - "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index b47e103fd..0a24120f1 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -45,8 +45,11 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then export MODEL_PATH=/scratch/models/GLM-5-FP8 export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then + export MODEL_PATH=/data/models/MiniMax-M2.5-NVFP4 + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp4, glm5-fp8, minimaxm2.5-fp4" exit 1 fi @@ -137,6 +140,12 @@ elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then git checkout sa-submission-q2-2026 mkdir -p recipes/sglang/glm5 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout main + mkdir -p recipes/vllm/minimax-m2.5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5" recipes/vllm/minimax-m2.5 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" From 97fbd88df1da6b8577d8b4db553e727738b0d733 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 11:31:05 -0700 Subject: [PATCH 2/3] fix: pin minimax gb300 sweep to nv runners --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 69f56fa08..3a4f1a4f1 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9265,7 +9265,7 @@ minimaxm2.5-fp4-gb300-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 model: nvidia/MiniMax-M2.5-NVFP4 model-prefix: minimaxm2.5 - runner: gb300 + runner: gb300-nv precision: fp4 framework: dynamo-vllm multinode: true From 45759a70815cb4ce7a43223e094bb13e4d99e4e5 Mon Sep 17 00:00:00 2001 From: jasonlizhengjian Date: Tue, 2 Jun 2026 18:24:42 -0700 Subject: [PATCH 3/3] Fix GB300 eval artifact copy --- runners/launch_gb300-nv.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 0a24120f1..4677e5004 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -379,7 +379,9 @@ if [[ "${RUN_EVAL:-false}" == "true" || "${EVAL_ONLY:-false}" == "true" ]]; then shopt -s nullglob for eval_file in "$EVAL_DIR"/*; do [ -f "$eval_file" ] || continue - cp "$eval_file" "$GITHUB_WORKSPACE/" + eval_dest="$GITHUB_WORKSPACE/$(basename "$eval_file")" + rm -f "$eval_dest" + cp "$eval_file" "$eval_dest" echo "Copied eval artifact: $(basename "$eval_file")" done shopt -u nullglob