diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index b1ab08968..855ba0555 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -10080,6 +10080,141 @@ qwen3.5-fp8-h100-sglang-agentic: - { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] } - { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] } +minimaxm2.5-fp8-gb200-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: MiniMaxAI/MiniMax-M2.5 + model-prefix: minimaxm2.5 + runner: gb200 + precision: fp8 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [1, 4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [2, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [1024] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [512, 1024] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml" + decode: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true 
+ - conc-list: [4096, 8192] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [1, 4, 8, 16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [256, 512] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + minimaxm2.5-fp8-b200-dynamo-vllm: image: vllm/vllm-openai:v0.20.1 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml new file mode 100644 index 000000000..120a35e45 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml @@ -0,0 +1,67 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + 
enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16x32x64" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml new file mode 100644 index 000000000..6b5e76e42 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml @@ -0,0 +1,67 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p2d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + 
VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2x32x64x128x256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml new file mode 100644 index 000000000..765562d0c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-gb200-1p3d-tp4ep" + +# Rate-matched tp4ep for FP8 GB200 1k/1k. 
+# X_tp4ep_fp8_gb200 = 17.9k tok/s; P_per_worker = 48k; ideal X/P = 0.37; 1P:3D = 0.33 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml new file mode 100644 index 000000000..aeeb8a012 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml @@ -0,0 +1,74 @@ +name: 
"minimax-m2.5-vllm-disagg-gb200-1p4d-dep2" + +# Rate-matched dep2 for FP8 GB200 1k/1k. +# X_dep2_fp8_gb200 = 12.7k tok/s; P_per_worker = 48k; ideal X/P = 0.27; 1P:4D = 0.25 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml new file mode 100644 index 000000000..83bc7aeb2 --- /dev/null +++ 
b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml @@ -0,0 +1,73 @@ +name: "minimax-m2.5-vllm-disagg-gb200-2p1d-dep8" + +# 1k/1k recipe: 2 prefill workers (DP2, expert-parallel) and 1 decode worker (DP8, expert-parallel) across 2 decode nodes. + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x1024" diff --git
a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml new file mode 100644 index 000000000..534019222 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml @@ -0,0 +1,74 @@ +name: "minimax-m2.5-vllm-disagg-gb200-2p3d-dep4" + +# Rate-matched dep4 for FP8 GB200 1k/1k. +# X_dep4_fp8_gb200 = 30.9k tok/s; P_per_worker = 48k; ideal X/P = 0.64; 2P:3D = 0.67 ✓ + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 3 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: 
true + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096x8192" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml new file mode 100644 index 000000000..847c4b138 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1d-tp4" + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + max-num-batched-tokens: 16384 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "1x4x8x16x32x64x128" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml new file mode 100644 index 000000000..61497185a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-gb200-8k1k-1p1d-tp4ep" + +# 8k/1k recipe: 1 prefill worker (DP2, expert-parallel) and 1 decode worker (TP4, expert-parallel). + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + max-num-batched-tokens: 16384 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies:
"256x512" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml new file mode 100644 index 000000000..c92757146 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml @@ -0,0 +1,76 @@ +name: "minimax-m2.5-vllm-disagg-gb200-8k1k-3p2d-dep4" + +# Rate-matched dep4 for FP8 GB200 8k/1k. +# X_dep4_fp8_gb200_8k ≈ 9.8k tok/s (from 5p2d-dep4 saturation); +# P_per_worker_8k = 57k; ratio = X*8/P = 78.4/57 = 1.38; 3P:2D = 1.5 ✓ (closest int fit) + +model: + path: "minimax-m2.5-fp8" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp8" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13346 + enable-expert-parallel: true + max-num-batched-tokens: 16384 + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 
+ enable-expert-parallel: true + safetensors-load-strategy: "prefetch" + trust-remote-code: true + no-enable-prefix-caching: true + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024x2048x4096" diff --git a/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh index 159598a07..2af10d749 100644 --- a/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/dsr1_fp8_mi355x_atom_mtp.sh @@ -36,7 +36,7 @@ if [ "$DP_ATTENTION" = "true" ]; then else #DP+TP PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention ) fi -fi +fi SPEC_ARGS=(--method mtp --num-speculative-tokens 3 ) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5bee49782..e28374a87 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3444,3 +3444,10 @@ - "Add MiniMax-M2.5 NVFP4 GB300 disaggregated multinode vLLM benchmarks via Dynamo" - "Add 1k1k/8k1k minimax recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1641 + +- config-keys: + - minimaxm2.5-fp8-gb200-dynamo-vllm + description: + - "Add MiniMax-M2.5 FP8 GB200 disaggregated multinode vLLM benchmarks via Dynamo" + - "Add 1k1k/8k1k FP8 recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8/" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1648 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index dc7f83460..45ef3a952 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -57,8 +57,11 @@ elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5-NVFP4" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" + elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then + export 
MODEL_PATH="/mnt/lustre01/models/MiniMax-M2.5" + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" else - echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4" + echo "Unsupported model prefix/precision combination: $MODEL_PREFIX/$PRECISION. Supported combinations for dynamo-vllm: kimik2.5/fp4, dsv4/fp4, minimaxm2.5/fp4, minimaxm2.5/fp8" exit 1 fi else @@ -259,8 +262,16 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" || exit 1 cd "$SRT_REPO_DIR" || exit 1 git checkout main || exit 1 - mkdir -p recipes/vllm/minimax-m2.5-gb200 || exit 1 - cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200" recipes/vllm/minimax-m2.5-gb200 || exit 1 + if [[ $PRECISION == "fp8" ]]; then + mkdir -p recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200-fp8" recipes/vllm/minimax-m2.5-gb200-fp8 || exit 1 + elif [[ $PRECISION == "fp4" ]]; then + mkdir -p recipes/vllm/minimax-m2.5-gb200 || exit 1 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-gb200" recipes/vllm/minimax-m2.5-gb200 || exit 1 + else + echo "Unsupported minimaxm2.5 precision for GB200 dynamo-vllm: $PRECISION" >&2 + exit 1 + fi elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR"