diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c1412b8f4..e60515eee 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -10293,3 +10293,242 @@ minimaxm2.5-fp4-gb200-dynamo-vllm: tp: 4 ep: 4 dp-attn: true + +minimaxm2.5-fp4-b200-dynamo-vllm: + image: vllm/vllm-openai:v0.20.1 + model: nvidia/MiniMax-M2.5-NVFP4 + model-prefix: minimaxm2.5 + runner: b200-multinode + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4, 8, 16, 32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64, 128] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [128, 256] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [1024, 2048] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 1024] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml" + decode: + num-worker: 3 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [2048, 4096, 8192] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml" + decode: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [6144] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml" + decode: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + - conc-list: [1024, 2048] + prefill: + num-worker: 3 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [1024, 2048, 4096] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32, 64] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 512, 1024] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [256, 512, 1024, 2048] + prefill: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml new file mode 100644 index 000000000..badf45403 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-1p2d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048x4096x8192" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml new file mode 100644 index 000000000..c3c994bca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d-c6144.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2-c6144" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "6144" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml new file mode 100644 index 000000000..5b352e35f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep2-2p3d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p3xdep2" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 2 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml new file mode 100644 index 000000000..b7809a9e2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep4-3p2d.yaml @@ -0,0 +1,74 @@ +name: "minimax-m2.5-vllm-disagg-b200-1k1k-3p2xdep4" + +# Rate-matched dep4 at 1k/1k. +# Measured X_dep4/P = 56.8k / 38k = 1.49; 3P:2D ratio = 1.5 ✓ + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml new file mode 100644 index 000000000..683f4c72d --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/dep8-2p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-2p1xdep8" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 128 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 128 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048x4096" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..bc6a6a1ac --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p1d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + UCX_RCACHE_MAX_UNRELEASED: "1024" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml new file mode 100644 index 000000000..5d7072ea5 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x8x16x32x64" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml new file mode 100644 index 000000000..23ec9444c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..4a56ab27e --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p1d.yaml @@ -0,0 +1,70 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p1d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32x64x128" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml new file mode 100644 index 000000000..87c928c63 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p2d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p2d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "128x256" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml new file mode 100644 index 000000000..e82838715 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml new file mode 100644 index 000000000..268a58535 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-1p3d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-decode-focus-tp4ep-1p3d" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "64x128x256x512" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml new file mode 100644 index 000000000..0d83e2e63 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/1k1k/tp4ep-2p3d.yaml @@ -0,0 +1,72 @@ +name: "minimax-m2.5-vllm-disagg-b200-1k1k-2p3xtp4ep" + +# Better-matched tp4ep at 1k/1k. +# Measured X_tp4ep/P = 24.1k / 38k = 0.63; 2P:3D ratio = 0.67 ✓ + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 3 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 2048 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "256x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml new file mode 100644 index 000000000..0a867e508 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/dep4-2p1d.yaml @@ -0,0 +1,75 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-2p1xdep4" + +# Rate-matched dep4 at 8k/1k. +# Measured X_dep4_8k = 13.6k tok/s; rate-match ratio = X*8/P_8k = 13.6*8/58 = 1.88 +# 2P:1D = 2.0, much closer to optimum than 4P:1D (2× over-prefilled). + + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + max-num-seqs: 864 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024x2048" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml new file mode 100644 index 000000000..75c7b9d73 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: false + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4x8x16" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml new file mode 100644 index 000000000..c43abe595 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d-hi-conc.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep-hi-conc" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + random_range_ratio: 0.8 diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml new file mode 100644 index 000000000..3d295e290 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4/8k1k/tp4ep-1p1d.yaml @@ -0,0 +1,68 @@ +name: "minimax-m2.5-vllm-disagg-b200-8k1k-1p1xtp4ep" + +model: + path: "minimax-m2.5-nvfp4" + container: "vllm/vllm-openai:v0.20.1" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260526" + +setup_script: install-deps.sh + +resources: + gpu_type: "b200" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 1 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + trust-remote-code: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + stream-interval: 32 + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + enable-expert-parallel: true + no-enable-prefix-caching: true + max-model-len: 9280 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + gpu-memory-utilization: 0.90 + stream-interval: 32 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x64" + random_range_ratio: 0.8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1ce1698b8..c9eb3d294 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3416,3 +3416,9 @@ - "Add MiniMax-M2.5 FP8 B200 disaggregated multinode vLLM benchmarks via Dynamo" - "Add 1k1k/8k1k FP8 recipe set under benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp8/" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1649 + +- config-keys: + - minimaxm2.5-fp4-b200-dynamo-vllm + description: + - "Add MiniMax-M2.5 NVFP4 B200 disaggregated multinode vLLM benchmarks via Dynamo" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1643 diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh index 4faa635f0..9eeed2af6 100644 --- a/runners/launch_b200-dgxc.sh +++ b/runners/launch_b200-dgxc.sh @@ -59,7 +59,7 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/lustre/fsw/models/MiniMax-M2.5-NVFP4" - export SRT_SLURM_MODEL_PREFIX="minimaxm2.5-fp4" + export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-nvfp4" elif [[ $MODEL_PREFIX == "gptoss" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/lustre/fsw/models/gpt-oss-120b" export SRT_SLURM_MODEL_PREFIX="gptoss" @@ -73,7 +73,6 @@ fi export AIPERF_MMAP_CACHE_HOST_PATH="/lustre/fsw/gharunners/aiperf-cache" if [[ "$IS_MULTINODE" == "true" ]]; then - # Validate framework if [[ $FRAMEWORK != "dynamo-sglang" && $FRAMEWORK != "dynamo-trt" && $FRAMEWORK != "dynamo-vllm" ]]; then echo "Unsupported framework: $FRAMEWORK. Supported frameworks are: dynamo-trt, dynamo-sglang, dynamo-vllm" @@ -105,6 +104,12 @@ if [[ "$IS_MULTINODE" == "true" ]]; then git checkout aflowers/vllm-gb200-v0.20.0 mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 + elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" || exit 1 + git checkout main + mkdir -p recipes/vllm/minimax-m2.5-b200-fp4 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b200-fp4" recipes/vllm/minimax-m2.5-b200-fp4 elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1