Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10293,3 +10293,216 @@ minimaxm2.5-fp4-gb200-dynamo-vllm:
tp: 4
ep: 4
dp-attn: true

# MiniMax-M2.5 FP8 on the gb300-nv runner with the dynamo-vllm framework,
# run multinode with disaggregated prefill and decode workers.
# Each search-space entry pairs a concurrency list with a prefill/decode
# worker layout and the recipe path handed over as CONFIG_FILE.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# entry; confirm it against the neighbouring entries in this file.
minimaxm2.5-fp8-gb300-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.1
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: gb300-nv
  precision: fp8
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---- isl 1024 / osl 1024 ----
      - isl: 1024
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (tp 4)
          - conc-list: [8, 16, 32, 64, 128]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p1d-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 2 decode workers (tp 4)
          - conc-list: [32, 64, 128, 256, 512]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 2 decode workers (tp 4, ep 4)
          - conc-list: [256, 512, 1024]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-1p2d-tp4ep.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: false
          # 2 prefill workers, 1 decode worker (tp 8, ep 8, dp-attn)
          - conc-list: [256, 512, 1024]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p1d-dep8.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
          # 2 prefill workers, 2 decode workers (tp 4, ep 4, dp-attn)
          - conc-list: [512, 1024, 2048]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true
          # Same 2p/2d layout as above, separate recipe for the highest
          # concurrencies
          - conc-list: [4096, 8192]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p2d-dep4-hi-conc.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true
          # 2 prefill workers, 3 decode workers (tp 2, ep 2, dp-attn)
          - conc-list: [1024]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/1k1k/disagg-gb300-2p3d-dep2.yaml"
            decode:
              num-worker: 3
              tp: 2
              ep: 2
              dp-attn: true
      # ---- isl 8192 / osl 1024 ----
      - isl: 8192
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (tp 4, ep 4)
          - conc-list: [16, 64, 128]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # Same 1p/1d layout as above, separate recipe for higher
          # concurrencies
          - conc-list: [256, 512]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-1p1d-tp4ep-hi-conc.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 2 prefill workers, 1 decode worker (tp 2)
          - conc-list: [32]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp2.yaml"
            decode:
              num-worker: 1
              tp: 2
              ep: 1
              dp-attn: false
          # 2 prefill workers, 1 decode worker (tp 4, ep 4)
          - conc-list: [64, 128, 256, 512]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-2p1d-tp4ep.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 3 prefill workers, 1 decode worker (tp 4)
          - conc-list: [64]
            prefill:
              num-worker: 3
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 3 prefill workers, 1 decode worker (tp 4, ep 4, dp-attn)
          - conc-list: [256, 512]
            prefill:
              num-worker: 3
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
          # Same 3p/1d layout as above, separate recipe for higher
          # concurrencies
          - conc-list: [1024, 2048]
            prefill:
              num-worker: 3
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-3p1d-dep4-hi-conc.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
          # 5 prefill workers, 2 decode workers (tp 4, ep 4, dp-attn)
          - conc-list: [512, 1024, 2048]
            prefill:
              num-worker: 5
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-fp8/8k1k/disagg-gb300-5p2d-dep4.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Recipe: MiniMax-M2.5 FP8 on GB300, disaggregated vLLM with one prefill
# worker (1 GPU) and one decode worker (4 GPUs).
name: "minimax-m2.5-vllm-disagg-gb300-1p1d-tp4"

model:
  path: "minimax-m2.5-fp8"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp8"

dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

# NOTE(review): no slurm.time_limit or health_check is configured between
# setup_script and resources. Review feedback on this change says the job
# then runs under the launcher's default time limit - confirm whether
# explicit limits are needed for the concurrency sweep.
resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 1
  gpus_per_decode: 4
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing GB300 slurm limits

Medium Severity

New GB300 MiniMax Dynamo vLLM recipes jump straight from setup_script to resources and never set slurm.time_limit or health_check, unlike existing GB300 vLLM disagg recipes on the same runner. Jobs therefore keep the launcher’s four-hour default while engine startup allows up to an hour, so long multi-concurrency sweeps can be killed before benchmarks finish.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit cc720f0. Configure here.


frontend:
  type: dynamo
  enable_multiple_frontends: false

# NOTE(review): the environment and vllm_config sections are nested under
# `backend` here; this was reconstructed from a flattened copy of the
# recipe - confirm against the recipe schema before merging.
backend:
  type: vllm
  connector: null

  # Environment for prefill workers.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  # Environment for decode workers (same values as prefill).
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  vllm_config:
    # Prefill engine: tensor-parallel size 1, FP8 KV cache.
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

    # Decode engine: tensor-parallel size 4, otherwise the same settings as
    # prefill except that pipeline-parallel-size is not set.
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 4
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

# Benchmark sweep: isl 1024 / osl 1024 over five concurrency values.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "8x16x32x64x128"
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Recipe: MiniMax-M2.5 FP8 on GB300, disaggregated vLLM with one prefill
# worker (1 GPU) and two decode workers (4 GPUs each).
name: "minimax-m2.5-vllm-disagg-gb300-1p2d-tp4"

model:
  path: "minimax-m2.5-fp8"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp8"

dynamo:
  install: true
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

# NOTE(review): no slurm.time_limit or health_check is configured between
# setup_script and resources - confirm whether explicit limits are needed
# for the concurrency sweep.
resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 2
  gpus_per_prefill: 1
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

# NOTE(review): the environment and vllm_config sections are nested under
# `backend` here; this was reconstructed from a flattened copy of the
# recipe - confirm against the recipe schema before merging.
backend:
  type: vllm
  connector: null

  # Environment for prefill workers.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  # Environment for decode workers (same values as prefill).
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  vllm_config:
    # Prefill engine: tensor-parallel size 1, FP8 KV cache.
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

    # Decode engine: tensor-parallel size 4, otherwise the same settings as
    # prefill except that pipeline-parallel-size is not set.
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 4
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

# Benchmark sweep: isl 1024 / osl 1024 over five concurrency values.
benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "32x64x128x256x512"
  # Disabled benchmark options, kept for reference:
  # warmup_prompts: 1
  # use_chat_template: false
  # req_rate: "inf"
  # random_range_ratio: 1.0
Loading