Skip to content
309 changes: 309 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9261,6 +9261,180 @@ qwen3.5-fp8-h100-sglang-mtp:
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }

# MiniMax-M2.5 NVFP4 sweep: dynamo-vllm framework on the gb300-nv runner,
# multinode with disaggregated prefill/decode.
# Each search-space entry pairs a concurrency list with one prefill/decode
# worker layout; the prefill side names a recipe path via CONFIG_FILE.
minimaxm2.5-fp4-gb300-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.1
  model: nvidia/MiniMax-M2.5-NVFP4
  model-prefix: minimaxm2.5
  runner: gb300-nv
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---- isl 1024 / osl 1024 ----
      - isl: 1024
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (decode tp4)
          - conc-list: [2, 4, 16]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 2 decode workers (decode tp4)
          - conc-list: [4, 8, 16, 64]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4-1p2d.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 1 decode worker (decode tp4 + ep4)
          - conc-list: [32, 64, 128]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 1 prefill worker, 3 decode workers (decode tp4 + ep4)
          - conc-list: [64, 128, 256, 512, 1024]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/tp4ep-1p3d.yaml"
            decode:
              num-worker: 3
              tp: 4
              ep: 4
              dp-attn: false
          # 2 prefill workers, 3 decode workers (decode tp2 + ep2, dp-attn on)
          - conc-list: [2048]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d.yaml"
            decode:
              num-worker: 3
              tp: 2
              ep: 2
              dp-attn: true
          # Same 2p/3d layout as the entry above, but a separate "-c6144"
          # recipe file is used for the two highest concurrencies.
          - conc-list: [6144, 8192]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep2-2p3d-c6144.yaml"
            decode:
              num-worker: 3
              tp: 2
              ep: 2
              dp-attn: true
          # 2 prefill workers, 1 decode worker (decode tp8 + ep8, dp-attn on)
          - conc-list: [1024, 2048, 4096]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/1k1k/dep8-2p1d.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
      # ---- isl 8192 / osl 1024 ----
      - isl: 8192
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (decode tp4)
          - conc-list: [2, 4, 8, 16]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4-1p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 1 decode worker (decode tp4 + ep4)
          - conc-list: [32, 64, 128, 256]
            prefill:
              num-worker: 1
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-1p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 2 prefill workers, 1 decode worker (decode tp4 + ep4)
          - conc-list: [64, 128]
            prefill:
              num-worker: 2
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/tp4ep-2p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 4 prefill workers, 1 decode worker (decode tp4 + ep4, dp-attn on)
          - conc-list: [256]
            prefill:
              num-worker: 4
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep4-4p1d.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: true
          # 4 prefill workers, 1 decode worker (decode tp8 + ep8, dp-attn on)
          - conc-list: [1024, 2048]
            prefill:
              num-worker: 4
              tp: 1
              ep: 1
              dp-attn: false
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5/8k1k/dep8-4p1d.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true

glm5-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:v0.5.11-cu130
model: nvidia/GLM-5-NVFP4
Expand Down Expand Up @@ -9906,6 +10080,141 @@ qwen3.5-fp8-h100-sglang-agentic:
- { tp: 8, ep: 8, offloading: none, conc-list: [1, 2, 4, 8, 12, 14, 16] }
- { tp: 8, ep: 8, offloading: hicache, conc-list: [12, 14, 16, 20, 24, 28, 32, 42] }

# MiniMax-M2.5 FP8 sweep: dynamo-vllm framework on the gb200 runner,
# multinode with disaggregated prefill/decode.
# Every prefill worker in this entry uses tp 2 / ep 2 with dp-attn enabled;
# only the prefill worker count and the decode layout vary between rows.
minimaxm2.5-fp8-gb200-dynamo-vllm:
  image: vllm/vllm-openai:v0.20.1
  model: MiniMaxAI/MiniMax-M2.5
  model-prefix: minimaxm2.5
  runner: gb200
  precision: fp8
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---- isl 1024 / osl 1024 ----
      - isl: 1024
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (decode tp4)
          - conc-list: [1, 4, 8, 16, 32, 64]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p1d-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 2 decode workers (decode tp4)
          - conc-list: [2, 32, 64, 128, 256, 512]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p2d-tp4.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 3 decode workers (decode tp4 + ep4)
          - conc-list: [1024]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p3d-tp4ep.yaml"
            decode:
              num-worker: 3
              tp: 4
              ep: 4
              dp-attn: false
          # 2 prefill workers, 1 decode worker (decode tp8 + ep8, dp-attn on)
          - conc-list: [512, 1024]
            prefill:
              num-worker: 2
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p1d-dep8.yaml"
            decode:
              num-worker: 1
              tp: 8
              ep: 8
              dp-attn: true
          # 1 prefill worker, 4 decode workers (decode tp2 + ep2, dp-attn on)
          - conc-list: [4096]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-1p4d-dep2.yaml"
            decode:
              num-worker: 4
              tp: 2
              ep: 2
              dp-attn: true
          # 2 prefill workers, 3 decode workers (decode tp4 + ep4, dp-attn on)
          - conc-list: [4096, 8192]
            prefill:
              num-worker: 2
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/1k1k/disagg-gb200-2p3d-dep4.yaml"
            decode:
              num-worker: 3
              tp: 4
              ep: 4
              dp-attn: true
      # ---- isl 8192 / osl 1024 ----
      - isl: 8192
        osl: 1024
        search-space:
          # 1 prefill worker, 1 decode worker (decode tp4)
          - conc-list: [1, 4, 8, 16, 32, 64, 128]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: false
          # 1 prefill worker, 1 decode worker (decode tp4 + ep4)
          - conc-list: [256, 512]
            prefill:
              num-worker: 1
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-1p1d-tp4ep.yaml"
            decode:
              num-worker: 1
              tp: 4
              ep: 4
              dp-attn: false
          # 3 prefill workers, 2 decode workers (decode tp4 + ep4, dp-attn on)
          - conc-list: [1024, 2048, 4096]
            prefill:
              num-worker: 3
              tp: 2
              ep: 2
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/vllm/minimax-m2.5-gb200-fp8/8k1k/disagg-gb200-3p2d-dep4.yaml"
            decode:
              num-worker: 2
              tp: 4
              ep: 4
              dp-attn: true


minimaxm2.5-fp8-b200-dynamo-vllm:
image: vllm/vllm-openai:v0.20.1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Recipe: MiniMax-M2.5 FP8, disaggregated vLLM on gb200,
# 1 prefill worker + 1 decode worker, decode at tensor-parallel 4.
name: "minimax-m2.5-vllm-disagg-gb200-1p1d-tp4"

model:
  path: "minimax-m2.5-fp8"
  container: "vllm/vllm-openai:v0.20.1"
  precision: "fp8"

dynamo:
  install: true
  # Quoted so the dev-version string is never retyped by the YAML parser.
  wheel: "1.2.0.dev20260526"

setup_script: install-deps.sh

resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  # 2 GPUs per prefill worker lines up with the prefill vllm_config below
  # (tensor-parallel-size 1 x data-parallel-size 2).
  gpus_per_prefill: 2
  # 4 GPUs per decode worker lines up with decode tensor-parallel-size 4.
  gpus_per_decode: 4

frontend:
  type: dynamo
  enable_multiple_frontends: false

# NOTE(review): the environment and vllm_config sections are nested under
# `backend` here — confirm against the recipe schema.
backend:
  type: vllm
  connector: null

  # Prefill and decode workers receive the same two environment variables.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    VLLM_FLASHINFER_ALLREDUCE_BACKEND: "mnnvl"

  vllm_config:
    # Prefill: data-parallel 2 with expert parallelism enabled.
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 2
      data-parallel-rpc-port: 13346
      enable-expert-parallel: true
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

    # Decode: tensor-parallel 4; no data-parallel or expert-parallel keys set.
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 4
      safetensors-load-strategy: "prefetch"
      trust-remote-code: true
      no-enable-prefix-caching: true
      stream-interval: 32

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  # NOTE(review): these values mirror the conc-list [1, 4, 8, 16, 32, 64] of
  # the entry in nvidia-master.yaml that points at this recipe — presumably
  # the two must be kept in sync; confirm.
  concurrencies: "1x4x8x16x32x64"
Loading
Loading